diff --git a/docs/reference/feature-servers/registry-server.md b/docs/reference/feature-servers/registry-server.md index 180ce408285..1c8adb2f88d 100644 --- a/docs/reference/feature-servers/registry-server.md +++ b/docs/reference/feature-servers/registry-server.md @@ -1123,7 +1123,72 @@ Please refer the [page](./../../../docs/getting-started/concepts/permission.md) **Note**: Recent visits are automatically logged when users access registry objects via the REST API. The logging behavior can be configured through the `feature_server.recent_visit_logging` section in `feature_store.yaml` (see configuration section below). ---- +#### Get Popular Tags +- **Endpoint**: `GET /api/v1/metrics/popular_tags` +- **Description**: Discover Feature Views by popular tags. Returns the most popular tags (tags assigned to maximum number of feature views) with their associated feature views. If no project is specified, returns popular tags across all projects. +- **Parameters**: + - `project` (optional): Project name for popular tags (returns all projects if not specified) + - `limit` (optional, default: 4): Number of popular tags to return + - `allow_cache` (optional, default: true): Whether to allow cached responses +- **Examples**: + ```bash + # Basic usage (all projects) + curl -H "Authorization: Bearer " \ + "http://localhost:6572/api/v1/metrics/popular_tags" + + # Specific project + curl -H "Authorization: Bearer " \ + "http://localhost:6572/api/v1/metrics/popular_tags?project=my_project" + + # Custom limit + curl -H "Authorization: Bearer " \ + "http://localhost:6572/api/v1/metrics/popular_tags?project=my_project&limit=3" + ``` +- **Response Model**: `PopularTagsResponse` +- **Response Example**: + ```json + { + "popular_tags": [ + { + "tag_key": "environment", + "tag_value": "production", + "feature_views": [ + { + "name": "user_features", + "project": "my_project" + }, + { + "name": "order_features", + "project": "my_project" + } + ], + "total_feature_views": 2 + }, + { + "tag_key": "team", + "tag_value": "ml_team", + "feature_views": [ + { + "name": "user_features", + "project": "my_project" + } + ], + "total_feature_views": 1 + } + ], + "metadata": { + "totalFeatureViews": 3, + "totalTags": 2, + "limit": 4 + } + } + ``` + +**Response Models:** +- `FeatureViewInfo`: Contains feature view name and project +- `PopularTagInfo`: Contains tag information and associated feature views +- `PopularTagsMetadata`: Contains metadata about the response +- `PopularTagsResponse`: Main response model containing popular tags and metadata ## Registry Server Configuration: Recent Visit Logging @@ -1162,3 +1227,4 @@ feature_server: - Only the most recent `limit` visits per user are stored - Metrics endpoints (`/metrics/*`) are automatically excluded from logging to prevent circular references - Visit data is stored per user and per project in the registry metadata + diff --git a/sdk/python/feast/api/registry/rest/metrics.py b/sdk/python/feast/api/registry/rest/metrics.py index f58d403b6b8..a3c836d9c57 100644 --- a/sdk/python/feast/api/registry/rest/metrics.py +++ b/sdk/python/feast/api/registry/rest/metrics.py @@ -1,9 +1,11 @@ import json import logging -from typing import Optional +from typing import Dict, List, Optional -from fastapi import APIRouter, Depends, Query, Request +from fastapi import APIRouter, Depends, HTTPException, Query, Request +from pydantic import BaseModel, Field +from feast.api.registry.rest.feature_views import _extract_feature_view_from_any from feast.api.registry.rest.rest_utils import ( get_pagination_params, get_sorting_params, @@ -13,6 +15,47 @@ from feast.protos.feast.registry import RegistryServer_pb2 +class FeatureViewInfo(BaseModel): + """Feature view information in popular tags response.""" + + name: str = Field(..., description="Name of the feature view") + project: str = Field(..., description="Project name of the feature view") + + +class PopularTagInfo(BaseModel): + """Popular tag information with associated feature views.""" + + tag_key: str = Field(..., description="Tag key") + tag_value: str = Field(..., description="Tag value") + feature_views: List[FeatureViewInfo] = Field( + ..., description="List of feature views with this tag" + ) + total_feature_views: int = Field( + ..., description="Total number of feature views with this tag" + ) + + +class PopularTagsMetadata(BaseModel): + """Metadata for popular tags response.""" + + totalFeatureViews: int = Field( + ..., description="Total number of feature views processed" + ) + totalTags: int = Field(..., description="Total number of unique tags found") + limit: int = Field(..., description="Number of popular tags requested") + + +class PopularTagsResponse(BaseModel): + """Response model for popular tags endpoint.""" + + popular_tags: List[PopularTagInfo] = Field( + ..., description="List of popular tags with their associated feature views" + ) + metadata: PopularTagsMetadata = Field( + ..., description="Metadata about the response" + ) + + def get_metrics_router(grpc_handler, server=None) -> APIRouter: logger = logging.getLogger(__name__) router = APIRouter() @@ -96,6 +139,155 @@ def count_resources_for_project(project_name: str): total_counts[k] += counts[k] return {"total": total_counts, "perProject": all_counts} + @router.get( + "/metrics/popular_tags", tags=["Metrics"], response_model=PopularTagsResponse + ) + async def popular_tags( + project: Optional[str] = Query( + None, + description="Project name for popular tags (optional, returns all projects if not specified)", + ), + limit: int = Query(4, description="Number of popular tags to return"), + allow_cache: bool = Query(default=True), + ): + """ + Discover Feature Views by popular tags. Returns the most popular tags + (tags assigned to maximum number of feature views) with their associated feature views. + If no project is specified, returns popular tags across all projects. + """ + + def build_tag_collection( + feature_views: List[Dict], + ) -> Dict[str, Dict[str, List[Dict]]]: + """Build a collection of tags grouped by tag key and tag value.""" + tag_collection: Dict[str, Dict[str, List[Dict]]] = {} + + for fv in feature_views: + tags = fv.get("spec", {}).get("tags", {}) + if not tags: + continue + + for tag_key, tag_value in tags.items(): + if tag_key not in tag_collection: + tag_collection[tag_key] = {} + + if tag_value not in tag_collection[tag_key]: + tag_collection[tag_key][tag_value] = [] + + tag_collection[tag_key][tag_value].append(fv) + + return tag_collection + + def find_most_popular_tags( + tag_collection: Dict[str, Dict[str, List[Dict]]], + ) -> List[Dict]: + """Find the most popular tags based on total feature view count.""" + tag_popularity = [] + + for tag_key, tag_values_map in tag_collection.items(): + for tag_value, fv_entries in tag_values_map.items(): + total_feature_views = len(fv_entries) + tag_popularity.append( + { + "tag_key": tag_key, + "tag_value": tag_value, + "feature_views": fv_entries, + "total_feature_views": total_feature_views, + } + ) + + return sorted( + tag_popularity, + key=lambda x: (x["total_feature_views"], x["tag_key"]), + reverse=True, + ) + + def get_feature_views_for_project(project_name: str) -> List[Dict]: + """Get feature views for a specific project.""" + req = RegistryServer_pb2.ListAllFeatureViewsRequest( + project=project_name, + allow_cache=allow_cache, + ) + response = grpc_call(grpc_handler.ListAllFeatureViews, req) + any_feature_views = response.get("featureViews", []) + feature_views = [] + for any_feature_view in any_feature_views: + feature_view = _extract_feature_view_from_any(any_feature_view) + if feature_view: + feature_view["project"] = project_name + feature_views.append(feature_view) + return feature_views + + try: + if project: + feature_views = get_feature_views_for_project(project) + else: + projects_resp = grpc_call( + grpc_handler.ListProjects, + RegistryServer_pb2.ListProjectsRequest(allow_cache=allow_cache), + ) + projects = projects_resp.get("projects", []) + feature_views = [] + for project_info in projects: + project_name = project_info["spec"]["name"] + project_feature_views = get_feature_views_for_project(project_name) + feature_views.extend(project_feature_views) + + if not feature_views: + return PopularTagsResponse( + popular_tags=[], + metadata=PopularTagsMetadata( + totalFeatureViews=0, + totalTags=0, + limit=limit, + ), + ) + + tag_collection = build_tag_collection(feature_views) + + if not tag_collection: + return PopularTagsResponse( + popular_tags=[], + metadata=PopularTagsMetadata( + totalFeatureViews=len(feature_views), + totalTags=0, + limit=limit, + ), + ) + popular_tags = find_most_popular_tags(tag_collection) + top_popular_tags = popular_tags[:limit] + formatted_tags = [] + for tag_info in top_popular_tags: + feature_view_infos = [ + FeatureViewInfo( + name=fv.get("spec", {}).get("name", "unknown"), + project=fv.get("project", "unknown"), + ) + for fv in tag_info["feature_views"] + ] + + formatted_tag = PopularTagInfo( + tag_key=tag_info["tag_key"], + tag_value=tag_info["tag_value"], + feature_views=feature_view_infos, + total_feature_views=tag_info["total_feature_views"], + ) + formatted_tags.append(formatted_tag) + + return PopularTagsResponse( + popular_tags=formatted_tags, + metadata=PopularTagsMetadata( + totalFeatureViews=len(feature_views), + totalTags=len(popular_tags), + limit=limit, + ), + ) + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"Failed to generate popular tags: {str(e)}", + ) + @router.get("/metrics/recently_visited", tags=["Metrics"]) async def recently_visited( request: Request, diff --git a/sdk/python/tests/unit/api/test_api_rest_registry.py b/sdk/python/tests/unit/api/test_api_rest_registry.py index 21f2cbec4bc..506a5a92937 100644 --- a/sdk/python/tests/unit/api/test_api_rest_registry.py +++ b/sdk/python/tests/unit/api/test_api_rest_registry.py @@ -63,10 +63,39 @@ def fastapi_test_app(): Field(name="income", dtype=Float64), ], source=user_profile_source, + tags={"environment": "production", "team": "ml", "version": "1.0"}, ) + user_behavior_feature_view = FeatureView( + name="user_behavior", + entities=[user_id_entity], + ttl=None, + schema=[ + Field(name="click_count", dtype=Int64), + Field(name="session_duration", dtype=Float64), + ], + source=user_profile_source, + tags={"environment": "staging", "team": "analytics", "version": "2.0"}, + ) + + user_preferences_feature_view = FeatureView( + name="user_preferences", + entities=[user_id_entity], + ttl=None, + schema=[ + Field(name="preferred_category", dtype=Int64), + Field(name="engagement_score", dtype=Float64), + ], + source=user_profile_source, + tags={"environment": "production", "team": "analytics", "version": "1.5"}, + ) + user_feature_service = FeatureService( name="user_service", - features=[user_profile_feature_view], + features=[ + user_profile_feature_view, + user_behavior_feature_view, + user_preferences_feature_view, + ], ) # Create a saved dataset for testing @@ -80,7 +109,15 @@ def fastapi_test_app(): ) # Apply objects - store.apply([user_id_entity, user_profile_feature_view, user_feature_service]) + store.apply( + [ + user_id_entity, + user_profile_feature_view, + user_behavior_feature_view, + user_preferences_feature_view, + user_feature_service, + ] + ) store._registry.apply_saved_dataset(test_saved_dataset, "demo_project") # Build REST app with registered routes @@ -1482,3 +1519,65 @@ def test_metrics_recently_visited_user_isolation(fastapi_test_app): # All visits should be for anonymous user for visit in visits: assert visit["user"] == "anonymous" + + +def test_metrics_popular_tags_via_rest(fastapi_test_app): + """Test the /metrics/popular_tags endpoint.""" + response = fastapi_test_app.get("/metrics/popular_tags?project=demo_project") + assert response.status_code == 200 + data = response.json() + + assert "popular_tags" in data + assert "metadata" in data + + metadata = data["metadata"] + assert "totalFeatureViews" in metadata + assert "totalTags" in metadata + assert "limit" in metadata + + assert metadata["totalFeatureViews"] >= 3 + assert metadata["totalTags"] >= 3 # environment, team, version + + popular_tags = data["popular_tags"] + assert isinstance(popular_tags, list) + assert len(popular_tags) > 0 + + for tag_info in popular_tags: + assert "tag_key" in tag_info + assert "tag_value" in tag_info + assert "feature_views" in tag_info + assert "total_feature_views" in tag_info + assert isinstance(tag_info["total_feature_views"], int) + assert tag_info["total_feature_views"] > 0 + assert isinstance(tag_info["feature_views"], list) + + for fv in tag_info["feature_views"]: + assert "name" in fv + assert "project" in fv + assert isinstance(fv["name"], str) + assert isinstance(fv["project"], str) + + response = fastapi_test_app.get( + "/metrics/popular_tags?project=demo_project&limit=2" + ) + assert response.status_code == 200 + data = response.json() + + assert len(data["popular_tags"]) <= 2 + + response = fastapi_test_app.get("/metrics/popular_tags") + assert response.status_code == 200 + data = response.json() + + assert "popular_tags" in data + assert "metadata" in data + + popular_tags = data["popular_tags"] + assert isinstance(popular_tags, list) + # May be empty if no feature views exist, but structure should be correct + if len(popular_tags) > 0: + for tag_info in popular_tags: + assert "tag_key" in tag_info + assert "tag_value" in tag_info + assert "feature_views" in tag_info + assert "total_feature_views" in tag_info