diff --git a/sdk/python/feast/cli/cli.py b/sdk/python/feast/cli/cli.py index 91fa2a92606..60ea6292488 100644 --- a/sdk/python/feast/cli/cli.py +++ b/sdk/python/feast/cli/cli.py @@ -412,13 +412,18 @@ def materialize_incremental_command(ctx: click.Context, end_ts: str, views: List "milvus", "ray", "ray_rag", + "pytorch_nlp", ], case_sensitive=False, ), help="Specify a template for the created project", default="local", ) -def init_command(project_directory, minimal: bool, template: str): +@click.option( + "--repo-path", + help="Directory path where the repository will be created (default: create subdirectory with project name)", +) +def init_command(project_directory, minimal: bool, template: str, repo_path: str): """Create a new Feast repository""" if not project_directory: project_directory = generate_project_name() @@ -426,7 +431,7 @@ def init_command(project_directory, minimal: bool, template: str): if minimal: template = "minimal" - init_repo(project_directory, template) + init_repo(project_directory, template, repo_path) @cli.command("listen") diff --git a/sdk/python/feast/repo_operations.py b/sdk/python/feast/repo_operations.py index 8eae581a260..9bc11e625f5 100644 --- a/sdk/python/feast/repo_operations.py +++ b/sdk/python/feast/repo_operations.py @@ -445,27 +445,37 @@ def cli_check_repo(repo_path: Path, fs_yaml_file: Path): sys.exit(1) -def init_repo(repo_name: str, template: str): +def init_repo(repo_name: str, template: str, repo_path: Optional[str] = None): import os from pathlib import Path from shutil import copytree from colorama import Fore, Style + # Validate project name if not is_valid_name(repo_name): raise BadParameter( message="Name should be alphanumeric values, underscores, and hyphens but not start with an underscore or hyphen", param_hint="PROJECT_DIRECTORY", ) - repo_path = Path(os.path.join(Path.cwd(), repo_name)) - repo_path.mkdir(exist_ok=True) - repo_config_path = repo_path / "feature_store.yaml" - if repo_config_path.exists(): - 
new_directory = os.path.relpath(repo_path, os.getcwd()) + # Determine where to create the repository + if repo_path: + # User specified a custom path + target_path = Path(repo_path).resolve() + target_path.mkdir(parents=True, exist_ok=True) + display_path = repo_path + else: + # Default behavior: create subdirectory with project name + target_path = Path(os.path.join(Path.cwd(), repo_name)) + target_path.mkdir(exist_ok=True) + display_path = repo_name + repo_config_path = target_path / "feature_store.yaml" + + if repo_config_path.exists(): print( - f"The directory {Style.BRIGHT + Fore.GREEN}{new_directory}{Style.RESET_ALL} contains an existing feature " + f"The directory {Style.BRIGHT + Fore.GREEN}{display_path}{Style.RESET_ALL} contains an existing feature " f"store repository that may cause a conflict" ) print() @@ -475,14 +485,14 @@ def init_repo(repo_name: str, template: str): template_path = str(Path(Path(__file__).parent / "templates" / template).absolute()) if not os.path.exists(template_path): raise IOError(f"Could not find template {template}") - copytree(template_path, str(repo_path), dirs_exist_ok=True) + copytree(template_path, str(target_path), dirs_exist_ok=True) # Rename gitignore files back to .gitignore - for gitignore_path in repo_path.rglob("gitignore"): + for gitignore_path in target_path.rglob("gitignore"): gitignore_path.rename(gitignore_path.with_name(".gitignore")) # Seed the repository - bootstrap_path = repo_path / "bootstrap.py" + bootstrap_path = target_path / "bootstrap.py" if os.path.exists(bootstrap_path): import importlib.util @@ -495,7 +505,7 @@ def init_repo(repo_name: str, template: str): os.remove(bootstrap_path) # Template the feature_store.yaml file - feature_store_yaml_path = repo_path / "feature_repo" / "feature_store.yaml" + feature_store_yaml_path = target_path / "feature_repo" / "feature_store.yaml" replace_str_in_file( feature_store_yaml_path, "project: my_project", f"project: {repo_name}" ) @@ -503,13 +513,13 @@ def 
init_repo(repo_name: str, template: str): # Remove the __pycache__ folder if it exists import shutil - shutil.rmtree(repo_path / "__pycache__", ignore_errors=True) + shutil.rmtree(target_path / "__pycache__", ignore_errors=True) import click click.echo() click.echo( - f"Creating a new Feast repository in {Style.BRIGHT + Fore.GREEN}{repo_path}{Style.RESET_ALL}." + f"Creating a new Feast repository in {Style.BRIGHT + Fore.GREEN}{target_path}{Style.RESET_ALL}." ) click.echo() diff --git a/sdk/python/feast/templates/pytorch_nlp/README.md b/sdk/python/feast/templates/pytorch_nlp/README.md new file mode 100644 index 00000000000..3d9babbd232 --- /dev/null +++ b/sdk/python/feast/templates/pytorch_nlp/README.md @@ -0,0 +1,506 @@ +# PyTorch NLP Sentiment Analysis with Feast + +This template demonstrates how to build a complete sentiment analysis pipeline using **Feast** (Feature Store) with **PyTorch** and **Hugging Face Transformers**. It showcases modern MLOps practices for NLP including feature engineering, model serving, and real-time inference. + +## šŸŽÆ What You'll Learn + +- **Feast Fundamentals**: Feature stores, entities, feature views, and services +- **NLP Feature Engineering**: Text preprocessing and feature extraction patterns +- **PyTorch Integration**: Using pre-trained Hugging Face models with Feast +- **Real-time Serving**: Online feature serving for production inference +- **MLOps Patterns**: Model versioning, performance monitoring, and data governance + +## šŸš€ Quick Start + +### Prerequisites + +- Python 3.8+ +- pip or conda for package management + +### 1. Initialize the Project + +```bash +feast init my-sentiment-project -t pytorch_nlp +cd my-sentiment-project +``` + +### 2. Install Dependencies + +```bash +# Install Feast with NLP support (includes PyTorch, transformers, and ML utilities) +pip install feast[nlp] +``` + +### 3. 
Apply and Materialize Features + +```bash +cd feature_repo +feast apply +feast materialize-incremental $(date -u +"%Y-%m-%dT%H:%M:%S") +``` + +### 4. Start Feature Server + +```bash +feast serve --host 0.0.0.0 --port 6566 +``` + +### 5. Test with Python (Optional) + +```bash +python test_workflow.py +``` + +## šŸ“Š What's Included + +### Sample Dataset +- **1000 synthetic text samples** with sentiment labels (positive/negative/neutral) +- **Engineered features**: text length, word count, emoji count, etc. +- **User context**: aggregated user statistics and behavior patterns +- **Dynamic timestamps** generated within the past 30 days for realistic demo experience + +### Feature Engineering Pipeline +- **Text Features**: Content, metadata, and linguistic characteristics +- **User Features**: Historical sentiment patterns and engagement metrics +- **Real-time Features**: On-demand sentiment prediction using pre-trained models + +### Model Integration +- **Pre-trained Models**: CardiffNLP Twitter-RoBERTa for sentiment analysis +- **Embedding Generation**: Text vectorization for similarity and clustering +- **Confidence Scoring**: Prediction confidence and probability distributions + +## 🌐 HTTP Feature Server + +Once you've started the feature server with `feast serve`, you can query features via HTTP API: + +### Basic Materialized Features + +Query stored text and user features: + +```bash +curl -X POST \ + "http://localhost:6566/get-online-features" \ + -H "Content-Type: application/json" \ + -d '{ + "features": [ + "text_features:text_content", + "text_features:sentiment_label", + "user_stats:user_avg_sentiment" + ], + "entities": { + "text_id": ["text_0000", "text_0001"], + "user_id": ["user_080", "user_091"] + } + }' +``` + +**Example Response:** +```json +{ + "metadata": {"feature_names": ["text_id","user_id","sentiment_label","text_content","user_avg_sentiment"]}, + "results": [ + {"values": ["text_0000"], "statuses": ["PRESENT"]}, + {"values": ["user_080"], 
"statuses": ["PRESENT"]}, + {"values": ["positive"], "statuses": ["PRESENT"]}, + {"values": ["Having an amazing day at the beach with friends!"], "statuses": ["PRESENT"]}, + {"values": [0.905], "statuses": ["PRESENT"]} + ] +} +``` + +### On-Demand Sentiment Predictions + +Get real-time sentiment analysis: + +```bash +curl -X POST \ + "http://localhost:6566/get-online-features" \ + -H "Content-Type: application/json" \ + -d '{ + "features": [ + "sentiment_prediction:predicted_sentiment", + "sentiment_prediction:sentiment_confidence", + "sentiment_prediction:positive_prob" + ], + "entities": { + "input_text": ["I love this amazing product!", "This service is terrible"], + "model_name": ["cardiffnlp/twitter-roberta-base-sentiment-latest", "cardiffnlp/twitter-roberta-base-sentiment-latest"] + } + }' +``` + +### Feature Service (Complete Feature Set) + +Query using predefined feature service: + +```bash +curl -X POST \ + "http://localhost:6566/get-online-features" \ + -H "Content-Type: application/json" \ + -d '{ + "feature_service": "sentiment_analysis_v2", + "entities": { + "text_id": ["text_0000"], + "user_id": ["user_080"], + "input_text": ["This is an amazing experience!"], + "model_name": ["cardiffnlp/twitter-roberta-base-sentiment-latest"] + } + }' +``` + +**Note**: Use actual entity combinations from your generated data. Run `head data/sentiment_data.parquet` to see available `text_id` and `user_id` values. 
+ +## šŸ—ļø Project Structure + +``` +my-sentiment-project/ +ā”œā”€ā”€ README.md # This file +└── feature_repo/ + ā”œā”€ā”€ feature_store.yaml # Feast configuration + ā”œā”€ā”€ example_repo.py # Feature definitions + ā”œā”€ā”€ test_workflow.py # Complete demo workflow + └── data/ # Generated sample data + └── sentiment_data.parquet +``` + +## šŸ”§ Key Components + +### Entities +- **`text`**: Unique identifier for text samples +- **`user`**: User who created the content + +### Feature Views +- **`text_features`**: Raw text content and engineered features +- **`user_stats`**: User-level aggregated statistics and behavior + +### On-Demand Features +- **`sentiment_prediction`**: Real-time sentiment analysis using PyTorch models +- **Features**: predicted sentiment, confidence scores, probability distributions, embeddings + +### Feature Services +- **`sentiment_analysis_v1`**: Basic sentiment features for simple models +- **`sentiment_analysis_v2`**: Advanced features with user context +- **`sentiment_training_features`**: Historical features for model training + +## āš™ļø Configuration + +This template is configured for **local development** using SQLite - no external dependencies required! + +### Current Configuration (`feature_store.yaml`) + +```yaml +project: my_project +provider: local # Local provider (no cloud) +registry: data/registry.db # SQLite registry +online_store: + type: sqlite # SQLite online store (NOT Redis) + path: data/online_store.db # Local SQLite file +offline_store: + type: file # Local file-based offline store +``` + +### Why SQLite? +- āœ… **Zero setup** - Works immediately after `feast init` +- āœ… **Self-contained** - All data in local files +- āœ… **No external services** - No Redis/cloud required +- āœ… **Perfect for demos** - Easy to share and understand + +## šŸ“š Detailed Usage + +### 1. Feature Store Setup + +```python +from feast import FeatureStore + +store = FeatureStore(repo_path=".") +``` + +### 2. 
Training Data Retrieval + +```python +# Get historical features for model training +from datetime import datetime +import pandas as pd + +entity_df = pd.DataFrame({ + "text_id": ["text_0000", "text_0001", "text_0002"], + "user_id": ["user_080", "user_091", "user_052"], # Use actual generated user IDs + "event_timestamp": [datetime.now(), datetime.now(), datetime.now()] # Current timestamps +}) + +training_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "text_features:text_content", + "text_features:sentiment_label", + "text_features:text_length", + "user_stats:user_avg_sentiment", + ], +).to_df() + +print(f"Retrieved {len(training_df)} training samples") +print(training_df.head()) +``` + +### 3. Real-time Inference + +```python +# Get features for online serving (use actual entity combinations) +entity_rows = [ + {"text_id": "text_0000", "user_id": "user_080"}, + {"text_id": "text_0001", "user_id": "user_091"} +] + +online_features = store.get_online_features( + features=store.get_feature_service("sentiment_analysis_v1"), + entity_rows=entity_rows, +).to_dict() + +print("Online features:", online_features) +``` + +### 4. On-Demand Sentiment Prediction + +```python +# Real-time sentiment analysis +prediction_rows = [{ + "input_text": "I love this product!", + "model_name": "cardiffnlp/twitter-roberta-base-sentiment-latest" +}] + +predictions = store.get_online_features( + features=[ + "sentiment_prediction:predicted_sentiment", + "sentiment_prediction:sentiment_confidence", + ], + entity_rows=prediction_rows, +).to_dict() +``` + +## 🚀 Complete End-to-End Demo + +Here's a step-by-step walkthrough of the entire template workflow: + +### 1. Initialize and Setup + +```bash +# Create new project +feast init my-sentiment-demo -t pytorch_nlp +cd my-sentiment-demo + +# Install dependencies (quote the specifiers so the shell does not treat ">" as a redirect) +pip install "torch>=2.0.0" "transformers>=4.30.0" + +# Navigate to feature repository +cd feature_repo +``` + +### 2. 
Apply Feature Store Configuration + +```bash +# Register entities, feature views, and services +feast apply +``` + +**Expected Output:** +``` +Created entity text +Created entity user +Created feature view text_features +Created feature view user_stats +Created on demand feature view sentiment_prediction +Created feature service sentiment_analysis_v1 +Created feature service sentiment_analysis_v2 +``` + +### 3. Materialize Features + +```bash +# Load features into online store +feast materialize-incremental $(date -u +"%Y-%m-%dT%H:%M:%S") +``` + +**Expected Output:** +``` +Materializing 2 feature views to 2025-XX-XX XX:XX:XX+00:00 into the sqlite online store. +text_features: ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ +user_stats: ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ +``` + +### 4. Start Feature Server + +```bash +# Start HTTP feature server +feast serve --host 0.0.0.0 --port 6566 +``` + +**Expected Output:** +``` +Starting gunicorn 23.0.0 +Listening at: http://0.0.0.0:6566 +``` + +### 5. 
Query Features + +In a new terminal, test the feature server: + +```bash +# Check actual entity IDs in your data +python -c " +import pandas as pd +df = pd.read_parquet('data/sentiment_data.parquet') +print('Sample entities:', df.head()) +" + +# Test with actual entity combinations +curl -X POST \ + "http://localhost:6566/get-online-features" \ + -H "Content-Type: application/json" \ + -d '{ + "features": ["text_features:text_content", "text_features:sentiment_label"], + "entities": { + "text_id": ["text_0000"], + "user_id": ["user_XXX"] + } + }' | jq +``` + +## šŸŽ® Customization Examples + +### Adding New Features + +```python +# In example_repo.py, add to text_features_fv schema: +Field(name="hashtag_count", dtype=Int64, description="Number of hashtags"), +Field(name="mention_count", dtype=Int64, description="Number of @mentions"), +Field(name="url_count", dtype=Int64, description="Number of URLs"), +``` + +### Using Different Models + +```python +# In the sentiment_prediction function, change model: +model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +# or +model_name = "distilbert-base-uncased-finetuned-sst-2-english" +``` + +### Adding Custom Transformations + +```python +@on_demand_feature_view( + sources=[text_input_request], + schema=[Field(name="toxicity_score", dtype=Float32)], +) +def toxicity_detection(inputs: pd.DataFrame) -> pd.DataFrame: + # Implement toxicity detection logic + pass +``` + +## šŸ“ˆ Production Considerations + +### Scaling to Production + +1. **Cloud Deployment**: Use AWS, GCP, or Azure providers instead of local +2. **Vector Store**: Replace SQLite with Milvus for similarity search +3. **Model Serving**: Deploy models with KServe or other serving framework +4. 
**Monitoring**: Add feature drift detection and model performance tracking + +### Performance Optimization + +**Current Architecture:** +- Models load on each request (see `sentiment_prediction` function) +- CPU-only operation to avoid multiprocessing issues +- SQLite-based storage for fast local access + +**TODO: Optimization Opportunities:** +- **Startup-time Model Loading**: Load models once at server startup instead of per-request +- **Custom Provider**: Implement model caching via custom Feast provider +- **Model Serving Layer**: Use dedicated model servers (TorchServe, MLflow) for heavy models + +**Production Optimizations:** +1. **Model Caching**: Cache loaded models in memory to avoid repeated loading +2. **Batch Inference**: Process multiple texts together for efficiency +3. **Feature Materialization**: Pre-compute expensive features offline +4. **Async Processing**: Use async patterns for real-time serving + +### Production Configuration Examples + +**Note**: The demo uses SQLite (above). These are examples for production deployment: + +```yaml +# feature_store.yaml for AWS production (requires Redis setup) +project: sentiment_analysis_prod +provider: aws +registry: s3://my-bucket/feast/registry.pb +online_store: + type: redis # Requires separate Redis server + connection_string: redis://my-redis-cluster:6379 +offline_store: + type: bigquery + project_id: my-gcp-project + +# feature_store.yaml for GCP production (requires cloud services) +project: sentiment_analysis_prod +provider: gcp +registry: gs://my-bucket/feast/registry.pb +online_store: + type: redis # Requires separate Redis server + connection_string: redis://my-redis-cluster:6379 +offline_store: + type: bigquery + project_id: my-gcp-project +``` + +## šŸ¤ Contributing + +This template is designed to be extended and customized: + +1. **Add new feature transformations** in `example_repo.py` +2. **Experiment with different models** in the `sentiment_prediction` function +3. 
**Extend the test workflow** with additional evaluation metrics +4. **Add new data sources** (Twitter API, product reviews, etc.) + +## 📖 Resources + +- [Feast Documentation](https://docs.feast.dev/) +- [Hugging Face Transformers](https://huggingface.co/docs/transformers/) +- [PyTorch Documentation](https://pytorch.org/docs/) + +## 🐛 Troubleshooting + +### Common Issues + +**ImportError: No module named 'transformers'** +```bash +pip install torch transformers +``` + +**Model download timeout** +```bash +# Point the Hugging Face cache at a persistent directory so downloaded model files are reused across runs +export HF_HOME=/path/to/cache +``` + +**Feature store initialization fails** +```bash +# Reset the feature store +feast teardown +feast apply +``` + +**On-demand features return defaults** +- This is expected if PyTorch/transformers aren't installed +- The template includes fallback dummy predictions for demonstration + +### Getting Help + +- Check the [Feast GitHub Issues](https://github.com/feast-dev/feast/issues) +- Join the [Feast Slack Community](https://slack.feast.dev/) +- Review the [PyTorch Forums](https://discuss.pytorch.org/) + +--- + +**Happy Feature Engineering! 🎉** + +Built with ❤️ using Feast, PyTorch, and Hugging Face. 
diff --git a/sdk/python/feast/templates/pytorch_nlp/__init__.py b/sdk/python/feast/templates/pytorch_nlp/__init__.py new file mode 100644 index 00000000000..de76b0a8f66 --- /dev/null +++ b/sdk/python/feast/templates/pytorch_nlp/__init__.py @@ -0,0 +1 @@ +# Empty file to make this a Python package diff --git a/sdk/python/feast/templates/pytorch_nlp/bootstrap.py b/sdk/python/feast/templates/pytorch_nlp/bootstrap.py new file mode 100644 index 00000000000..6aad854747e --- /dev/null +++ b/sdk/python/feast/templates/pytorch_nlp/bootstrap.py @@ -0,0 +1,305 @@ +import pathlib +import random +from datetime import datetime, timedelta + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq + +from feast.file_utils import replace_str_in_file + + +def create_sentiment_data(num_samples: int = 1000) -> pd.DataFrame: + """Generate sentiment analysis dataset using BERTweet classifier predictions.""" + + # Diverse realistic text samples from various domains + sample_texts = [ + # Social media / tweets style + "Having an amazing day at the beach with friends!", + "Traffic is horrible today, going to be late for everything", + "Just finished my morning coffee, time to start work", + "This weather is perfect for a weekend getaway", + "Frustrated with this constant construction noise", + "The sunset tonight is absolutely breathtaking", + "Finally got tickets to the concert I wanted!", + "My phone battery died right when I needed it most", + "Loving the new album that just dropped today", + "Can't believe how long this line is taking", + # Product reviews / opinions + "This phone has incredible battery life and camera quality", + "The delivery was late and packaging was damaged", + "Pretty standard laptop, does what it's supposed to do", + "Amazing customer service, resolved my issue quickly", + "The quality is terrible for the price, disappointed", + "Works fine, good value for money, as described", + "Best purchase I've made this year, highly recommend", + "Returned 
this item, didn't work as advertised", + "Decent product but could be better for the cost", + "Exceeded my expectations, will buy again", + # General experiences + "Learning something new always makes me happy", + "Dealing with technical issues is draining my energy", + "The meeting went okay, covered the basic topics", + "Excited about the weekend plans with family", + "Another day of debugging code, the struggle continues", + "Really enjoying this book I started reading", + "The restaurant service was disappointing tonight", + "Nothing special planned, just a quiet evening", + "Great presentation today, audience was engaged", + "Feeling overwhelmed with all these deadlines", + # News / current events style + "The new policy changes will benefit small businesses", + "This decision could have negative environmental impact", + "The research findings are interesting but inconclusive", + "Economic indicators suggest stable growth ahead", + "Mixed reactions to the announcement yesterday", + "The data shows promising results across demographics", + "Public opinion remains divided on this issue", + "Significant improvements in the healthcare system", + "Concerns raised about the new regulations", + "Standard quarterly results meeting projections", + ] + + # Try to use BERTweet sentiment classifier, fallback to rule-based if not available + try: + from transformers import pipeline + + print(" šŸ¤– Loading BERTweet sentiment classifier...") + + # Use BERTweet model specifically trained for Twitter sentiment + sentiment_classifier = pipeline( + "sentiment-analysis", + model="finiteautomata/bertweet-base-sentiment-analysis", + return_all_scores=True, + ) + use_real_classifier = True + print(" āœ… BERTweet sentiment classifier loaded successfully") + + except ImportError: + print(" āš ļø Transformers not available, using rule-based sentiment") + print(" šŸ’” For real classifier: pip install transformers torch") + use_real_classifier = False + except Exception as e: + 
print(f" āš ļø Could not load BERTweet ({e}), using rule-based sentiment") + use_real_classifier = False + + # Generate data + data = [] + # Use current time and generate data within the last 30 days + now = datetime.now() + start_date = now - timedelta(days=30) + + # Extend sample texts by cycling through them to reach num_samples + all_texts = (sample_texts * (num_samples // len(sample_texts) + 1))[:num_samples] + + for i, base_text in enumerate(all_texts): + # Add some realistic variations to make texts more diverse + text = base_text + + # Occasionally add emphasis or emoji + if random.random() < 0.15: + text = text + "!" + elif random.random() < 0.1: + text = text + "..." + elif random.random() < 0.08: + if any( + word in text.lower() + for word in ["amazing", "love", "great", "best", "happy", "excited"] + ): + text = text + " 😊" + elif random.random() < 0.08: + if any( + word in text.lower() + for word in ["terrible", "disappointed", "frustrated", "horrible"] + ): + text = text + " šŸ˜ž" + + # Get sentiment from real classifier or fallback + if use_real_classifier: + try: + predictions = sentiment_classifier(text)[0] + + # Find highest confidence prediction + best_pred = max(predictions, key=lambda x: x["score"]) + sentiment_label = best_pred[ + "label" + ].upper() # BERTweet returns 'POS', 'NEG', 'NEU' + sentiment_score = best_pred["score"] + + # Map BERTweet labels to our format + label_map = {"POS": "positive", "NEG": "negative", "NEU": "neutral"} + sentiment_label = label_map.get( + sentiment_label, sentiment_label.lower() + ) + + except Exception as e: + print(f" āš ļø Classifier error for text {i}: {e}") + # Fallback to simple rule-based + sentiment_label, sentiment_score = _rule_based_sentiment(text) + else: + # Rule-based fallback + sentiment_label, sentiment_score = _rule_based_sentiment(text) + + # Generate engineered features + text_length = len(text) + word_count = len(text.split()) + exclamation_count = text.count("!") + caps_ratio = sum(1 for 
c in text if c.isupper()) / len(text) if text else 0 + emoji_count = sum(1 for c in text if ord(c) > 127) # Simple emoji detection + + # Random timestamp within the past 30 days + days_offset = random.randint(0, 30) + hours_offset = random.randint(0, 23) + minutes_offset = random.randint(0, 59) + event_timestamp = start_date + timedelta( + days=days_offset, hours=hours_offset, minutes=minutes_offset + ) + + data.append( + { + "text_id": f"text_{i:04d}", + "user_id": f"user_{random.randint(1, 100):03d}", + "text_content": text, + "sentiment_label": sentiment_label, + "sentiment_score": round(sentiment_score, 3), + "text_length": text_length, + "word_count": word_count, + "exclamation_count": exclamation_count, + "caps_ratio": round(caps_ratio, 3), + "emoji_count": emoji_count, + "event_timestamp": pd.Timestamp(event_timestamp, tz="UTC"), + "created": pd.Timestamp.now(tz="UTC").round("ms"), + } + ) + + df = pd.DataFrame(data) + + # Calculate user-level aggregations + user_stats = ( + df.groupby("user_id") + .agg({"sentiment_score": "mean", "text_id": "count", "text_length": "mean"}) + .rename( + columns={ + "sentiment_score": "user_avg_sentiment", + "text_id": "user_text_count", + "text_length": "user_avg_text_length", + } + ) + .round(3) + .reset_index() + ) + + # Merge user stats back to main dataframe + df = df.merge(user_stats, on="user_id", how="left") + + return df + + +def _rule_based_sentiment(text: str) -> tuple[str, float]: + """Fallback rule-based sentiment analysis when BERTweet is not available.""" + text_lower = text.lower() + + positive_words = [ + "amazing", + "love", + "great", + "excellent", + "wonderful", + "perfect", + "outstanding", + "fantastic", + "best", + "happy", + "good", + "awesome", + "incredible", + "beautiful", + "excited", + "enjoying", + ] + negative_words = [ + "terrible", + "horrible", + "awful", + "worst", + "bad", + "disappointed", + "frustrated", + "angry", + "sad", + "broken", + "failed", + "poor", + "draining", + 
"overwhelming", + "disappointing", + ] + + positive_count = sum(1 for word in positive_words if word in text_lower) + negative_count = sum(1 for word in negative_words if word in text_lower) + + if positive_count > negative_count: + return "positive", random.uniform(0.6, 0.9) + elif negative_count > positive_count: + return "negative", random.uniform(0.6, 0.9) + else: + return "neutral", random.uniform(0.5, 0.7) + + +def bootstrap(): + """Bootstrap the pytorch_nlp template with sample data.""" + repo_path = pathlib.Path(__file__).parent.absolute() / "feature_repo" + raw_project_name = pathlib.Path(__file__).parent.absolute().name + + # Sanitize project name for SQLite compatibility (no hyphens allowed) + project_name = raw_project_name.replace("-", "_") + if project_name != raw_project_name: + print(f" ā„¹ļø Project name sanitized: '{raw_project_name}' → '{project_name}'") + print(" šŸ’” SQLite table names cannot contain hyphens") + + data_path = repo_path / "data" + data_path.mkdir(exist_ok=True) + + print("šŸŽ­ Setting up sentiment analysis data for PyTorch NLP demonstration...") + + parquet_file = data_path / "sentiment_data.parquet" + + # Generate sentiment data + print(" šŸ“ Generating synthetic sentiment analysis dataset...") + df = create_sentiment_data(num_samples=1000) + + # Save to parquet + table = pa.Table.from_pandas(df) + pq.write_table(table, parquet_file) + + print(f" āœ… Created sentiment dataset with {len(df)} samples") + print(" šŸ“Š Sentiment distribution:") + sentiment_counts = df["sentiment_label"].value_counts() + for sentiment, count in sentiment_counts.items(): + print(f" - {sentiment.capitalize()}: {count} samples") + + # Replace template placeholders + example_py_file = repo_path / "example_repo.py" + replace_str_in_file(example_py_file, "%PROJECT_NAME%", str(project_name)) + + test_workflow_file = repo_path / "test_workflow.py" + replace_str_in_file(test_workflow_file, "%PROJECT_NAME%", str(project_name)) + + print("šŸš€ PyTorch NLP 
template initialized successfully!") + + print("\nšŸŽÆ To get started:") + print(f" 1. cd {project_name}") + print(" 2. pip install -r requirements.txt") + print(" 3. cd feature_repo") + print(" 4. feast apply") + print(" 5. feast materialize") + print(" 6. python test_workflow.py") + print("\nšŸ’” This template demonstrates:") + print(" - Text feature engineering with Feast") + print(" - PyTorch + Hugging Face transformers integration") + print(" - Sentiment analysis with pre-trained models") + print(" - Online and offline feature serving") + + +if __name__ == "__main__": + bootstrap() diff --git a/sdk/python/feast/templates/pytorch_nlp/feature_repo/__init__.py b/sdk/python/feast/templates/pytorch_nlp/feature_repo/__init__.py new file mode 100644 index 00000000000..5d81b19ce14 --- /dev/null +++ b/sdk/python/feast/templates/pytorch_nlp/feature_repo/__init__.py @@ -0,0 +1 @@ +# This file is auto-generated by the Feast project template diff --git a/sdk/python/feast/templates/pytorch_nlp/feature_repo/example_repo.py b/sdk/python/feast/templates/pytorch_nlp/feature_repo/example_repo.py new file mode 100644 index 00000000000..e78614aacea --- /dev/null +++ b/sdk/python/feast/templates/pytorch_nlp/feature_repo/example_repo.py @@ -0,0 +1,272 @@ +""" +PyTorch NLP Sentiment Analysis Feature Repository + +This template demonstrates sentiment analysis using: +- Text feature engineering for NLP +- PyTorch + Hugging Face transformers integration +- On-demand sentiment prediction features +- Online and offline feature serving patterns +""" + +from datetime import timedelta +from pathlib import Path + +import pandas as pd + +from feast import ( + Entity, + FeatureService, + FeatureView, + Field, + FileSource, + RequestSource, + ValueType, +) +from feast.on_demand_feature_view import on_demand_feature_view +from feast.types import Array, Float32, Int64, String + +# Configuration +repo_path = Path(__file__).parent +data_path = repo_path / "data" + +# Define entities - primary keys 
for joining data +text_entity = Entity( + name="text", + join_keys=["text_id"], + value_type=ValueType.STRING, + description="Unique identifier for text samples", +) + +user_entity = Entity( + name="user", + join_keys=["user_id"], + value_type=ValueType.STRING, + description="User who created the text content", +) + +# Data source - points to the parquet file created by bootstrap +sentiment_source = FileSource( + name="sentiment_data_source", + path=str(data_path / "sentiment_data.parquet"), + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +# Feature view for text metadata and engineered features +text_features_fv = FeatureView( + name="text_features", + entities=[text_entity], + ttl=timedelta(days=7), # Keep features for 7 days + schema=[ + Field(name="text_content", dtype=String, description="Raw text content"), + Field( + name="sentiment_label", + dtype=String, + description="Ground truth sentiment label", + ), + Field( + name="sentiment_score", + dtype=Float32, + description="Ground truth sentiment score", + ), + Field(name="text_length", dtype=Int64, description="Character count of text"), + Field(name="word_count", dtype=Int64, description="Word count of text"), + Field( + name="exclamation_count", + dtype=Int64, + description="Number of exclamation marks", + ), + Field(name="caps_ratio", dtype=Float32, description="Ratio of capital letters"), + Field( + name="emoji_count", dtype=Int64, description="Number of emoji characters" + ), + ], + online=True, + source=sentiment_source, + tags={"team": "nlp", "domain": "sentiment_analysis"}, +) + +# Feature view for user-level aggregations +user_stats_fv = FeatureView( + name="user_stats", + entities=[user_entity], + ttl=timedelta(days=30), # User stats change less frequently + schema=[ + Field( + name="user_avg_sentiment", + dtype=Float32, + description="User's average sentiment score", + ), + Field( + name="user_text_count", + dtype=Int64, + description="Total number of texts by user", 
# On-demand feature view for real-time sentiment prediction
@on_demand_feature_view(
    sources=[text_input_request],
    schema=[
        Field(name="predicted_sentiment", dtype=String),
        Field(name="sentiment_confidence", dtype=Float32),
        Field(name="positive_prob", dtype=Float32),
        Field(name="negative_prob", dtype=Float32),
        Field(name="neutral_prob", dtype=Float32),
        Field(name="text_embedding", dtype=Array(Float32)),
    ],
)
def sentiment_prediction(inputs: pd.DataFrame) -> pd.DataFrame:
    """Predict sentiment for every ``input_text`` row at request time.

    Demonstrates how a PyTorch/Hugging Face model can be called from inside a
    Feast on-demand feature view.

    Args:
        inputs: Request data; only the ``input_text`` column is read.

    Returns:
        One row per input row with the columns ``predicted_sentiment``,
        ``sentiment_confidence``, ``positive_prob``, ``negative_prob``,
        ``neutral_prob`` and ``text_embedding``. When ``transformers`` is not
        installed, the model cannot be loaded, or a single text fails to
        process, a fixed neutral placeholder (0.5 confidence, zero embedding)
        is returned for the affected rows instead of raising.
    """
    # Helpers are kept local so the UDF stays self-contained.
    import logging

    # numpy is imported on its own so the fallback path below can always use it,
    # even when the optional ``transformers`` import fails.
    import numpy as np

    logger = logging.getLogger(__name__)

    embedding_dim = 384  # length of the placeholder embedding vector
    float_columns = [
        "sentiment_confidence",
        "positive_prob",
        "negative_prob",
        "neutral_prob",
    ]
    columns = ["predicted_sentiment", *float_columns, "text_embedding"]

    # Older checkpoints emit LABEL_n; already-readable labels pass through.
    label_map = {
        "LABEL_0": "negative",
        "LABEL_1": "neutral",
        "LABEL_2": "positive",
    }

    def neutral_row() -> dict:
        """Placeholder prediction for a single text."""
        return {
            "predicted_sentiment": "neutral",
            "sentiment_confidence": np.float32(0.5),
            "positive_prob": np.float32(0.33),
            "negative_prob": np.float32(0.33),
            "neutral_prob": np.float32(0.34),
            "text_embedding": [np.float32(0.0)] * embedding_dim,
        }

    def neutral_frame() -> pd.DataFrame:
        """Placeholder predictions for the whole request."""
        n = len(inputs)
        return pd.DataFrame(
            {
                "predicted_sentiment": ["neutral"] * n,
                "sentiment_confidence": np.full(n, 0.5, dtype=np.float32),
                "positive_prob": np.full(n, 0.33, dtype=np.float32),
                "negative_prob": np.full(n, 0.33, dtype=np.float32),
                "neutral_prob": np.full(n, 0.34, dtype=np.float32),
                "text_embedding": [
                    [np.float32(0.0)] * embedding_dim for _ in range(n)
                ],
            }
        )

    try:
        from transformers import pipeline
    except ImportError:
        # Optional dependency missing: serve dummy predictions.
        logger.debug("transformers is not installed; returning neutral predictions")
        return neutral_frame()

    # Initialize model (in production, you'd want to cache this)
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    try:
        sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model=model_name,
            tokenizer=model_name,
            top_k=None,  # all class scores; replaces deprecated return_all_scores=True
            device="cpu",  # Force CPU to avoid MPS forking issues on macOS
        )
    except Exception:
        # Best-effort demo: keep serving, but leave a trace of what went wrong.
        logger.warning(
            "Could not load %s; returning neutral predictions",
            model_name,
            exc_info=True,
        )
        return neutral_frame()

    results = []
    for text in inputs["input_text"]:
        try:
            predictions = sentiment_pipeline(text)

            # For a single string the pipeline may wrap the per-class scores in
            # an extra list ([[{...}, ...]]); unwrap it so each item is a dict.
            if predictions and isinstance(predictions[0], list):
                predictions = predictions[0]

            scores = {
                label_map.get(pred["label"], pred["label"]): pred["score"]
                for pred in predictions
            }
            best_pred = max(predictions, key=lambda pred: pred["score"])

            # Dummy random embedding for the demo; a real implementation would
            # take it from the model's hidden states.
            embedding = np.random.rand(embedding_dim)

            results.append(
                {
                    "predicted_sentiment": label_map.get(
                        best_pred["label"], best_pred["label"]
                    ),
                    "sentiment_confidence": np.float32(best_pred["score"]),
                    "positive_prob": np.float32(scores.get("positive", 0.0)),
                    "negative_prob": np.float32(scores.get("negative", 0.0)),
                    "neutral_prob": np.float32(scores.get("neutral", 0.0)),
                    "text_embedding": [np.float32(x) for x in embedding],
                }
            )
        except Exception:
            # One bad text must not fail the whole request.
            logger.warning(
                "Sentiment prediction failed for one input; using neutral fallback",
                exc_info=True,
            )
            results.append(neutral_row())

    # Fixed column set (also for an empty request) and float32 to match the schema.
    frame = pd.DataFrame(results, columns=columns)
    return frame.astype({name: np.float32 for name in float_columns})
features for model training and evaluation", +) diff --git a/sdk/python/feast/templates/pytorch_nlp/feature_repo/feature_store.yaml b/sdk/python/feast/templates/pytorch_nlp/feature_repo/feature_store.yaml new file mode 100644 index 00000000000..e7c306623c0 --- /dev/null +++ b/sdk/python/feast/templates/pytorch_nlp/feature_repo/feature_store.yaml @@ -0,0 +1,9 @@ +project: my_project +provider: local +registry: data/registry.db +online_store: + type: sqlite + path: data/online_store.db +offline_store: + type: file +entity_key_serialization_version: 3 \ No newline at end of file diff --git a/sdk/python/feast/templates/pytorch_nlp/feature_repo/test_workflow.py b/sdk/python/feast/templates/pytorch_nlp/feature_repo/test_workflow.py new file mode 100644 index 00000000000..efca72fc3bc --- /dev/null +++ b/sdk/python/feast/templates/pytorch_nlp/feature_repo/test_workflow.py @@ -0,0 +1,350 @@ +""" +PyTorch NLP Sentiment Analysis - Complete Test Workflow + +This script demonstrates the full lifecycle of a sentiment analysis project using Feast: +1. Feature store setup and deployment +2. Historical feature retrieval for model training +3. Online feature serving for real-time inference +4. Integration with PyTorch and Hugging Face models +5. Performance evaluation and monitoring +""" + +import subprocess +from datetime import datetime, timedelta + +import pandas as pd + +from feast import FeatureStore + + +def run_demo(): + """Run the complete PyTorch NLP sentiment analysis demo.""" + print("šŸŽ­ PyTorch NLP Sentiment Analysis Demo") + print("=====================================") + + store = FeatureStore(repo_path=".") + + # 1. Deploy feature definitions + print("\nšŸš€ Step 1: Deploy feature definitions") + print("--- Run feast apply ---") + subprocess.run(["feast", "apply"]) + + # 2. 
def fetch_historical_features_for_training(store: FeatureStore) -> pd.DataFrame:
    """Fetch a small point-in-time-correct training set from the offline store.

    Args:
        store: Feature store whose ``text_features`` and ``user_stats`` feature
            views have already been applied.

    Returns:
        The joined training DataFrame: one row per entity row, with the
        requested text and user features.
    """
    # Create entity DataFrame for training
    # In practice, this would come from your ML pipeline or data warehouse
    text_ids = [
        "text_0001",
        "text_0002",
        "text_0003",
        "text_0004",
        "text_0005",
        "text_0010",
        "text_0015",
        "text_0020",
        "text_0025",
        "text_0030",
    ]
    user_ids = [
        "user_001",
        "user_002",
        "user_001",
        "user_003",
        "user_002",
        "user_001",
        "user_004",
        "user_003",
        "user_005",
        "user_001",
    ]

    # A point-in-time join only returns feature rows stamped at or before the
    # entity timestamp. Timestamps far in the past (the previous hard-coded
    # 2023 dates) therefore match nothing and yield all-null features.
    # NOTE(review): assumes the bootstrap data is stamped within the last
    # ~30 days, as the generator's comment states -- confirm. Rows outside a
    # feature view's TTL may still come back null.
    as_of = pd.Timestamp.now(tz="UTC")

    entity_df = pd.DataFrame(
        {
            "text_id": text_ids,
            "user_id": user_ids,
            "event_timestamp": [as_of] * len(text_ids),
        }
    )

    # Fetch historical features by explicit "<feature_view>:<feature>" references
    print(" šŸ“Š Retrieving training dataset with point-in-time correctness...")
    training_df = store.get_historical_features(
        entity_df=entity_df,
        features=[
            "text_features:text_content",
            "text_features:sentiment_label",
            "text_features:sentiment_score",
            "text_features:text_length",
            "text_features:word_count",
            "text_features:exclamation_count",
            "text_features:caps_ratio",
            "text_features:emoji_count",
            "user_stats:user_avg_sentiment",
            "user_stats:user_text_count",
        ],
    ).to_df()

    print(f" āœ… Retrieved {len(training_df)} training samples")
    print(" šŸ“‹ Sample training data:")
    print(
        training_df[
            ["text_content", "sentiment_label", "text_length", "word_count"]
        ].head(3)
    )

    return training_df
def test_online_inference(store: FeatureStore):
    """Look up a few entities in the online store and print what comes back."""
    print(" ⚔ Testing real-time feature serving...")

    # (text_id, user_id) pairs to look up.
    lookup_keys = [
        ("text_0001", "user_001"),
        ("text_0002", "user_002"),
        ("text_0005", "user_002"),
    ]
    entity_rows = [
        {"text_id": text_id, "user_id": user_id} for text_id, user_id in lookup_keys
    ]

    requested_features = [
        "text_features:text_content",
        "text_features:text_length",
        "text_features:word_count",
        "user_stats:user_avg_sentiment",
    ]

    response = store.get_online_features(
        features=requested_features,
        entity_rows=entity_rows,
    )
    online_features = response.to_dict()

    print(" šŸ“Š Retrieved online features:")
    for key, values in online_features.items():
        if key != "text_content":
            print(f" {key}: {values}")
            continue

        # Raw text can be long: show at most 50 characters per value.
        display_values = []
        for value in values:
            rendered = str(value)
            if len(rendered) > 50:
                rendered = rendered[:50] + "..."
            display_values.append(rendered)
        print(f" {key}: {display_values}")
def test_feature_services(store: FeatureStore):
    """Fetch online features through each versioned sentiment feature service.

    The v1 and v2 services are queried for the same single entity row. A
    failure in one service is reported and does not stop the other.
    """
    print(" šŸŽÆ Testing feature services...")

    entity_rows = [{"text_id": "text_0001", "user_id": "user_001"}]

    # v1 is the basic service, v2 the advanced one with user context.
    for version in ("v1", "v2"):
        service_name = f"sentiment_analysis_{version}"
        print(f" šŸ“¦ Testing {service_name} feature service...")
        try:
            service = store.get_feature_service(service_name)
            response = store.get_online_features(
                features=service,
                entity_rows=entity_rows,
            )
            retrieved = response.to_dict()
            print(f" āœ… Retrieved {len(retrieved)} feature types for {version}")
        except Exception as err:
            print(f" āš ļø Feature service {version} failed: {err}")
Compare predicted vs actual labels + # 2. Calculate accuracy, precision, recall, F1-score + # 3. Generate confusion matrix + # 4. Analyze error cases + # 5. Monitor model drift over time + + else: + print(" āš ļø No evaluation data available") + + except Exception as e: + print(f" āš ļø Evaluation failed: {e}") + + +if __name__ == "__main__": + run_demo() diff --git a/sdk/python/feast/templates/pytorch_nlp/gitignore b/sdk/python/feast/templates/pytorch_nlp/gitignore new file mode 100644 index 00000000000..88196cd1f22 --- /dev/null +++ b/sdk/python/feast/templates/pytorch_nlp/gitignore @@ -0,0 +1,180 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be added to the global gitignore or merged into this project gitignore. 
For a PyCharm +# project, it is recommended to include the following directory in .gitignore: +# https://intellij-support.jetbrains.com/hc/en/articles/206544839 + +# Feast-specific +feature_repo/data/*.db +feature_repo/data/*.db.lock +feature_repo/data/online_store.db* +feature_repo/data/registry.db* +feature_repo/data/registry.pb* + +# Model cache +.cache/ +huggingface/ +transformers_cache/ + +# MacOS +.DS_Store + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Logs +*.log \ No newline at end of file