From bcde20c2795dfd58bea97dfbba639491d372f96d Mon Sep 17 00:00:00 2001 From: Theodor Mihalache Date: Fri, 11 Oct 2024 13:39:08 -0400 Subject: [PATCH 1/3] fix: changes following issue 4593 Signed-off-by: Theodor Mihalache --- docs/getting-started/concepts/README.md | 4 ++ docs/getting-started/concepts/feature-view.md | 2 +- docs/getting-started/concepts/overview.md | 6 +-- docs/getting-started/concepts/project.md | 19 ++++++++ docs/getting-started/quickstart.md | 35 +++++++++++---- sdk/python/feast/templates/athena/.gitignore | 44 +++++++++++++++++++ sdk/python/feast/templates/aws/.gitignore | 44 +++++++++++++++++++ .../feast/templates/cassandra/.gitignore | 44 +++++++++++++++++++ .../feast/templates/cassandra/bootstrap.py | 4 +- sdk/python/feast/templates/gcp/.gitignore | 44 +++++++++++++++++++ .../feast/templates/hazelcast/.gitignore | 44 +++++++++++++++++++ .../feast/templates/hazelcast/bootstrap.py | 4 +- sdk/python/feast/templates/hbase/.gitignore | 44 +++++++++++++++++++ sdk/python/feast/templates/hbase/bootstrap.py | 4 +- sdk/python/feast/templates/local/.gitignore | 44 +++++++++++++++++++ sdk/python/feast/templates/local/bootstrap.py | 8 +++- sdk/python/feast/templates/minimal/.gitignore | 44 +++++++++++++++++++ .../feast/templates/postgres/.gitignore | 44 +++++++++++++++++++ .../feast/templates/snowflake/.gitignore | 44 +++++++++++++++++++ sdk/python/feast/templates/spark/.gitignore | 44 +++++++++++++++++++ 20 files changed, 550 insertions(+), 20 deletions(-) create mode 100644 docs/getting-started/concepts/project.md create mode 100644 sdk/python/feast/templates/athena/.gitignore create mode 100644 sdk/python/feast/templates/aws/.gitignore create mode 100644 sdk/python/feast/templates/cassandra/.gitignore create mode 100644 sdk/python/feast/templates/gcp/.gitignore create mode 100644 sdk/python/feast/templates/hazelcast/.gitignore create mode 100644 sdk/python/feast/templates/hbase/.gitignore create mode 100644 sdk/python/feast/templates/local/.gitignore 
create mode 100644 sdk/python/feast/templates/minimal/.gitignore create mode 100644 sdk/python/feast/templates/postgres/.gitignore create mode 100644 sdk/python/feast/templates/snowflake/.gitignore create mode 100644 sdk/python/feast/templates/spark/.gitignore diff --git a/docs/getting-started/concepts/README.md b/docs/getting-started/concepts/README.md index a32c53b5f4e..95e1a14bf1f 100644 --- a/docs/getting-started/concepts/README.md +++ b/docs/getting-started/concepts/README.md @@ -4,6 +4,10 @@ [overview.md](overview.md) {% endcontent-ref %} +{% content-ref url="project.md" %} +[project.md](project.md) +{% endcontent-ref %} + {% content-ref url="data-ingestion.md" %} [data-ingestion.md](data-ingestion.md) {% endcontent-ref %} diff --git a/docs/getting-started/concepts/feature-view.md b/docs/getting-started/concepts/feature-view.md index ccb380497d8..6ebe4feacff 100644 --- a/docs/getting-started/concepts/feature-view.md +++ b/docs/getting-started/concepts/feature-view.md @@ -14,7 +14,7 @@ Feature views consist of: * zero or more [entities](entity.md) * If the features are not related to a specific object, the feature view might not have entities; see [feature views without entities](feature-view.md#feature-views-without-entities) below. * a name to uniquely identify this feature view in the project. 
-* (optional, but recommended) a schema specifying one or more [features](feature-view.md#feature) (without this, Feast will infer the schema by reading from the data source) +* (optional, but recommended) a schema specifying one or more [features](feature-view.md#field) (without this, Feast will infer the schema by reading from the data source) * (optional, but recommended) metadata (for example, description, or other free-form metadata via `tags`) * (optional) a TTL, which limits how far back Feast will look when generating historical datasets diff --git a/docs/getting-started/concepts/overview.md b/docs/getting-started/concepts/overview.md index ffbad86c037..f17db15170d 100644 --- a/docs/getting-started/concepts/overview.md +++ b/docs/getting-started/concepts/overview.md @@ -2,11 +2,7 @@ ### Feast project structure -The top-level namespace within Feast is a **project**. Users define one or more [feature views](feature-view.md) within a project. Each feature view contains one or more [features](feature-view.md#feature). These features typically relate to one or more [entities](entity.md). A feature view must always have a [data source](data-ingestion.md), which in turn is used during the generation of training [datasets](feature-retrieval.md#dataset) and when materializing feature values into the online store. - -![](<../../.gitbook/assets/image (7).png>) - -**Projects** provide complete isolation of feature stores at the infrastructure level. This is accomplished through resource namespacing, e.g., prefixing table names with the associated project. Each project should be considered a completely separate universe of entities and features. It is not possible to retrieve features from multiple projects in a single request. We recommend having a single feature store and a single project per environment (`dev`, `staging`, `prod`). +The top-level namespace within Feast is a [project](project.md). 
### Data ingestion diff --git a/docs/getting-started/concepts/project.md b/docs/getting-started/concepts/project.md new file mode 100644 index 00000000000..713e1410f61 --- /dev/null +++ b/docs/getting-started/concepts/project.md @@ -0,0 +1,19 @@ +# Project + +Projects provide complete isolation of feature stores at the infrastructure level. This is accomplished through resource namespacing, e.g., prefixing table names with the associated project. Each project should be considered a completely separate universe of entities and features. It is not possible to retrieve features from multiple projects in a single request. We recommend having a single feature store and a single project per environment (`dev`, `staging`, `prod`). + +![](<../../.gitbook/assets/image (7).png>) + +Users define one or more [feature views](feature-view.md) within a project. Each feature view contains one or more [features](feature-view.md#field). These features typically relate to one or more [entities](entity.md). A feature view must always have a [data source](data-ingestion.md), which in turn is used during the generation of training [datasets](feature-retrieval.md#dataset) and when materializing feature values into the online store. + +The concept of project provide the following benefits: + +**Logical Grouping**: Projects group related features together, making it easier to manage and track them. + +**Feature Definitions**: Within a project, you can define features, including their metadata, types, and sources. This helps standardize how features are created and consumed. + +**Isolation**: Projects provide a way to isolate different environments, such as development, testing, and production, ensuring that changes in one project do not affect others. + +**Collaboration**: By organizing features within projects, teams can collaborate more effectively, with clear boundaries around the features they are responsible for. 
+ +**Access Control**: Projects can implement permissions, allowing different users or teams to access only the features relevant to their work. \ No newline at end of file diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index 4afd0086d9b..76ba606f64b 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -1,6 +1,23 @@ # Quickstart -In this tutorial we will +## What is Feast? + +Feast (Feature Store) is an open-source feature store designed to facilitate the management and serving of machine learning features in a way that supports both batch and real-time applications. + +For more info refer to [Introduction to feast](../README.md) + +## Prerequisites +* Ensure that you have Python (3.9 or above) installed. +* It is recommended to create and work in a virtual environment: + ```sh + # create & activate a virtual environment + python -m venv venv/ + source venv/bin/activate + ``` + +## Overview + +In this tutorial we will: 1. Deploy a local feature store with a **Parquet file offline store** and **Sqlite online store**. 2. Build a training dataset using our time series features from our **Parquet files**. @@ -9,7 +26,7 @@ In this tutorial we will 5. Read the latest features from the online store for real-time inference. 6. Explore the (experimental) Feast UI -## Overview +***Note*** - Feast can used as an executable or as a server, please refer to [feast feature server](../reference/feature-servers/python-feature-server.md) In this tutorial, we'll use Feast to generate training data and power online model inference for a ride-sharing driver satisfaction prediction model. Feast solves several common issues in this flow: @@ -279,7 +296,7 @@ There's an included `test_workflow.py` file which runs through a full sample wor 7. 
Verify online features are updated / fresher We'll walk through some snippets of code below and explain -### Step 3a: Register feature definitions and deploy your feature store +### Step 4: Register feature definitions and deploy your feature store The `apply` command scans python files in the current directory for feature view/entity definitions, registers the objects, and deploys infrastructure. In this example, it reads `example_repo.py` and sets up SQLite online store tables. Note that we had specified SQLite as the default online store by @@ -311,7 +328,7 @@ Created sqlite table my_project_driver_hourly_stats {% endtab %} {% endtabs %} -### Step 3b: Generating training data or powering batch scoring models +### Step 5: Generating training data or powering batch scoring models To train a model, we need features and labels. Often, this label data is stored separately (e.g. you have one table storing user survey results and another set of tables with feature values). Feast can help generate the features that map to these labels. @@ -466,7 +483,7 @@ print(training_df.head()) ``` {% endtab %} {% endtabs %} -### Step 3c: Ingest batch features into your online store +### Step 6: Ingest batch features into your online store We now serialize the latest values of features since the beginning of time to prepare for serving (note: `materialize-incremental` serializes all new features since the last `materialize` call). @@ -499,7 +516,7 @@ Materializing 2 feature views to 2024-04-19 10:59:58-04:00 into the sqlite onlin {% endtab %} {% endtabs %} -### Step 3d: Fetching feature vectors for inference +### Step 7: Fetching feature vectors for inference At inference time, we need to quickly read the latest feature values for different drivers (which otherwise might have existed only in batch sources) from the online feature store using `get_online_features()`. 
These feature @@ -544,7 +561,7 @@ pprint(feature_vector) {% endtab %} {% endtabs %} -### Step 3e: Using a feature service to fetch online features instead. +### Step 8: Using a feature service to fetch online features instead. You can also use feature services to manage multiple features, and decouple feature view definitions and the features needed by end applications. The feature store can also be used to fetch either online or historical @@ -594,7 +611,7 @@ pprint(feature_vector) {% endtab %} {% endtabs %} -## Step 4: Browse your features with the Web UI (experimental) +## Step 9: Browse your features with the Web UI (experimental) View all registered features, data sources, entities, and feature services with the Web UI. @@ -626,7 +643,7 @@ INFO: Uvicorn running on http://0.0.0.0:8888 (Press CTRL+C to quit) ![](../reference/ui.png) -## Step 5: Re-examine `test_workflow.py` +## Step 10: Re-examine `test_workflow.py` Take a look at `test_workflow.py` again. It showcases many sample flows on how to interact with Feast. You'll see these show up in the upcoming concepts + architecture + tutorial pages as well. 
diff --git a/sdk/python/feast/templates/athena/.gitignore b/sdk/python/feast/templates/athena/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/athena/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/aws/.gitignore b/sdk/python/feast/templates/aws/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/aws/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/cassandra/.gitignore b/sdk/python/feast/templates/cassandra/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/cassandra/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml 
+coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/cassandra/bootstrap.py b/sdk/python/feast/templates/cassandra/bootstrap.py index fa70917914f..33385141145 100644 --- a/sdk/python/feast/templates/cassandra/bootstrap.py +++ b/sdk/python/feast/templates/cassandra/bootstrap.py @@ -275,7 +275,9 @@ def bootstrap(): # example_repo.py example_py_file = repo_path / "example_repo.py" - replace_str_in_file(example_py_file, "%PARQUET_PATH%", str(driver_stats_path)) + replace_str_in_file( + example_py_file, "%PARQUET_PATH%", str(driver_stats_path.relative_to(repo_path)) + ) # store config yaml, interact with user and then customize file: settings = collect_cassandra_store_settings() diff --git a/sdk/python/feast/templates/gcp/.gitignore b/sdk/python/feast/templates/gcp/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/gcp/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/hazelcast/.gitignore b/sdk/python/feast/templates/hazelcast/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/hazelcast/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution 
/ packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/hazelcast/bootstrap.py b/sdk/python/feast/templates/hazelcast/bootstrap.py index e5018e4fe02..7a2b49d2493 100644 --- a/sdk/python/feast/templates/hazelcast/bootstrap.py +++ b/sdk/python/feast/templates/hazelcast/bootstrap.py @@ -165,7 +165,9 @@ def bootstrap(): # example_repo.py example_py_file = repo_path / "example_repo.py" - replace_str_in_file(example_py_file, "%PARQUET_PATH%", str(driver_stats_path)) + replace_str_in_file( + example_py_file, "%PARQUET_PATH%", str(driver_stats_path.relative_to(repo_path)) + ) # store config yaml, interact with user and then customize file: settings = collect_hazelcast_online_store_settings() diff --git a/sdk/python/feast/templates/hbase/.gitignore b/sdk/python/feast/templates/hbase/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/hbase/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/hbase/bootstrap.py b/sdk/python/feast/templates/hbase/bootstrap.py index 125eb7c2e72..94be8e441da 100644 --- a/sdk/python/feast/templates/hbase/bootstrap.py +++ 
b/sdk/python/feast/templates/hbase/bootstrap.py @@ -23,7 +23,9 @@ def bootstrap(): driver_df.to_parquet(path=str(driver_stats_path), allow_truncated_timestamps=True) example_py_file = repo_path / "example_repo.py" - replace_str_in_file(example_py_file, "%PARQUET_PATH%", str(driver_stats_path)) + replace_str_in_file( + example_py_file, "%PARQUET_PATH%", str(driver_stats_path.relative_to(repo_path)) + ) if __name__ == "__main__": diff --git a/sdk/python/feast/templates/local/.gitignore b/sdk/python/feast/templates/local/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/local/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/local/bootstrap.py b/sdk/python/feast/templates/local/bootstrap.py index e2c1efdbc49..9f6a5a6c969 100644 --- a/sdk/python/feast/templates/local/bootstrap.py +++ b/sdk/python/feast/templates/local/bootstrap.py @@ -25,8 +25,12 @@ def bootstrap(): example_py_file = repo_path / "example_repo.py" replace_str_in_file(example_py_file, "%PROJECT_NAME%", str(project_name)) - replace_str_in_file(example_py_file, "%PARQUET_PATH%", str(driver_stats_path)) - replace_str_in_file(example_py_file, "%LOGGING_PATH%", str(data_path)) + replace_str_in_file( + example_py_file, "%PARQUET_PATH%", str(driver_stats_path.relative_to(repo_path)) + ) + replace_str_in_file( + example_py_file, "%LOGGING_PATH%", str(data_path.relative_to(repo_path)) + ) if __name__ == "__main__": diff --git 
a/sdk/python/feast/templates/minimal/.gitignore b/sdk/python/feast/templates/minimal/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/minimal/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/postgres/.gitignore b/sdk/python/feast/templates/postgres/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/postgres/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/snowflake/.gitignore b/sdk/python/feast/templates/snowflake/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/snowflake/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml 
+coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db diff --git a/sdk/python/feast/templates/spark/.gitignore b/sdk/python/feast/templates/spark/.gitignore new file mode 100644 index 00000000000..c47cbcd857f --- /dev/null +++ b/sdk/python/feast/templates/spark/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +*.egg-info/ +dist/ +build/ + +# Pytest +.cache +*.cover +*.log +.coverage +nosetests.xml +coverage.xml +*.hypothesis/ +*.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo +*.sublime-workspace +*.sublime-project + +# OS generated files +.DS_Store +Thumbs.db From 98aa4816927edba14d782695aaef56b378f18d6e Mon Sep 17 00:00:00 2001 From: Theodor Mihalache Date: Mon, 14 Oct 2024 10:08:54 -0400 Subject: [PATCH 2/3] fix: changes following issue 4593 - Fixed file path in templates to be relative path Signed-off-by: Theodor Mihalache --- docs/getting-started/concepts/README.md | 4 -- docs/getting-started/concepts/feature-view.md | 2 +- docs/getting-started/concepts/overview.md | 6 ++- docs/getting-started/concepts/project.md | 19 -------- docs/getting-started/quickstart.md | 35 ++++----------- sdk/python/feast/templates/athena/.gitignore | 44 ------------------- sdk/python/feast/templates/aws/.gitignore | 44 ------------------- .../feast/templates/cassandra/.gitignore | 44 ------------------- sdk/python/feast/templates/gcp/.gitignore | 44 ------------------- .../feast/templates/hazelcast/.gitignore | 44 ------------------- sdk/python/feast/templates/hbase/.gitignore | 44 ------------------- sdk/python/feast/templates/local/.gitignore | 44 ------------------- 
sdk/python/feast/templates/minimal/.gitignore | 44 ------------------- .../feast/templates/postgres/.gitignore | 44 ------------------- .../feast/templates/snowflake/.gitignore | 44 ------------------- sdk/python/feast/templates/spark/.gitignore | 44 ------------------- 16 files changed, 15 insertions(+), 535 deletions(-) delete mode 100644 docs/getting-started/concepts/project.md delete mode 100644 sdk/python/feast/templates/athena/.gitignore delete mode 100644 sdk/python/feast/templates/aws/.gitignore delete mode 100644 sdk/python/feast/templates/cassandra/.gitignore delete mode 100644 sdk/python/feast/templates/gcp/.gitignore delete mode 100644 sdk/python/feast/templates/hazelcast/.gitignore delete mode 100644 sdk/python/feast/templates/hbase/.gitignore delete mode 100644 sdk/python/feast/templates/local/.gitignore delete mode 100644 sdk/python/feast/templates/minimal/.gitignore delete mode 100644 sdk/python/feast/templates/postgres/.gitignore delete mode 100644 sdk/python/feast/templates/snowflake/.gitignore delete mode 100644 sdk/python/feast/templates/spark/.gitignore diff --git a/docs/getting-started/concepts/README.md b/docs/getting-started/concepts/README.md index 95e1a14bf1f..a32c53b5f4e 100644 --- a/docs/getting-started/concepts/README.md +++ b/docs/getting-started/concepts/README.md @@ -4,10 +4,6 @@ [overview.md](overview.md) {% endcontent-ref %} -{% content-ref url="project.md" %} -[project.md](project.md) -{% endcontent-ref %} - {% content-ref url="data-ingestion.md" %} [data-ingestion.md](data-ingestion.md) {% endcontent-ref %} diff --git a/docs/getting-started/concepts/feature-view.md b/docs/getting-started/concepts/feature-view.md index 6ebe4feacff..ccb380497d8 100644 --- a/docs/getting-started/concepts/feature-view.md +++ b/docs/getting-started/concepts/feature-view.md @@ -14,7 +14,7 @@ Feature views consist of: * zero or more [entities](entity.md) * If the features are not related to a specific object, the feature view might not have entities; 
see [feature views without entities](feature-view.md#feature-views-without-entities) below. * a name to uniquely identify this feature view in the project. -* (optional, but recommended) a schema specifying one or more [features](feature-view.md#field) (without this, Feast will infer the schema by reading from the data source) +* (optional, but recommended) a schema specifying one or more [features](feature-view.md#feature) (without this, Feast will infer the schema by reading from the data source) * (optional, but recommended) metadata (for example, description, or other free-form metadata via `tags`) * (optional) a TTL, which limits how far back Feast will look when generating historical datasets diff --git a/docs/getting-started/concepts/overview.md b/docs/getting-started/concepts/overview.md index f17db15170d..ffbad86c037 100644 --- a/docs/getting-started/concepts/overview.md +++ b/docs/getting-started/concepts/overview.md @@ -2,7 +2,11 @@ ### Feast project structure -The top-level namespace within Feast is a [project](project.md). +The top-level namespace within Feast is a **project**. Users define one or more [feature views](feature-view.md) within a project. Each feature view contains one or more [features](feature-view.md#feature). These features typically relate to one or more [entities](entity.md). A feature view must always have a [data source](data-ingestion.md), which in turn is used during the generation of training [datasets](feature-retrieval.md#dataset) and when materializing feature values into the online store. + +![](<../../.gitbook/assets/image (7).png>) + +**Projects** provide complete isolation of feature stores at the infrastructure level. This is accomplished through resource namespacing, e.g., prefixing table names with the associated project. Each project should be considered a completely separate universe of entities and features. It is not possible to retrieve features from multiple projects in a single request. 
We recommend having a single feature store and a single project per environment (`dev`, `staging`, `prod`). ### Data ingestion diff --git a/docs/getting-started/concepts/project.md b/docs/getting-started/concepts/project.md deleted file mode 100644 index 713e1410f61..00000000000 --- a/docs/getting-started/concepts/project.md +++ /dev/null @@ -1,19 +0,0 @@ -# Project - -Projects provide complete isolation of feature stores at the infrastructure level. This is accomplished through resource namespacing, e.g., prefixing table names with the associated project. Each project should be considered a completely separate universe of entities and features. It is not possible to retrieve features from multiple projects in a single request. We recommend having a single feature store and a single project per environment (`dev`, `staging`, `prod`). - -![](<../../.gitbook/assets/image (7).png>) - -Users define one or more [feature views](feature-view.md) within a project. Each feature view contains one or more [features](feature-view.md#field). These features typically relate to one or more [entities](entity.md). A feature view must always have a [data source](data-ingestion.md), which in turn is used during the generation of training [datasets](feature-retrieval.md#dataset) and when materializing feature values into the online store. - -The concept of project provide the following benefits: - -**Logical Grouping**: Projects group related features together, making it easier to manage and track them. - -**Feature Definitions**: Within a project, you can define features, including their metadata, types, and sources. This helps standardize how features are created and consumed. - -**Isolation**: Projects provide a way to isolate different environments, such as development, testing, and production, ensuring that changes in one project do not affect others. 
- -**Collaboration**: By organizing features within projects, teams can collaborate more effectively, with clear boundaries around the features they are responsible for. - -**Access Control**: Projects can implement permissions, allowing different users or teams to access only the features relevant to their work. \ No newline at end of file diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index 76ba606f64b..4afd0086d9b 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -1,23 +1,6 @@ # Quickstart -## What is Feast? - -Feast (Feature Store) is an open-source feature store designed to facilitate the management and serving of machine learning features in a way that supports both batch and real-time applications. - -For more info refer to [Introduction to feast](../README.md) - -## Prerequisites -* Ensure that you have Python (3.9 or above) installed. -* It is recommended to create and work in a virtual environment: - ```sh - # create & activate a virtual environment - python -m venv venv/ - source venv/bin/activate - ``` - -## Overview - -In this tutorial we will: +In this tutorial we will 1. Deploy a local feature store with a **Parquet file offline store** and **Sqlite online store**. 2. Build a training dataset using our time series features from our **Parquet files**. @@ -26,7 +9,7 @@ In this tutorial we will: 5. Read the latest features from the online store for real-time inference. 6. Explore the (experimental) Feast UI -***Note*** - Feast can used as an executable or as a server, please refer to [feast feature server](../reference/feature-servers/python-feature-server.md) +## Overview In this tutorial, we'll use Feast to generate training data and power online model inference for a ride-sharing driver satisfaction prediction model. Feast solves several common issues in this flow: @@ -296,7 +279,7 @@ There's an included `test_workflow.py` file which runs through a full sample wor 7. 
Verify online features are updated / fresher We'll walk through some snippets of code below and explain -### Step 4: Register feature definitions and deploy your feature store +### Step 3a: Register feature definitions and deploy your feature store The `apply` command scans python files in the current directory for feature view/entity definitions, registers the objects, and deploys infrastructure. In this example, it reads `example_repo.py` and sets up SQLite online store tables. Note that we had specified SQLite as the default online store by @@ -328,7 +311,7 @@ Created sqlite table my_project_driver_hourly_stats {% endtab %} {% endtabs %} -### Step 5: Generating training data or powering batch scoring models +### Step 3b: Generating training data or powering batch scoring models To train a model, we need features and labels. Often, this label data is stored separately (e.g. you have one table storing user survey results and another set of tables with feature values). Feast can help generate the features that map to these labels. @@ -483,7 +466,7 @@ print(training_df.head()) ``` {% endtab %} {% endtabs %} -### Step 6: Ingest batch features into your online store +### Step 3c: Ingest batch features into your online store We now serialize the latest values of features since the beginning of time to prepare for serving (note: `materialize-incremental` serializes all new features since the last `materialize` call). @@ -516,7 +499,7 @@ Materializing 2 feature views to 2024-04-19 10:59:58-04:00 into the sqlite onlin {% endtab %} {% endtabs %} -### Step 7: Fetching feature vectors for inference +### Step 3d: Fetching feature vectors for inference At inference time, we need to quickly read the latest feature values for different drivers (which otherwise might have existed only in batch sources) from the online feature store using `get_online_features()`. 
These feature @@ -561,7 +544,7 @@ pprint(feature_vector) {% endtab %} {% endtabs %} -### Step 8: Using a feature service to fetch online features instead. +### Step 3e: Using a feature service to fetch online features instead. You can also use feature services to manage multiple features, and decouple feature view definitions and the features needed by end applications. The feature store can also be used to fetch either online or historical @@ -611,7 +594,7 @@ pprint(feature_vector) {% endtab %} {% endtabs %} -## Step 9: Browse your features with the Web UI (experimental) +## Step 4: Browse your features with the Web UI (experimental) View all registered features, data sources, entities, and feature services with the Web UI. @@ -643,7 +626,7 @@ INFO: Uvicorn running on http://0.0.0.0:8888 (Press CTRL+C to quit) ![](../reference/ui.png) -## Step 10: Re-examine `test_workflow.py` +## Step 5: Re-examine `test_workflow.py` Take a look at `test_workflow.py` again. It showcases many sample flows on how to interact with Feast. You'll see these show up in the upcoming concepts + architecture + tutorial pages as well. 
diff --git a/sdk/python/feast/templates/athena/.gitignore b/sdk/python/feast/templates/athena/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/athena/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/aws/.gitignore b/sdk/python/feast/templates/aws/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/aws/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/cassandra/.gitignore b/sdk/python/feast/templates/cassandra/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/cassandra/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage 
-nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/gcp/.gitignore b/sdk/python/feast/templates/gcp/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/gcp/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/hazelcast/.gitignore b/sdk/python/feast/templates/hazelcast/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/hazelcast/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/hbase/.gitignore b/sdk/python/feast/templates/hbase/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/hbase/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized 
/ DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/local/.gitignore b/sdk/python/feast/templates/local/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/local/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/minimal/.gitignore b/sdk/python/feast/templates/minimal/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/minimal/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git 
a/sdk/python/feast/templates/postgres/.gitignore b/sdk/python/feast/templates/postgres/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/postgres/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/snowflake/.gitignore b/sdk/python/feast/templates/snowflake/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/snowflake/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage -nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db diff --git a/sdk/python/feast/templates/spark/.gitignore b/sdk/python/feast/templates/spark/.gitignore deleted file mode 100644 index c47cbcd857f..00000000000 --- a/sdk/python/feast/templates/spark/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*.pyo -*.pyd - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -dist/ -build/ - -# Pytest -.cache -*.cover -*.log -.coverage 
-nosetests.xml -coverage.xml -*.hypothesis/ -*.pytest_cache/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IDEs and Editors -.vscode/ -.idea/ -*.swp -*.swo -*.sublime-workspace -*.sublime-project - -# OS generated files -.DS_Store -Thumbs.db From fa995fc7a44d001a35c53ff90f219b8760f0b80b Mon Sep 17 00:00:00 2001 From: Theodor Mihalache Date: Wed, 16 Oct 2024 10:57:28 -0400 Subject: [PATCH 3/3] fix: Fixes to relative path in FileSource Signed-off-by: Theodor Mihalache --- sdk/python/feast/feature_store.py | 4 +- sdk/python/feast/infra/offline_stores/dask.py | 40 +++++++++++++++---- .../feast/infra/offline_stores/duckdb.py | 17 ++++++-- .../feast/infra/offline_stores/file_source.py | 11 ++++- sdk/python/feast/infra/offline_stores/ibis.py | 37 +++++++++++------ sdk/python/feast/repo_config.py | 1 + sdk/python/tests/doctest/test_all.py | 2 +- .../offline_stores/test_offline_store.py | 2 +- sdk/python/tests/unit/test_offline_server.py | 1 + .../tests/utils/auth_permissions_util.py | 1 + 10 files changed, 89 insertions(+), 27 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 6112279a027..f2fa33e53a8 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -706,7 +706,7 @@ def plan( >>> fs = FeatureStore(repo_path="project/feature_repo") >>> driver = Entity(name="driver_id", description="driver id") >>> driver_hourly_stats = FileSource( - ... path="project/feature_repo/data/driver_stats.parquet", + ... path="data/driver_stats.parquet", ... timestamp_field="event_timestamp", ... created_timestamp_column="created", ... ) @@ -820,7 +820,7 @@ def apply( >>> fs = FeatureStore(repo_path="project/feature_repo") >>> driver = Entity(name="driver_id", description="driver id") >>> driver_hourly_stats = FileSource( - ... path="project/feature_repo/data/driver_stats.parquet", + ... path="data/driver_stats.parquet", ... timestamp_field="event_timestamp", ... created_timestamp_column="created", ... 
) diff --git a/sdk/python/feast/infra/offline_stores/dask.py b/sdk/python/feast/infra/offline_stores/dask.py index 52ad88d2997..d26e8609bae 100644 --- a/sdk/python/feast/infra/offline_stores/dask.py +++ b/sdk/python/feast/infra/offline_stores/dask.py @@ -57,6 +57,7 @@ def __init__( self, evaluation_function: Callable, full_feature_names: bool, + repo_path: str, on_demand_feature_views: Optional[List[OnDemandFeatureView]] = None, metadata: Optional[RetrievalMetadata] = None, ): @@ -67,6 +68,7 @@ def __init__( self._full_feature_names = full_feature_names self._on_demand_feature_views = on_demand_feature_views or [] self._metadata = metadata + self.repo_path = repo_path @property def full_feature_names(self) -> bool: @@ -99,8 +101,13 @@ def persist( if not allow_overwrite and os.path.exists(storage.file_options.uri): raise SavedDatasetLocationAlreadyExists(location=storage.file_options.uri) + if not Path(storage.file_options.uri).is_absolute(): + absolute_path = Path(self.repo_path) / storage.file_options.uri + else: + absolute_path = Path(storage.file_options.uri) + filesystem, path = FileSource.create_filesystem_and_path( - storage.file_options.uri, + str(absolute_path), storage.file_options.s3_endpoint_override, ) @@ -243,7 +250,9 @@ def evaluate_historical_retrieval(): all_join_keys = list(set(all_join_keys + join_keys)) - df_to_join = _read_datasource(feature_view.batch_source) + df_to_join = _read_datasource( + feature_view.batch_source, config.repo_path + ) df_to_join, timestamp_field = _field_mapping( df_to_join, @@ -297,6 +306,7 @@ def evaluate_historical_retrieval(): min_event_timestamp=entity_df_event_timestamp_range[0], max_event_timestamp=entity_df_event_timestamp_range[1], ), + repo_path=str(config.repo_path), ) return job @@ -316,7 +326,7 @@ def pull_latest_from_table_or_query( # Create lazy function that is only called from the RetrievalJob object def evaluate_offline_job(): - source_df = _read_datasource(data_source) + source_df = 
_read_datasource(data_source, config.repo_path) source_df = _normalize_timestamp( source_df, timestamp_field, created_timestamp_column @@ -377,6 +387,7 @@ def evaluate_offline_job(): return DaskRetrievalJob( evaluation_function=evaluate_offline_job, full_feature_names=False, + repo_path=str(config.repo_path), ) @staticmethod @@ -420,8 +431,13 @@ def write_logged_features( # Since this code will be mostly used from Go-created thread, it's better to avoid producing new threads data = pyarrow.parquet.read_table(data, use_threads=False, pre_buffer=False) + if config.repo_path is not None and not Path(destination.path).is_absolute(): + absolute_path = config.repo_path / destination.path + else: + absolute_path = Path(destination.path) + filesystem, path = FileSource.create_filesystem_and_path( - destination.path, + str(absolute_path), destination.s3_endpoint_override, ) @@ -456,8 +472,14 @@ def offline_write_batch( ) file_options = feature_view.batch_source.file_options + + if config.repo_path is not None and not Path(file_options.uri).is_absolute(): + absolute_path = config.repo_path / file_options.uri + else: + absolute_path = Path(file_options.uri) + filesystem, path = FileSource.create_filesystem_and_path( - file_options.uri, file_options.s3_endpoint_override + str(absolute_path), file_options.s3_endpoint_override ) prev_table = pyarrow.parquet.read_table( path, filesystem=filesystem, memory_map=True @@ -493,7 +515,7 @@ def _get_entity_df_event_timestamp_range( ) -def _read_datasource(data_source) -> dd.DataFrame: +def _read_datasource(data_source, repo_path) -> dd.DataFrame: storage_options = ( { "client_kwargs": { @@ -504,8 +526,12 @@ def _read_datasource(data_source) -> dd.DataFrame: else None ) + if not Path(data_source.path).is_absolute(): + path = repo_path / data_source.path + else: + path = data_source.path return dd.read_parquet( - data_source.path, + path, storage_options=storage_options, ) diff --git a/sdk/python/feast/infra/offline_stores/duckdb.py 
b/sdk/python/feast/infra/offline_stores/duckdb.py index a639d54add5..e64da029a6a 100644 --- a/sdk/python/feast/infra/offline_stores/duckdb.py +++ b/sdk/python/feast/infra/offline_stores/duckdb.py @@ -27,7 +27,7 @@ from feast.repo_config import FeastConfigBaseModel, RepoConfig -def _read_data_source(data_source: DataSource) -> Table: +def _read_data_source(data_source: DataSource, repo_path: str) -> Table: assert isinstance(data_source, FileSource) if isinstance(data_source.file_format, ParquetFormat): @@ -43,6 +43,7 @@ def _read_data_source(data_source: DataSource) -> Table: def _write_data_source( table: Table, data_source: DataSource, + repo_path: str, mode: str = "append", allow_overwrite: bool = False, ): @@ -50,14 +51,24 @@ def _write_data_source( file_options = data_source.file_options - if mode == "overwrite" and not allow_overwrite and os.path.exists(file_options.uri): + if not Path(file_options.uri).is_absolute(): + absolute_path = Path(repo_path) / file_options.uri + else: + absolute_path = Path(file_options.uri) + + if ( + mode == "overwrite" + and not allow_overwrite + and os.path.exists(str(absolute_path)) + ): raise SavedDatasetLocationAlreadyExists(location=file_options.uri) if isinstance(data_source.file_format, ParquetFormat): if mode == "overwrite": table = table.to_pyarrow() + filesystem, path = FileSource.create_filesystem_and_path( - file_options.uri, + str(absolute_path), file_options.s3_endpoint_override, ) diff --git a/sdk/python/feast/infra/offline_stores/file_source.py b/sdk/python/feast/infra/offline_stores/file_source.py index 3fdc6cba31a..9557b8077d0 100644 --- a/sdk/python/feast/infra/offline_stores/file_source.py +++ b/sdk/python/feast/infra/offline_stores/file_source.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Callable, Dict, Iterable, List, Optional, Tuple import pyarrow @@ -154,8 +155,16 @@ def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: def get_table_column_names_and_types( self, 
config: RepoConfig ) -> Iterable[Tuple[str, str]]: + if ( + config.repo_path is not None + and not Path(self.file_options.uri).is_absolute() + ): + absolute_path = config.repo_path / self.file_options.uri + else: + absolute_path = Path(self.file_options.uri) + filesystem, path = FileSource.create_filesystem_and_path( - self.path, self.file_options.s3_endpoint_override + str(absolute_path), self.file_options.s3_endpoint_override ) # TODO why None check necessary diff --git a/sdk/python/feast/infra/offline_stores/ibis.py b/sdk/python/feast/infra/offline_stores/ibis.py index 61c477baec6..66d00ca6292 100644 --- a/sdk/python/feast/infra/offline_stores/ibis.py +++ b/sdk/python/feast/infra/offline_stores/ibis.py @@ -46,8 +46,8 @@ def pull_latest_from_table_or_query_ibis( created_timestamp_column: Optional[str], start_date: datetime, end_date: datetime, - data_source_reader: Callable[[DataSource], Table], - data_source_writer: Callable[[pyarrow.Table, DataSource], None], + data_source_reader: Callable[[DataSource, str], Table], + data_source_writer: Callable[[pyarrow.Table, DataSource, str], None], staging_location: Optional[str] = None, staging_location_endpoint_override: Optional[str] = None, ) -> RetrievalJob: @@ -57,7 +57,7 @@ def pull_latest_from_table_or_query_ibis( start_date = start_date.astimezone(tz=timezone.utc) end_date = end_date.astimezone(tz=timezone.utc) - table = data_source_reader(data_source) + table = data_source_reader(data_source, str(config.repo_path)) table = table.select(*fields) @@ -87,6 +87,7 @@ def pull_latest_from_table_or_query_ibis( data_source_writer=data_source_writer, staging_location=staging_location, staging_location_endpoint_override=staging_location_endpoint_override, + repo_path=str(config.repo_path), ) @@ -147,8 +148,8 @@ def get_historical_features_ibis( entity_df: Union[pd.DataFrame, str], registry: BaseRegistry, project: str, - data_source_reader: Callable[[DataSource], Table], - data_source_writer: Callable[[pyarrow.Table, 
DataSource], None], + data_source_reader: Callable[[DataSource, str], Table], + data_source_writer: Callable[[pyarrow.Table, DataSource, str], None], full_feature_names: bool = False, staging_location: Optional[str] = None, staging_location_endpoint_override: Optional[str] = None, @@ -174,7 +175,9 @@ def get_historical_features_ibis( def read_fv( feature_view: FeatureView, feature_refs: List[str], full_feature_names: bool ) -> Tuple: - fv_table: Table = data_source_reader(feature_view.batch_source) + fv_table: Table = data_source_reader( + feature_view.batch_source, str(config.repo_path) + ) for old_name, new_name in feature_view.batch_source.field_mapping.items(): if old_name in fv_table.columns: @@ -247,6 +250,7 @@ def read_fv( data_source_writer=data_source_writer, staging_location=staging_location, staging_location_endpoint_override=staging_location_endpoint_override, + repo_path=str(config.repo_path), ) @@ -258,8 +262,8 @@ def pull_all_from_table_or_query_ibis( timestamp_field: str, start_date: datetime, end_date: datetime, - data_source_reader: Callable[[DataSource], Table], - data_source_writer: Callable[[pyarrow.Table, DataSource], None], + data_source_reader: Callable[[DataSource, str], Table], + data_source_writer: Callable[[pyarrow.Table, DataSource, str], None], staging_location: Optional[str] = None, staging_location_endpoint_override: Optional[str] = None, ) -> RetrievalJob: @@ -267,7 +271,7 @@ def pull_all_from_table_or_query_ibis( start_date = start_date.astimezone(tz=timezone.utc) end_date = end_date.astimezone(tz=timezone.utc) - table = data_source_reader(data_source) + table = data_source_reader(data_source, str(config.repo_path)) table = table.select(*fields) @@ -290,6 +294,7 @@ def pull_all_from_table_or_query_ibis( data_source_writer=data_source_writer, staging_location=staging_location, staging_location_endpoint_override=staging_location_endpoint_override, + repo_path=str(config.repo_path), ) @@ -319,7 +324,7 @@ def offline_write_batch_ibis( 
feature_view: FeatureView, table: pyarrow.Table, progress: Optional[Callable[[int], Any]], - data_source_writer: Callable[[pyarrow.Table, DataSource], None], + data_source_writer: Callable[[pyarrow.Table, DataSource, str], None], ): pa_schema, column_names = get_pyarrow_schema_from_batch_source( config, feature_view.batch_source @@ -330,7 +335,9 @@ def offline_write_batch_ibis( f"The schema is expected to be {pa_schema} with the columns (in this exact order) to be {column_names}." ) - data_source_writer(ibis.memtable(table), feature_view.batch_source) + data_source_writer( + ibis.memtable(table), feature_view.batch_source, str(config.repo_path) + ) def deduplicate( @@ -469,6 +476,7 @@ def __init__( data_source_writer, staging_location, staging_location_endpoint_override, + repo_path, ) -> None: super().__init__() self.table = table @@ -480,6 +488,7 @@ def __init__( self.data_source_writer = data_source_writer self.staging_location = staging_location self.staging_location_endpoint_override = staging_location_endpoint_override + self.repo_path = repo_path def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame: return self.table.execute() @@ -502,7 +511,11 @@ def persist( timeout: Optional[int] = None, ): self.data_source_writer( - self.table, storage.to_data_source(), "overwrite", allow_overwrite + self.table, + storage.to_data_source(), + self.repo_path, + "overwrite", + allow_overwrite, ) @property diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index bf0bde6fcbf..845b5505c9f 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -193,6 +193,7 @@ class RepoConfig(FeastBaseModel): """ Flags (deprecated field): Feature flags for experimental features """ repo_path: Optional[Path] = None + """ Repo path: Path that relative FileSource paths are resolved against; it must be set when a FileSource uses a relative path """ entity_key_serialization_version: StrictInt = 1 """ Entity key serialization version: This version is used to control what
serialization scheme is diff --git a/sdk/python/tests/doctest/test_all.py b/sdk/python/tests/doctest/test_all.py index 52348e7da4e..d1b2161252f 100644 --- a/sdk/python/tests/doctest/test_all.py +++ b/sdk/python/tests/doctest/test_all.py @@ -26,7 +26,7 @@ def setup_feature_store(): description="driver id", ) driver_hourly_stats = FileSource( - path="project/feature_repo/data/driver_stats.parquet", + path="data/driver_stats.parquet", timestamp_field="event_timestamp", created_timestamp_column="created", ) diff --git a/sdk/python/tests/unit/infra/offline_stores/test_offline_store.py b/sdk/python/tests/unit/infra/offline_stores/test_offline_store.py index 6d5eeb90c71..afc0e4e5c8f 100644 --- a/sdk/python/tests/unit/infra/offline_stores/test_offline_store.py +++ b/sdk/python/tests/unit/infra/offline_stores/test_offline_store.py @@ -109,7 +109,7 @@ def metadata(self) -> Optional[RetrievalMetadata]: ) def retrieval_job(request, environment): if request.param is DaskRetrievalJob: - return DaskRetrievalJob(lambda: 1, full_feature_names=False) + return DaskRetrievalJob(lambda: 1, full_feature_names=False, repo_path="") elif request.param is RedshiftRetrievalJob: offline_store_config = RedshiftOfflineStoreConfig( cluster_id="feast-int-bucket", diff --git a/sdk/python/tests/unit/test_offline_server.py b/sdk/python/tests/unit/test_offline_server.py index 7c38d9bfca4..e82e2fa6872 100644 --- a/sdk/python/tests/unit/test_offline_server.py +++ b/sdk/python/tests/unit/test_offline_server.py @@ -95,6 +95,7 @@ def remote_feature_store(offline_server): provider="local", offline_store=offline_config, entity_key_serialization_version=2, + # TODO(review): leftover placeholder; presumably repo_path=... was intended here - confirm ) ) return store diff --git a/sdk/python/tests/utils/auth_permissions_util.py b/sdk/python/tests/utils/auth_permissions_util.py index 3b5e589812a..b8ca7355e98 100644 --- a/sdk/python/tests/utils/auth_permissions_util.py +++ b/sdk/python/tests/utils/auth_permissions_util.py @@ -119,6 +119,7 @@ def get_remote_registry_store(server_port,
feature_store): registry=registry_config, provider="local", entity_key_serialization_version=2, + repo_path=feature_store.repo_path, ) ) return store