diff --git a/Makefile b/Makefile index c33685ef2cb..e80dec734ba 100644 --- a/Makefile +++ b/Makefile @@ -246,7 +246,28 @@ test-python-universal-postgres-offline: not gcs_registry and \ not s3_registry and \ not test_snowflake and \ - not test_universal_types" \ + not test_spark" \ + sdk/python/tests + + test-python-universal-clickhouse-offline: + PYTHONPATH='.' \ + FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.offline_stores.contrib.clickhouse_repo_configuration \ + PYTEST_PLUGINS=sdk.python.feast.infra.offline_stores.contrib.clickhouse_offline_store.tests \ + python -m pytest -v -n 8 --integration \ + -k "not test_historical_retrieval_with_validation and \ + not test_historical_features_persisting and \ + not test_universal_cli and \ + not test_go_feature_server and \ + not test_feature_logging and \ + not test_reorder_columns and \ + not test_logged_features_validation and \ + not test_lambda_materialization_consistency and \ + not test_offline_write and \ + not test_push_features_to_offline_store and \ + not gcs_registry and \ + not s3_registry and \ + not test_snowflake and \ + not test_spark" \ sdk/python/tests test-python-universal-postgres-online: diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 8db4143697e..a8d3c7d630a 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -101,6 +101,7 @@ * [PostgreSQL (contrib)](reference/offline-stores/postgres.md) * [Trino (contrib)](reference/offline-stores/trino.md) * [Azure Synapse + Azure SQL (contrib)](reference/offline-stores/mssql.md) + * [Clickhouse (contrib)](reference/offline-stores/clickhouse.md) * [Remote Offline](reference/offline-stores/remote-offline-store.md) * [Online stores](reference/online-stores/README.md) * [Overview](reference/online-stores/overview.md) diff --git a/docs/reference/data-sources/README.md b/docs/reference/data-sources/README.md index 09df6b861e8..151a948d0af 100644 --- a/docs/reference/data-sources/README.md +++ b/docs/reference/data-sources/README.md @@ -53,3 +53,7 @@ Please see [Data Source](../../getting-started/concepts/data-ingestion.md) for a {% content-ref url="mssql.md" %} [mssql.md](mssql.md) {% endcontent-ref %} + +{% content-ref url="clickhouse.md" %} +[clickhouse.md](clickhouse.md) +{% endcontent-ref %} diff --git a/docs/reference/data-sources/clickhouse.md b/docs/reference/data-sources/clickhouse.md new file mode 100644 index 00000000000..7630d5dd14a --- /dev/null +++ b/docs/reference/data-sources/clickhouse.md @@ -0,0 +1,36 @@ +# Clickhouse source (contrib) + +## Description + +Clickhouse data sources are Clickhouse tables or views. +These can be specified either by a table reference or a SQL query. + +## Disclaimer + +The Clickhouse data source does not achieve full test coverage. +Please do not assume complete stability. + +## Examples + +Defining a Clickhouse source: + +```python +from feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse_source import ( + ClickhouseSource, +) + +driver_stats_source = ClickhouseSource( + name="feast_driver_hourly_stats", + query="SELECT * FROM feast_driver_hourly_stats", + timestamp_field="event_timestamp", + created_timestamp_column="created", +) +``` + +The full set of configuration options is available [here](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse_source.ClickhouseSource). + +## Supported Types + +Clickhouse data sources support all eight primitive types and their corresponding array types. 
+The Clickhouse Decimal type is supported by converting it to double.
+For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix).
diff --git a/docs/reference/offline-stores/clickhouse.md b/docs/reference/offline-stores/clickhouse.md
new file mode 100644
index 00000000000..317d6e23e1e
--- /dev/null
+++ b/docs/reference/offline-stores/clickhouse.md
@@ -0,0 +1,69 @@
+# Clickhouse offline store (contrib)
+
+## Description
+
+The Clickhouse offline store provides support for reading [ClickhouseSource](../data-sources/clickhouse.md).
+* Entity dataframes can be provided as a SQL query or as a Pandas dataframe. A Pandas dataframe will be uploaded to Clickhouse as a table (a temporary table by default) in order to complete join operations.
+
+## Disclaimer
+
+The Clickhouse offline store does not achieve full test coverage.
+Please do not assume complete stability.
+
+## Getting started
+In order to use this offline store, you'll need to run `pip install 'feast[clickhouse]'`.
+
+## Example
+
+{% code title="feature_store.yaml" %}
+```yaml
+project: my_project
+registry: data/registry.db
+provider: local
+offline_store:
+  type: feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse.ClickhouseOfflineStore
+  host: DB_HOST
+  port: DB_PORT
+  database: DB_NAME
+  user: DB_USERNAME
+  password: DB_PASSWORD
+  use_temporary_tables_for_entity_df: true
+online_store:
+  path: data/online_store.db
+```
+{% endcode %}
+
+Note that `use_temporary_tables_for_entity_df` is an optional parameter.
+The full set of configuration options is available in [ClickhouseOfflineStoreConfig](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse.ClickhouseOfflineStore).
+
+## Functionality Matrix
+
+The set of functionality supported by offline stores is described in detail [here](overview.md#functionality).
+Below is a matrix indicating which functionality is supported by the Clickhouse offline store.
+
+|                                                                     | Clickhouse |
+| :------------------------------------------------------------------ |:-----------|
+| `get_historical_features` (point-in-time correct join)             | yes        |
+| `pull_latest_from_table_or_query` (retrieve latest feature values) | yes        |
+| `pull_all_from_table_or_query` (retrieve a saved dataset)          | no         |
+| `offline_write_batch` (persist dataframes to offline store)        | no         |
+| `write_logged_features` (persist logged features to offline store) | no         |
+
+Below is a matrix indicating which functionality is supported by `ClickhouseRetrievalJob`.
+
+|                                                        | Clickhouse |
+| ------------------------------------------------------ |------------|
+| export to dataframe                                    | yes        |
+| export to arrow table                                  | yes        |
+| export to arrow batches                                | no         |
+| export to SQL                                          | yes        |
+| export to data lake (S3, GCS, etc.)                    | yes        |
+| export to data warehouse                               | yes        |
+| export as Spark dataframe                              | no         |
+| local execution of Python-based on-demand transforms   | yes        |
+| remote execution of Python-based on-demand transforms  | no         |
+| persist results in the offline store                   | yes        |
+| preview the query plan before execution                | yes        |
+| read partitioned data                                  | yes        |
+
+To compare this set of functionality against other offline stores, please see the full [functionality matrix](overview.md#functionality-matrix).
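A minimal usage sketch of the documentation above, for orientation only: it assumes a feature repository whose `feature_store.yaml` matches the Clickhouse configuration shown earlier and an already-applied `driver_hourly_stats` feature view keyed on `driver_id`. Those names, the features, and the entity rows are illustrative assumptions, not part of this change.

```python
from datetime import datetime, timezone

import pandas as pd

from feast import FeatureStore

# Assumes feature_store.yaml in the current directory points at the
# Clickhouse offline store, and that a "driver_hourly_stats" feature view
# keyed on "driver_id" has been applied (hypothetical names).
store = FeatureStore(repo_path=".")

entity_df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "event_timestamp": [
            datetime(2025, 1, 1, tzinfo=timezone.utc),
            datetime(2025, 1, 2, tzinfo=timezone.utc),
        ],
    }
)

# The entity dataframe is uploaded to Clickhouse (a temporary table by
# default) and joined point-in-time against each feature view's source.
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
    ],
).to_df()

print(training_df.head())
```

A SQL string can be passed as `entity_df` instead of a dataframe; in that case the store materializes the query result into a (temporary) table on the Clickhouse side before performing the join.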
diff --git a/pyproject.toml b/pyproject.toml index e6cad61b690..f24235176b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ azure = [ "pymssql" ] cassandra = ["cassandra-driver>=3.24.0,<4"] +clickhouse = ["clickhouse-connect>=0.7.19"] couchbase = ["couchbase==4.3.2", "couchbase-columnar==1.0.0"] delta = ["deltalake"] docling = ["docling>=2.23.0"] @@ -95,7 +96,7 @@ opentelemetry = ["prometheus_client", "psutil"] spark = ["pyspark>=3.0.0,<4"] trino = ["trino>=0.305.0,<0.400.0", "regex"] postgres = ["psycopg[binary,pool]>=3.0.0,<4"] -pytorch = ["torch>=2.2.2", "torchvision>=0.17.2"] +pytorch = ["torch==2.2.2", "torchvision>=0.17.2"] qdrant = ["qdrant-client>=1.12.0"] redis = [ "redis>=4.2.2,<5", @@ -150,7 +151,7 @@ ci = [ "types-setuptools", "types-tabulate", "virtualenv<20.24.2", - "feast[aws, azure, cassandra, couchbase, delta, docling, duckdb, elasticsearch, faiss, gcp, ge, go, grpcio, hazelcast, hbase, ibis, ikv, k8s, milvus, mssql, mysql, opentelemetry, spark, trino, postgres, pytorch, qdrant, redis, singlestore, snowflake, sqlite_vec]" + "feast[aws, azure, cassandra, clickhouse, couchbase, delta, docling, duckdb, elasticsearch, faiss, gcp, ge, go, grpcio, hazelcast, hbase, ibis, ikv, k8s, milvus, mssql, mysql, opentelemetry, spark, trino, postgres, pytorch, qdrant, redis, singlestore, snowflake, sqlite_vec]" ] nlp = ["feast[docling, milvus, pytorch]"] dev = ["feast[ci]"] diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.clickhouse_offline_store.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.clickhouse_offline_store.rst new file mode 100644 index 00000000000..b593da3115b --- /dev/null +++ b/sdk/python/docs/source/feast.infra.offline_stores.contrib.clickhouse_offline_store.rst @@ -0,0 +1,37 @@ +feast.infra.offline\_stores.contrib.clickhouse\_offline\_store package +====================================================================== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + feast.infra.offline_stores.contrib.clickhouse_offline_store.tests + +Submodules +---------- + +feast.infra.offline\_stores.contrib.clickhouse\_offline\_store.clickhouse module +-------------------------------------------------------------------------------- + +.. automodule:: feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse + :members: + :undoc-members: + :show-inheritance: + +feast.infra.offline\_stores.contrib.clickhouse\_offline\_store.clickhouse\_source module +---------------------------------------------------------------------------------------- + +.. automodule:: feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse_source + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: feast.infra.offline_stores.contrib.clickhouse_offline_store + :members: + :undoc-members: + :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.clickhouse_offline_store.tests.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.clickhouse_offline_store.tests.rst new file mode 100644 index 00000000000..0b1265e438c --- /dev/null +++ b/sdk/python/docs/source/feast.infra.offline_stores.contrib.clickhouse_offline_store.tests.rst @@ -0,0 +1,21 @@ +feast.infra.offline\_stores.contrib.clickhouse\_offline\_store.tests package +============================================================================ + +Submodules +---------- + +feast.infra.offline\_stores.contrib.clickhouse\_offline\_store.tests.data\_source module +---------------------------------------------------------------------------------------- + +.. automodule:: feast.infra.offline_stores.contrib.clickhouse_offline_store.tests.data_source + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: feast.infra.offline_stores.contrib.clickhouse_offline_store.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.offline_stores.contrib.rst b/sdk/python/docs/source/feast.infra.offline_stores.contrib.rst index 61e797bd6a9..e931c8ed744 100644 --- a/sdk/python/docs/source/feast.infra.offline_stores.contrib.rst +++ b/sdk/python/docs/source/feast.infra.offline_stores.contrib.rst @@ -9,6 +9,7 @@ Subpackages feast.infra.offline_stores.contrib.athena_offline_store feast.infra.offline_stores.contrib.couchbase_offline_store + feast.infra.offline_stores.contrib.clickhouse_offline_store feast.infra.offline_stores.contrib.mssql_offline_store feast.infra.offline_stores.contrib.postgres_offline_store feast.infra.offline_stores.contrib.spark_offline_store @@ -33,6 +34,14 @@ feast.infra.offline\_stores.contrib.couchbase\_columnar\_repo\_configuration mod :undoc-members: :show-inheritance: +feast.infra.offline\_stores.contrib.clickhouse\_repo\_configuration module +-------------------------------------------------------------------------- + +.. automodule:: feast.infra.offline_stores.contrib.clickhouse_repo_configuration + :members: + :undoc-members: + :show-inheritance: + feast.infra.offline\_stores.contrib.mssql\_repo\_configuration module --------------------------------------------------------------------- diff --git a/sdk/python/docs/source/feast.infra.utils.clickhouse.rst b/sdk/python/docs/source/feast.infra.utils.clickhouse.rst new file mode 100644 index 00000000000..4fa3e2a0cbd --- /dev/null +++ b/sdk/python/docs/source/feast.infra.utils.clickhouse.rst @@ -0,0 +1,29 @@ +feast.infra.utils.clickhouse package +==================================== + +Submodules +---------- + +feast.infra.utils.clickhouse.clickhouse\_config module +------------------------------------------------------ + +.. automodule:: feast.infra.utils.clickhouse.clickhouse_config + :members: + :undoc-members: + :show-inheritance: + +feast.infra.utils.clickhouse.connection\_utils module +----------------------------------------------------- + +.. automodule:: feast.infra.utils.clickhouse.connection_utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: feast.infra.utils.clickhouse + :members: + :undoc-members: + :show-inheritance: diff --git a/sdk/python/docs/source/feast.infra.utils.rst b/sdk/python/docs/source/feast.infra.utils.rst index cfa82dc5fd2..2583364acdc 100644 --- a/sdk/python/docs/source/feast.infra.utils.rst +++ b/sdk/python/docs/source/feast.infra.utils.rst @@ -8,6 +8,7 @@ Subpackages :maxdepth: 4 feast.infra.utils.couchbase + feast.infra.utils.clickhouse feast.infra.utils.postgres feast.infra.utils.snowflake diff --git a/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/clickhouse.py b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/clickhouse.py new file mode 100644 index 00000000000..dad5b470cd8 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/clickhouse.py @@ -0,0 +1,574 @@ +import contextlib +import re +from dataclasses import asdict +from datetime import datetime +from typing import Iterator, List, Literal, Optional, Union, cast + +import numpy as np +import pandas as pd +import pyarrow as pa +from pyarrow.compute import cast as pa_cast + +from feast import FeatureView, OnDemandFeatureView, RepoConfig +from feast.data_source import DataSource +from feast.errors import EntitySQLEmptyResults, InvalidEntityType +from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL +from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse_source import ( + ClickhouseSource, + SavedDatasetClickhouseStorage, +) +from feast.infra.offline_stores.contrib.postgres_offline_store.postgres import ( + PostgreSQLRetrievalJob, + build_point_in_time_query, +) +from feast.infra.offline_stores.offline_store import ( + OfflineStore, + RetrievalJob, + RetrievalMetadata, +) +from feast.infra.registry.base_registry import BaseRegistry +from feast.infra.utils.clickhouse.clickhouse_config import ClickhouseConfig +from feast.infra.utils.clickhouse.connection_utils import get_client +from feast.saved_dataset import SavedDatasetStorage + + +class ClickhouseOfflineStoreConfig(ClickhouseConfig): + type: Literal["clickhouse"] = "clickhouse" + + +class ClickhouseOfflineStore(OfflineStore): + @staticmethod + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pd.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + assert isinstance(config.offline_store, ClickhouseOfflineStoreConfig) + for fv in feature_views: + assert isinstance(fv.batch_source, ClickhouseSource) + + entity_schema = _get_entity_schema(entity_df, config) + + entity_df_event_timestamp_col = ( + offline_utils.infer_event_timestamp_from_entity_df(entity_schema) + ) + + entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df, + entity_df_event_timestamp_col, + config, + ) + + @contextlib.contextmanager + def query_generator() -> Iterator[str]: + table_name = offline_utils.get_temp_entity_table_name() + if ( + isinstance(entity_df, pd.DataFrame) + and not config.offline_store.use_temporary_tables_for_entity_df + ): + table_name = f"{config.offline_store.database}.{table_name}" + + _upload_entity_df( + config, + entity_df, + 
table_name, + entity_df_event_timestamp_col, + ) + + expected_join_keys = offline_utils.get_expected_join_keys( + project, feature_views, registry + ) + + offline_utils.assert_expected_columns_in_entity_df( + entity_schema, expected_join_keys, entity_df_event_timestamp_col + ) + + query_context = offline_utils.get_feature_view_query_context( + feature_refs, + feature_views, + registry, + project, + entity_df_event_timestamp_range, + ) + + query_context_dict = [asdict(context) for context in query_context] + # Hack for query_context.entity_selections to support uppercase in columns + for context in query_context_dict: + context["entity_selections"] = [ + f""""{entity_selection.replace(" AS ", '" AS "')}\"""" + for entity_selection in context["entity_selections"] + ] + + try: + query = build_point_in_time_query( + query_context_dict, + left_table_query_string=table_name, + entity_df_event_timestamp_col=entity_df_event_timestamp_col, + entity_df_columns=entity_schema.keys(), + query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, + full_feature_names=full_feature_names, + ) + yield query + finally: + if ( + table_name + and not config.offline_store.use_temporary_tables_for_entity_df + ): + get_client(config.offline_store).command( + f"DROP TABLE IF EXISTS {table_name}" + ) + + return ClickhouseRetrievalJob( + query=query_generator, + config=config, + full_feature_names=full_feature_names, + on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs( + feature_refs, project, registry + ), + metadata=RetrievalMetadata( + features=feature_refs, + keys=list(entity_schema.keys() - {entity_df_event_timestamp_col}), + min_event_timestamp=entity_df_event_timestamp_range[0], + max_event_timestamp=entity_df_event_timestamp_range[1], + ), + ) + + @staticmethod + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + assert isinstance(config.offline_store, ClickhouseOfflineStoreConfig) + assert isinstance(data_source, ClickhouseSource) + from_expression = data_source.get_table_query_string() + + partition_by_join_key_string = ", ".join(_append_alias(join_key_columns, "a")) + if partition_by_join_key_string != "": + partition_by_join_key_string = ( + "PARTITION BY " + partition_by_join_key_string + ) + timestamps = [timestamp_field] + if created_timestamp_column: + timestamps.append(created_timestamp_column) + timestamp_desc_string = " DESC, ".join(_append_alias(timestamps, "a")) + " DESC" + a_field_string = ", ".join( + _append_alias(join_key_columns + feature_name_columns + timestamps, "a") + ) + b_field_string = ", ".join( + _append_alias(join_key_columns + feature_name_columns + timestamps, "b") + ) + + query = f""" + SELECT + {b_field_string} + {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""} + FROM ( + SELECT {a_field_string}, + ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row + FROM ({from_expression}) a + WHERE a."{timestamp_field}" + BETWEEN toDateTime64('{start_date.replace(tzinfo=None)!s}', 6, '{start_date.tzinfo!s}') + AND toDateTime64('{end_date.replace(tzinfo=None)!s}', 6, '{end_date.tzinfo!s}') + ) b + WHERE _feast_row = 1 + """ + + return ClickhouseRetrievalJob( + query=query, + config=config, + full_feature_names=False, + on_demand_feature_views=None, + ) + + +class 
ClickhouseRetrievalJob(PostgreSQLRetrievalJob): + def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame: + with self._query_generator() as query: + results = get_client(self.config.offline_store).query_df(query) + return results + + def _to_arrow_internal(self, timeout: Optional[int] = None) -> pa.Table: + with self._query_generator() as query: + results: pa.Table = get_client(self.config.offline_store).query_arrow(query) + # Feast doesn't support native decimal types, so we must convert decimal columns to double + for col_index, (name, dtype) in enumerate( + zip(results.schema.names, results.schema.types) + ): + if pa.types.is_decimal(dtype): + results = results.set_column( + col_index, + name, + pa_cast(results[name], target_type=pa.float64()), + ) + return results + + def persist( + self, + storage: SavedDatasetStorage, + allow_overwrite: Optional[bool] = False, + timeout: Optional[int] = None, + ): + assert isinstance(storage, SavedDatasetClickhouseStorage) + + df_to_clickhouse_table( + config=self.config.offline_store, + df=self.to_df(), + table_name=storage.clickhouse_options._table, + entity_timestamp_col="event_timestamp", + ) + + +def _get_entity_schema( + entity_df: Union[pd.DataFrame, str], + config: RepoConfig, +) -> dict[str, np.dtype]: + if isinstance(entity_df, pd.DataFrame): + return dict(zip(entity_df.columns, entity_df.dtypes)) + elif isinstance(entity_df, str): + query = f"SELECT * FROM ({entity_df}) LIMIT 1" + df = get_client(config.offline_store).query_df(query) + return _get_entity_schema(df, config) + else: + raise InvalidEntityType(type(entity_df)) + + +def _get_entity_df_event_timestamp_range( + entity_df: Union[pd.DataFrame, str], + entity_df_event_timestamp_col: str, + config: RepoConfig, +) -> tuple[datetime, datetime]: + if isinstance(entity_df, pd.DataFrame): + entity_df_event_timestamp = entity_df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pd.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pd.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), determine range + # from table + query = f'SELECT MIN("{entity_df_event_timestamp_col}") AS "min_value", MAX("{entity_df_event_timestamp_col}") AS "max_value" FROM ({entity_df})' + results = get_client(config.offline_store).query(query).result_rows + + entity_df_event_timestamp_range = cast(tuple[datetime, datetime], results[0]) + if ( + entity_df_event_timestamp_range[0] is None + or entity_df_event_timestamp_range[1] is None + ): + raise EntitySQLEmptyResults(entity_df) + else: + raise InvalidEntityType(type(entity_df)) + + return entity_df_event_timestamp_range + + +def _upload_entity_df( + config: RepoConfig, + entity_df: Union[pd.DataFrame, str], + table_name: str, + entity_timestamp_col: str, +) -> None: + if isinstance(entity_df, pd.DataFrame): + df_to_clickhouse_table( + config.offline_store, entity_df, table_name, entity_timestamp_col + ) + elif isinstance(entity_df, str): + if config.offline_store.use_temporary_tables_for_entity_df: + query = f'CREATE TEMPORARY TABLE "{table_name}" AS ({entity_df})' + else: + query = f'CREATE TABLE "{table_name}" ENGINE = MergeTree() ORDER BY ({entity_timestamp_col}) AS ({entity_df})' + get_client(config.offline_store).command(query) + else: + raise 
InvalidEntityType(type(entity_df))
+
+
+def df_to_clickhouse_table(
+    config: ClickhouseConfig,
+    df: pd.DataFrame,
+    table_name: str,
+    entity_timestamp_col: str,
+) -> None:
+    table_schema = _df_to_create_table_schema(df)
+    if config.use_temporary_tables_for_entity_df:
+        query = f"CREATE TEMPORARY TABLE {table_name} ({table_schema})"
+    else:
+        query = f"""
+            CREATE TABLE {table_name} (
+                {table_schema}
+            )
+            ENGINE = MergeTree()
+            ORDER BY ({entity_timestamp_col})
+        """
+    get_client(config).command(query)
+    get_client(config).insert_df(table_name, df)
+
+
+def _df_to_create_table_schema(entity_df: pd.DataFrame) -> str:
+    pa_table = pa.Table.from_pandas(entity_df)
+    columns = [
+        f""""{f.name}" {arrow_to_ch_type(str(f.type), f.nullable)}"""
+        for f in pa_table.schema
+    ]
+    return ", ".join(columns)
+
+
+def arrow_to_ch_type(t_str: str, nullable: bool) -> str:
+    list_pattern = r"list<item: (.+)>"
+    list_res = re.search(list_pattern, t_str)
+    if list_res is not None:
+        item_type_str = list_res.group(1)
+        return f"Array({arrow_to_ch_type(item_type_str, nullable)})"
+
+    if nullable:
+        return f"Nullable({arrow_to_ch_type(t_str, nullable=False)})"
+
+    try:
+        if t_str.startswith("timestamp"):
+            return _arrow_to_ch_timestamp_type(t_str)
+        return {
+            "bool": "Boolean",
+            "int8": "Int8",
+            "int16": "Int16",
+            "int32": "Int32",
+            "int64": "Int64",
+            "uint8": "UInt8",
+            "uint16": "UInt16",
+            "uint32": "UInt32",
+            "uint64": "UInt64",
+            "float": "Float32",
+            "double": "Float64",
+            "string": "String",
+        }[t_str]
+    except KeyError:
+        raise ValueError(f"Unsupported type: {t_str}")
+
+
+def _arrow_to_ch_timestamp_type(t_str: str) -> str:
+    _ARROW_PRECISION_TO_CH_PRECISION = {
+        "s": 0,
+        "ms": 3,
+        "us": 6,
+        "ns": 9,
+    }
+
+    unit, *rest = t_str.removeprefix("timestamp[").removesuffix("]").split(",")
+
+    unit = unit.strip()
+    precision = _ARROW_PRECISION_TO_CH_PRECISION[unit]
+
+    if len(rest):
+        tz = rest[0]
+        tz = (
+            tz.strip()
+            .removeprefix("tz=")
+            .translate(
+                str.maketrans(
+                    {  # type: ignore[arg-type]
+                        "'": None,
+                        '"': None,
+                    }
+                )
+            )
+        )
+    else:
+        tz = None
+
+    if precision > 0:
+        if tz is not None:
+            return f"DateTime64({precision}, '{tz}')"
+        else:
+            return f"DateTime64({precision})"
+    else:
+        if tz is not None:
+            return f"DateTime('{tz}')"
+        else:
+            return "DateTime"
+
+
+def _append_alias(field_names: List[str], alias: str) -> List[str]:
+    return [f'{alias}."{field_name}"' for field_name in field_names]
+
+
+MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """
+/*
+ Compute a deterministic hash for the `left_table_query_string` that will be used throughout
+ all the logic as the field to GROUP BY the data
+*/
+WITH entity_dataframe AS (
+    SELECT *,
+        {{entity_df_event_timestamp_col}} AS entity_timestamp
+        {% for featureview in featureviews %}
+            {% if featureview.entities %}
+            ,(
+                {% for entity in featureview.entities %}
+                CAST("{{entity}}" as VARCHAR) ||
+                {% endfor %}
+                CAST("{{entity_df_event_timestamp_col}}" AS VARCHAR)
+            ) AS "{{featureview.name}}__entity_row_unique_id"
+            {% else %}
+            ,CAST("{{entity_df_event_timestamp_col}}" AS VARCHAR) AS "{{featureview.name}}__entity_row_unique_id"
+            {% endif %}
+        {% endfor %}
+    FROM {{ left_table_query_string }}
+),
+
+{% for featureview in featureviews %}
+
+"{{ featureview.name }}__entity_dataframe" AS (
+    SELECT
+        {% if featureview.entities %}"{{ featureview.entities | join('", "') }}",{% endif %}
+        entity_timestamp,
+        "{{featureview.name}}__entity_row_unique_id"
+    FROM entity_dataframe
+    GROUP BY
+        {% if featureview.entities %}"{{ featureview.entities | join('", "')}}",{% endif %}
+        entity_timestamp,
+        "{{featureview.name}}__entity_row_unique_id"
+),
+
+/*
+ This query template performs the point-in-time correctness join for a single feature set table
+ to the provided entity table.
+
+ 1. We first join the current feature_view to the entity dataframe that has been passed.
+ This JOIN has the following logic:
+    - For each row of the entity dataframe, only keep the rows where the `timestamp_field`
+    is less than the one provided in the entity dataframe
+    - If there is a TTL for the current feature_view, also keep the rows where the `timestamp_field`
+    is higher than the one provided minus the TTL
+    - For each row, join on the entity key and retrieve the `entity_row_unique_id` that has been
+    computed previously
+
+ The output of this CTE will contain all the necessary information and will already have filtered out
+ most of the data that is not relevant.
+*/
+
+"{{ featureview.name }}__subquery" AS (
+    SELECT
+        "{{ featureview.timestamp_field }}" as event_timestamp,
+        {{ '"' ~ featureview.created_timestamp_column ~ '" as created_timestamp,' if featureview.created_timestamp_column else '' }}
+        {{ featureview.entity_selections | join(', ')}}{% if featureview.entity_selections %},{% else %}{% endif %}
+        {% for feature in featureview.features %}
+            "{{ feature }}" as {% if full_feature_names %}"{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}"{% else %}"{{ featureview.field_mapping.get(feature, feature) }}"{% endif %}{% if loop.last %}{% else %}, {% endif %}
+        {% endfor %}
+    FROM {{ featureview.table_subquery }} AS sub
+    WHERE "{{ featureview.timestamp_field }}" <= (SELECT MAX(entity_timestamp) FROM entity_dataframe)
+    {% if featureview.ttl == 0 %}{% else %}
+    AND "{{ featureview.timestamp_field }}" >= (SELECT MIN(entity_timestamp) FROM entity_dataframe) - interval {{ featureview.ttl }} second
+    {% endif %}
+),
+
+"{{ featureview.name }}__base" AS (
+    SELECT
+        subquery.*,
+        entity_dataframe.entity_timestamp,
+        entity_dataframe."{{featureview.name}}__entity_row_unique_id"
+    FROM "{{ featureview.name }}__subquery" AS subquery
+    INNER JOIN "{{ featureview.name }}__entity_dataframe" AS entity_dataframe
+    ON TRUE
+        {% for entity in featureview.entities %}
+        AND subquery."{{ entity }}" = entity_dataframe."{{ entity }}"
+        {% endfor %}
+    WHERE TRUE
+        AND subquery.event_timestamp <= entity_dataframe.entity_timestamp
+
+        {% if featureview.ttl == 0 %}{% else %}
+        AND subquery.event_timestamp >= entity_dataframe.entity_timestamp - interval {{ featureview.ttl }} second
+        {% endif %}
+),
+
+/*
+ 2. If the `created_timestamp_column` has been set, we need to
+ deduplicate the data first. This is done by calculating the
+ `MAX(created_at_timestamp)` for each event_timestamp.
+ We then join the data on the next CTE
+*/
+{% if featureview.created_timestamp_column %}
+"{{ featureview.name }}__dedup" AS (
+    SELECT
+        "{{featureview.name}}__entity_row_unique_id",
+        event_timestamp,
+        MAX(created_timestamp) as created_timestamp
+    FROM "{{ featureview.name }}__base"
+    GROUP BY "{{featureview.name}}__entity_row_unique_id", event_timestamp
+),
+{% endif %}
+
+/*
+ 3. The data has been filtered during the first CTE "*__base"
+ Thus we only need to compute the latest timestamp of each feature.
+*/ +"{{ featureview.name }}__latest" AS ( + SELECT + event_timestamp, + {% if featureview.created_timestamp_column %}created_timestamp,{% endif %} + "{{featureview.name}}__entity_row_unique_id" + FROM + ( + SELECT *, + ROW_NUMBER() OVER( + PARTITION BY "{{featureview.name}}__entity_row_unique_id" + ORDER BY event_timestamp DESC{% if featureview.created_timestamp_column %},created_timestamp DESC{% endif %} + ) AS row_number + FROM "{{ featureview.name }}__base" + {% if featureview.created_timestamp_column %} + INNER JOIN "{{ featureview.name }}__dedup" + USING ("{{featureview.name}}__entity_row_unique_id", event_timestamp, created_timestamp) + {% endif %} + ) AS sub + WHERE row_number = 1 +), + +/* + 4. Once we know the latest value of each feature for a given timestamp, + we can join again the data back to the original "base" dataset +*/ +"{{ featureview.name }}__cleaned" AS ( + SELECT base.* + FROM "{{ featureview.name }}__base" as base + INNER JOIN "{{ featureview.name }}__latest" + USING( + "{{featureview.name}}__entity_row_unique_id", + event_timestamp + {% if featureview.created_timestamp_column %} + ,created_timestamp + {% endif %} + ) +){% if loop.last %}{% else %}, {% endif %} + + +{% endfor %} +/* + Joins the outputs of multiple time travel joins to a single table. + The entity_dataframe dataset being our source of truth here. + */ + +SELECT "{{ final_output_feature_names | join('", "')}}" +FROM entity_dataframe +{% for featureview in featureviews %} +LEFT JOIN ( + SELECT + "{{featureview.name}}__entity_row_unique_id" + {% for feature in featureview.features %} + ,"{% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}" + {% endfor %} + FROM "{{ featureview.name }}__cleaned" +) AS "{{featureview.name}}" USING ("{{featureview.name}}__entity_row_unique_id") +{% endfor %} +""" diff --git a/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/clickhouse_source.py b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/clickhouse_source.py new file mode 100644 index 00000000000..6ed86e1c37c --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/clickhouse_source.py @@ -0,0 +1,211 @@ +import json +from typing import Any, Callable, Iterable, Optional, Tuple, Type + +from clickhouse_connect.datatypes.base import ClickHouseType +from clickhouse_connect.datatypes.container import Array +from clickhouse_connect.datatypes.numeric import ( + Boolean, + Decimal, + Float32, + Float64, + Int32, + Int64, +) +from clickhouse_connect.datatypes.registry import get_from_name +from clickhouse_connect.datatypes.string import String +from clickhouse_connect.datatypes.temporal import DateTime, DateTime64 + +from feast import RepoConfig, ValueType +from feast.data_source import DataSource +from feast.errors import DataSourceNoNameException +from feast.infra.utils.clickhouse.connection_utils import get_client +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.core.SavedDataset_pb2 import ( + SavedDatasetStorage as SavedDatasetStorageProto, +) +from feast.saved_dataset import SavedDatasetStorage + + +class ClickhouseOptions: + def __init__( + self, + name: Optional[str], + query: Optional[str], + table: Optional[str], + ): + self._name = name or "" + self._query = query or "" + self._table = table or "" + + @classmethod + def from_proto(cls, 
clickhouse_options_proto: DataSourceProto.CustomSourceOptions): + config = json.loads(clickhouse_options_proto.configuration.decode("utf8")) + postgres_options = cls( + name=config["name"], query=config["query"], table=config["table"] + ) + + return postgres_options + + def to_proto(self) -> DataSourceProto.CustomSourceOptions: + clickhouse_options_proto = DataSourceProto.CustomSourceOptions( + configuration=json.dumps( + {"name": self._name, "query": self._query, "table": self._table} + ).encode() + ) + return clickhouse_options_proto + + +class ClickhouseSource(DataSource): + def __init__( + self, + name: Optional[str] = None, + query: Optional[str] = None, + table: Optional[str] = None, + timestamp_field: Optional[str] = "", + created_timestamp_column: Optional[str] = "", + field_mapping: Optional[dict[str, str]] = None, + description: Optional[str] = "", + tags: Optional[dict[str, str]] = None, + owner: Optional[str] = "", + ): + self._clickhouse_options = ClickhouseOptions( + name=name, query=query, table=table + ) + + if name is None and table is None: + raise DataSourceNoNameException() + name = name or table + assert name + + super().__init__( + name=name, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping, + description=description, + tags=tags, + owner=owner, + ) + + @staticmethod + def from_proto(data_source: DataSourceProto) -> Any: + assert data_source.HasField("custom_options") + + postgres_options = json.loads(data_source.custom_options.configuration) + + return ClickhouseSource( + name=postgres_options["name"], + query=postgres_options["query"], + table=postgres_options["table"], + field_mapping=dict(data_source.field_mapping), + timestamp_field=data_source.timestamp_field, + created_timestamp_column=data_source.created_timestamp_column, + description=data_source.description, + tags=dict(data_source.tags), + owner=data_source.owner, + ) + + def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + name=self.name, + type=DataSourceProto.CUSTOM_SOURCE, + data_source_class_type="feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse_source.ClickhouseSource", + field_mapping=self.field_mapping, + custom_options=self._clickhouse_options.to_proto(), + description=self.description, + tags=self.tags, + owner=self.owner, + ) + + data_source_proto.timestamp_field = self.timestamp_field + data_source_proto.created_timestamp_column = self.created_timestamp_column + + return data_source_proto + + def validate(self, config: RepoConfig): + pass + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return ch_type_to_feast_value_type + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + with get_client(config.offline_store) as client: + result = client.query( + f"SELECT * FROM {self.get_table_query_string()} AS sub LIMIT 0" + ) + column_types = list(zip(result.column_names, result.column_types)) + return [ + (name, _ch_type_to_ch_type_str(type_)) for name, type_ in column_types + ] + + def get_table_query_string(self) -> str: + if self._clickhouse_options._table: + return f"{self._clickhouse_options._table}" + else: + return f"({self._clickhouse_options._query})" + + +class SavedDatasetClickhouseStorage(SavedDatasetStorage): + _proto_attr_name = "custom_storage" + + clickhouse_options: ClickhouseOptions + + def __init__(self, table_ref: str): + self.clickhouse_options = ClickhouseOptions( + 
table=table_ref, name=None, query=None + ) + + @staticmethod + def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: + return SavedDatasetClickhouseStorage( + table_ref=ClickhouseOptions.from_proto(storage_proto.custom_storage)._table + ) + + def to_proto(self) -> SavedDatasetStorageProto: + return SavedDatasetStorageProto( + custom_storage=self.clickhouse_options.to_proto() + ) + + def to_data_source(self) -> DataSource: + return ClickhouseSource(table=self.clickhouse_options._table) + + +def ch_type_to_feast_value_type(type_str: str) -> ValueType: + type_obj = get_from_name(type_str) + type_cls = type(type_obj) + container_type = None + if isinstance(type_obj, Array): + container_type = Array + type_map: dict[ + tuple[Optional[Type[ClickHouseType]], Type[ClickHouseType]], ValueType + ] = { + (None, Boolean): ValueType.BOOL, + (None, String): ValueType.STRING, + (None, Float32): ValueType.FLOAT, + (None, Float64): ValueType.DOUBLE, + (None, Decimal): ValueType.DOUBLE, + (None, Int32): ValueType.INT32, + (None, Int64): ValueType.INT64, + (None, DateTime): ValueType.UNIX_TIMESTAMP, + (None, DateTime64): ValueType.UNIX_TIMESTAMP, + (Array, Boolean): ValueType.BOOL_LIST, + (Array, String): ValueType.STRING_LIST, + (Array, Float32): ValueType.FLOAT_LIST, + (Array, Float64): ValueType.DOUBLE_LIST, + (Array, Decimal): ValueType.DOUBLE_LIST, + (Array, Int32): ValueType.INT32_LIST, + (Array, Int64): ValueType.INT64_LIST, + (Array, DateTime): ValueType.UNIX_TIMESTAMP_LIST, + (Array, DateTime64): ValueType.UNIX_TIMESTAMP_LIST, + } + value = type_map.get((container_type, type_cls), ValueType.UNKNOWN) + if value == ValueType.UNKNOWN: + print("unknown type:", type_str) + return value + + +def _ch_type_to_ch_type_str(type_: ClickHouseType) -> str: + return type_.name diff --git a/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/tests/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/tests/__init__.py new file mode 100644 index 00000000000..0dd9d78e888 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/tests/__init__.py @@ -0,0 +1 @@ +from .data_source import clickhouse_container # noqa diff --git a/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/tests/data_source.py new file mode 100644 index 00000000000..80fd1751dc5 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/tests/data_source.py @@ -0,0 +1,116 @@ +import logging +from typing import Dict, Optional + +import pandas as pd +import pytest +from testcontainers.clickhouse import ClickHouseContainer +from testcontainers.core.waiting_utils import wait_for_logs + +from feast.data_source import DataSource +from feast.feature_logging import LoggingDestination +from feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse import ( + ClickhouseOfflineStoreConfig, + df_to_clickhouse_table, +) +from feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse_source import ( + ClickhouseSource, +) +from tests.integration.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) + +logger = logging.getLogger(__name__) + +CLICKHOUSE_USER = "default" +CLICKHOUSE_PASSWORD = "password" +CLICKHOUSE_OFFLINE_DB = "default" +CLICKHOUSE_ONLINE_DB = "default_online" + + +@pytest.fixture(scope="session") +def clickhouse_container(): + container = 
ClickHouseContainer( + username=CLICKHOUSE_USER, + password=CLICKHOUSE_PASSWORD, + dbname=CLICKHOUSE_OFFLINE_DB, + ) + container.start() + + log_string_to_wait_for = "Logging errors to" + waited = wait_for_logs( + container=container, + predicate=log_string_to_wait_for, + timeout=30, + interval=10, + ) + logger.info("Waited for %s seconds until clickhouse container was up", waited) + + yield container + container.stop() + + +class ClickhouseDataSourceCreator(DataSourceCreator): + def create_logged_features_destination(self) -> LoggingDestination: + return None # type: ignore + + def __init__( + self, project_name: str, fixture_request: pytest.FixtureRequest, **kwargs + ): + super().__init__( + project_name, + ) + + self.project_name = project_name + self.container = fixture_request.getfixturevalue("clickhouse_container") + if not self.container: + raise RuntimeError( + "In order to use this data source " + "'feast.infra.offline_stores.contrib.clickhouse_offline_store.tests' " + "must be include into pytest plugins" + ) + + self.offline_store_config = ClickhouseOfflineStoreConfig( + type="clickhouse", + host="localhost", + port=self.container.get_exposed_port(8123), + database=CLICKHOUSE_OFFLINE_DB, + user=CLICKHOUSE_USER, + password=CLICKHOUSE_PASSWORD, + ) + + def create_data_source( + self, + df: pd.DataFrame, + destination_name: str, + created_timestamp_column="created_ts", + field_mapping: Optional[Dict[str, str]] = None, + timestamp_field: Optional[str] = "ts", + ) -> DataSource: + destination_name = self.get_prefixed_table_name(destination_name) + + if self.offline_store_config: + if timestamp_field is None: + timestamp_field = "ts" + df_to_clickhouse_table( + self.offline_store_config, df, destination_name, timestamp_field + ) + return ClickhouseSource( + name=destination_name, + query=f"SELECT * FROM {destination_name}", + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping or {"ts_1": "ts"}, + ) + + def create_offline_store_config(self) -> ClickhouseOfflineStoreConfig: + assert self.offline_store_config + return self.offline_store_config + + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}_{suffix}" + + def create_saved_dataset_destination(self): + pass + + def teardown(self): + pass diff --git a/sdk/python/feast/infra/offline_stores/contrib/clickhouse_repo_configuration.py b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_repo_configuration.py new file mode 100644 index 00000000000..5c9d4461b16 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/clickhouse_repo_configuration.py @@ -0,0 +1,11 @@ +from feast.infra.offline_stores.contrib.clickhouse_offline_store.tests.data_source import ( + ClickhouseDataSourceCreator, +) +from tests.integration.feature_repos.repo_configuration import REDIS_CONFIG +from tests.integration.feature_repos.universal.online_store.redis import ( + RedisOnlineStoreCreator, +) + +AVAILABLE_OFFLINE_STORES = [("local", ClickhouseDataSourceCreator)] + +AVAILABLE_ONLINE_STORES = {"redis": (REDIS_CONFIG, RedisOnlineStoreCreator)} diff --git a/sdk/python/feast/infra/utils/clickhouse/__init__.py b/sdk/python/feast/infra/utils/clickhouse/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/sdk/python/feast/infra/utils/clickhouse/clickhouse_config.py b/sdk/python/feast/infra/utils/clickhouse/clickhouse_config.py new file mode 100644 index 00000000000..1f163e0a81b --- /dev/null +++ 
b/sdk/python/feast/infra/utils/clickhouse/clickhouse_config.py @@ -0,0 +1,14 @@ +from pydantic import ConfigDict, StrictStr + +from feast.repo_config import FeastConfigBaseModel + + +class ClickhouseConfig(FeastConfigBaseModel): + host: StrictStr + port: int = 8123 + database: StrictStr + user: StrictStr + password: StrictStr + use_temporary_tables_for_entity_df: bool = True + + model_config = ConfigDict(frozen=True) diff --git a/sdk/python/feast/infra/utils/clickhouse/connection_utils.py b/sdk/python/feast/infra/utils/clickhouse/connection_utils.py new file mode 100644 index 00000000000..e60922e478d --- /dev/null +++ b/sdk/python/feast/infra/utils/clickhouse/connection_utils.py @@ -0,0 +1,18 @@ +from functools import cache + +import clickhouse_connect +from clickhouse_connect.driver import Client + +from feast.infra.utils.clickhouse.clickhouse_config import ClickhouseConfig + + +@cache +def get_client(config: ClickhouseConfig) -> Client: + client = clickhouse_connect.get_client( + host=config.host, + port=config.port, + user=config.user, + password=config.password, + database=config.database, + ) + return client diff --git a/sdk/python/requirements/py3.10-ci-requirements.txt b/sdk/python/requirements/py3.10-ci-requirements.txt index fec49b8cb0d..51e68920f08 100644 --- a/sdk/python/requirements/py3.10-ci-requirements.txt +++ b/sdk/python/requirements/py3.10-ci-requirements.txt @@ -2,7 +2,7 @@ # uv pip compile -p 3.10 --system --no-strip-extras setup.py --extra ci --output-file sdk/python/requirements/py3.10-ci-requirements.txt aiobotocore==2.21.1 # via feast (setup.py) -aiohappyeyeballs==2.5.0 +aiohappyeyeballs==2.6.1 # via aiohttp aiohttp==3.11.13 # via aiobotocore @@ -48,7 +48,7 @@ async-timeout==5.0.1 # redis atpublic==5.1 # via ibis-framework -attrs==25.1.0 +attrs==25.2.0 # via # aiohttp # jsonlines @@ -58,9 +58,9 @@ azure-core==1.32.0 # via # azure-identity # azure-storage-blob -azure-identity==1.20.0 +azure-identity==1.21.0 # via feast (setup.py) -azure-storage-blob==12.24.1 +azure-storage-blob==12.25.0 # via feast (setup.py) babel==2.17.0 # via @@ -96,6 +96,7 @@ cassandra-driver==3.29.2 # via feast (setup.py) certifi==2025.1.31 # via + # clickhouse-connect # docling # elastic-transport # httpcore @@ -126,6 +127,8 @@ click==8.1.8 # pip-tools # typer # uvicorn +clickhouse-connect==0.8.15 + # via feast (setup.py) cloudpickle==3.1.1 # via dask colorama==0.4.6 @@ -182,7 +185,7 @@ distlib==0.3.9 # via virtualenv docker==7.1.0 # via testcontainers -docling==2.25.2 +docling==2.26.0 # via feast (setup.py) docling-core[chunking]==2.21.2 # via @@ -247,7 +250,7 @@ fsspec==2024.9.0 # torch geomet==0.2.1.post1 # via cassandra-driver -google-api-core[grpc]==2.24.1 +google-api-core[grpc]==2.24.2 # via # feast (setup.py) # google-cloud-bigquery @@ -278,11 +281,11 @@ google-cloud-bigquery[pandas]==3.30.0 # via # feast (setup.py) # pandas-gbq -google-cloud-bigquery-storage==2.28.0 +google-cloud-bigquery-storage==2.29.0 # via feast (setup.py) google-cloud-bigtable==2.29.0 # via feast (setup.py) -google-cloud-core==2.4.2 +google-cloud-core==2.4.3 # via # google-cloud-bigquery # google-cloud-bigtable @@ -308,11 +311,9 @@ googleapis-common-protos[grpc]==1.69.1 # grpcio-status great-expectations==0.18.22 # via feast (setup.py) -greenlet==3.1.1 - # via sqlalchemy grpc-google-iam-v1==0.14.1 # via google-cloud-bigtable -grpcio==1.70.0 +grpcio==1.71.0 # via # feast (setup.py) # google-api-core @@ -327,17 +328,17 @@ grpcio==1.70.0 # ikvpy # pymilvus # qdrant-client -grpcio-health-checking==1.70.0 
+grpcio-health-checking==1.71.0 # via feast (setup.py) -grpcio-reflection==1.70.0 +grpcio-reflection==1.71.0 # via feast (setup.py) -grpcio-status==1.70.0 +grpcio-status==1.71.0 # via # google-api-core # ikvpy -grpcio-testing==1.70.0 +grpcio-testing==1.71.0 # via feast (setup.py) -grpcio-tools==1.70.0 +grpcio-tools==1.71.0 # via # feast (setup.py) # qdrant-client @@ -369,7 +370,7 @@ httpx[http2]==0.27.2 # jupyterlab # python-keycloak # qdrant-client -huggingface-hub==0.29.2 +huggingface-hub==0.29.3 # via # docling # docling-ibm-models @@ -383,7 +384,7 @@ ibis-framework[duckdb, mssql]==9.5.0 # ibis-substrait ibis-substrait==4.0.1 # via feast (setup.py) -identify==2.6.8 +identify==2.6.9 # via pre-commit idna==3.10 # via @@ -407,7 +408,7 @@ iniconfig==2.0.0 # via pytest ipykernel==6.29.5 # via jupyterlab -ipython==8.33.0 +ipython==8.34.0 # via # great-expectations # ipykernel @@ -513,7 +514,9 @@ lxml==5.3.1 # python-docx # python-pptx lz4==4.4.3 - # via trino + # via + # clickhouse-connect + # trino makefun==1.15.6 # via great-expectations markdown-it-py==3.0.0 @@ -731,7 +734,7 @@ propcache==0.3.0 # via # aiohttp # yarl -proto-plus==1.26.0 +proto-plus==1.26.1 # via # google-api-core # google-cloud-bigquery-storage @@ -918,6 +921,7 @@ python-pptx==1.0.2 # via docling pytz==2025.1 # via + # clickhouse-connect # great-expectations # ibis-framework # pandas @@ -937,7 +941,7 @@ pyyaml==6.0.2 # responses # transformers # uvicorn -pyzmq==26.2.1 +pyzmq==26.3.0 # via # ipykernel # jupyter-client @@ -986,7 +990,7 @@ requests-oauthlib==2.0.0 # kubernetes requests-toolbelt==1.0.0 # via python-keycloak -responses==0.25.6 +responses==0.25.7 # via moto rfc3339-validator==0.1.4 # via @@ -1012,7 +1016,7 @@ ruamel-yaml==0.17.40 # via great-expectations ruamel-yaml-clib==0.2.12 # via ruamel-yaml -ruff==0.9.9 +ruff==0.9.10 # via feast (setup.py) s3transfer==0.11.3 # via boto3 @@ -1032,7 +1036,7 @@ semchunk==2.2.2 # via docling-core send2trash==1.8.3 # via jupyter-server -setuptools==75.8.2 +setuptools==76.0.0 # via # grpcio-tools # jupyterlab @@ -1086,7 +1090,7 @@ sphinxcontrib-qthelp==2.0.0 # via sphinx sphinxcontrib-serializinghtml==2.0.0 # via sphinx -sqlalchemy[mypy]==2.0.38 +sqlalchemy[mypy]==2.0.39 # via feast (setup.py) sqlglot==25.20.2 # via ibis-framework @@ -1096,7 +1100,7 @@ sqlparams==6.2.0 # via singlestoredb stack-data==0.6.3 # via ipython -starlette==0.46.0 +starlette==0.46.1 # via fastapi substrait==0.23.0 # via ibis-substrait @@ -1121,7 +1125,7 @@ tifffile==2025.2.18 # via scikit-image tinycss2==1.4.0 # via bleach -tokenizers==0.19.1 +tokenizers==0.21.0 # via transformers toml==0.10.2 # via feast (setup.py) @@ -1189,7 +1193,7 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat -transformers==4.42.4 +transformers==4.49.0 # via # docling-core # docling-ibm-models @@ -1281,6 +1285,7 @@ urllib3==2.3.0 # via # feast (setup.py) # botocore + # clickhouse-connect # docker # elastic-transport # great-expectations @@ -1339,4 +1344,6 @@ yarl==1.18.3 zipp==3.21.0 # via importlib-metadata zstandard==0.23.0 - # via trino + # via + # clickhouse-connect + # trino diff --git a/sdk/python/requirements/py3.10-requirements.txt b/sdk/python/requirements/py3.10-requirements.txt index bb4f26effd8..dbcf2648cfb 100644 --- a/sdk/python/requirements/py3.10-requirements.txt +++ b/sdk/python/requirements/py3.10-requirements.txt @@ -6,7 +6,7 @@ anyio==4.8.0 # via # starlette # watchfiles -attrs==25.1.0 +attrs==25.2.0 # via # jsonschema # referencing @@ -33,10 +33,8 @@ exceptiongroup==1.2.2 # via anyio fastapi==0.115.11 # 
via feast (setup.py) -fsspec==2025.2.0 +fsspec==2025.3.0 # via dask -greenlet==3.1.1 - # via sqlalchemy gunicorn==23.0.0 # via # feast (setup.py) @@ -127,9 +125,9 @@ six==1.17.0 # via python-dateutil sniffio==1.3.1 # via anyio -sqlalchemy[mypy]==2.0.38 +sqlalchemy[mypy]==2.0.39 # via feast (setup.py) -starlette==0.46.0 +starlette==0.46.1 # via fastapi tabulate==0.9.0 # via feast (setup.py) diff --git a/sdk/python/requirements/py3.11-ci-requirements.txt b/sdk/python/requirements/py3.11-ci-requirements.txt index 7f03e1d9ccd..da6cb402ebc 100644 --- a/sdk/python/requirements/py3.11-ci-requirements.txt +++ b/sdk/python/requirements/py3.11-ci-requirements.txt @@ -2,7 +2,7 @@ # uv pip compile -p 3.11 --system --no-strip-extras setup.py --extra ci --output-file sdk/python/requirements/py3.11-ci-requirements.txt aiobotocore==2.21.1 # via feast (setup.py) -aiohappyeyeballs==2.5.0 +aiohappyeyeballs==2.6.1 # via aiohttp aiohttp==3.11.13 # via aiobotocore @@ -46,7 +46,7 @@ async-timeout==5.0.1 # via redis atpublic==5.1 # via ibis-framework -attrs==25.1.0 +attrs==25.2.0 # via # aiohttp # jsonlines @@ -56,9 +56,9 @@ azure-core==1.32.0 # via # azure-identity # azure-storage-blob -azure-identity==1.20.0 +azure-identity==1.21.0 # via feast (setup.py) -azure-storage-blob==12.24.1 +azure-storage-blob==12.25.0 # via feast (setup.py) babel==2.17.0 # via @@ -94,6 +94,7 @@ cassandra-driver==3.29.2 # via feast (setup.py) certifi==2025.1.31 # via + # clickhouse-connect # docling # elastic-transport # httpcore @@ -124,6 +125,8 @@ click==8.1.8 # pip-tools # typer # uvicorn +clickhouse-connect==0.8.15 + # via feast (setup.py) cloudpickle==3.1.1 # via dask colorama==0.4.6 @@ -180,7 +183,7 @@ distlib==0.3.9 # via virtualenv docker==7.1.0 # via testcontainers -docling==2.25.2 +docling==2.26.0 # via feast (setup.py) docling-core[chunking]==2.21.2 # via @@ -240,7 +243,7 @@ fsspec==2024.9.0 # torch geomet==0.2.1.post1 # via cassandra-driver -google-api-core[grpc]==2.24.1 +google-api-core[grpc]==2.24.2 # via # feast (setup.py) # google-cloud-bigquery @@ -271,11 +274,11 @@ google-cloud-bigquery[pandas]==3.30.0 # via # feast (setup.py) # pandas-gbq -google-cloud-bigquery-storage==2.28.0 +google-cloud-bigquery-storage==2.29.0 # via feast (setup.py) google-cloud-bigtable==2.29.0 # via feast (setup.py) -google-cloud-core==2.4.2 +google-cloud-core==2.4.3 # via # google-cloud-bigquery # google-cloud-bigtable @@ -301,11 +304,9 @@ googleapis-common-protos[grpc]==1.69.1 # grpcio-status great-expectations==0.18.22 # via feast (setup.py) -greenlet==3.1.1 - # via sqlalchemy grpc-google-iam-v1==0.14.1 # via google-cloud-bigtable -grpcio==1.70.0 +grpcio==1.71.0 # via # feast (setup.py) # google-api-core @@ -320,17 +321,17 @@ grpcio==1.70.0 # ikvpy # pymilvus # qdrant-client -grpcio-health-checking==1.70.0 +grpcio-health-checking==1.71.0 # via feast (setup.py) -grpcio-reflection==1.70.0 +grpcio-reflection==1.71.0 # via feast (setup.py) -grpcio-status==1.70.0 +grpcio-status==1.71.0 # via # google-api-core # ikvpy -grpcio-testing==1.70.0 +grpcio-testing==1.71.0 # via feast (setup.py) -grpcio-tools==1.70.0 +grpcio-tools==1.71.0 # via # feast (setup.py) # qdrant-client @@ -362,7 +363,7 @@ httpx[http2]==0.27.2 # jupyterlab # python-keycloak # qdrant-client -huggingface-hub==0.29.2 +huggingface-hub==0.29.3 # via # docling # docling-ibm-models @@ -376,7 +377,7 @@ ibis-framework[duckdb, mssql]==9.5.0 # ibis-substrait ibis-substrait==4.0.1 # via feast (setup.py) -identify==2.6.8 +identify==2.6.9 # via pre-commit idna==3.10 # via @@ -398,7 +399,7 @@ 
iniconfig==2.0.0 # via pytest ipykernel==6.29.5 # via jupyterlab -ipython==9.0.1 +ipython==9.0.2 # via # great-expectations # ipykernel @@ -506,7 +507,9 @@ lxml==5.3.1 # python-docx # python-pptx lz4==4.4.3 - # via trino + # via + # clickhouse-connect + # trino makefun==1.15.6 # via great-expectations markdown-it-py==3.0.0 @@ -724,7 +727,7 @@ propcache==0.3.0 # via # aiohttp # yarl -proto-plus==1.26.0 +proto-plus==1.26.1 # via # google-api-core # google-cloud-bigquery-storage @@ -912,6 +915,7 @@ python-pptx==1.0.2 # via docling pytz==2025.1 # via + # clickhouse-connect # great-expectations # ibis-framework # pandas @@ -931,7 +935,7 @@ pyyaml==6.0.2 # responses # transformers # uvicorn -pyzmq==26.2.1 +pyzmq==26.3.0 # via # ipykernel # jupyter-client @@ -980,7 +984,7 @@ requests-oauthlib==2.0.0 # kubernetes requests-toolbelt==1.0.0 # via python-keycloak -responses==0.25.6 +responses==0.25.7 # via moto rfc3339-validator==0.1.4 # via @@ -1006,7 +1010,7 @@ ruamel-yaml==0.17.40 # via great-expectations ruamel-yaml-clib==0.2.12 # via ruamel-yaml -ruff==0.9.9 +ruff==0.9.10 # via feast (setup.py) s3transfer==0.11.3 # via boto3 @@ -1026,7 +1030,7 @@ semchunk==2.2.2 # via docling-core send2trash==1.8.3 # via jupyter-server -setuptools==75.8.2 +setuptools==76.0.0 # via # grpcio-tools # jupyterlab @@ -1080,7 +1084,7 @@ sphinxcontrib-qthelp==2.0.0 # via sphinx sphinxcontrib-serializinghtml==2.0.0 # via sphinx -sqlalchemy[mypy]==2.0.38 +sqlalchemy[mypy]==2.0.39 # via feast (setup.py) sqlglot==25.20.2 # via ibis-framework @@ -1090,7 +1094,7 @@ sqlparams==6.2.0 # via singlestoredb stack-data==0.6.3 # via ipython -starlette==0.46.0 +starlette==0.46.1 # via fastapi substrait==0.23.0 # via ibis-substrait @@ -1115,7 +1119,7 @@ tifffile==2025.2.18 # via scikit-image tinycss2==1.4.0 # via bleach -tokenizers==0.19.1 +tokenizers==0.21.0 # via transformers toml==0.10.2 # via feast (setup.py) @@ -1175,7 +1179,7 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat -transformers==4.42.4 +transformers==4.49.0 # via # docling-core # docling-ibm-models @@ -1261,6 +1265,7 @@ urllib3==2.3.0 # via # feast (setup.py) # botocore + # clickhouse-connect # docker # elastic-transport # great-expectations @@ -1319,4 +1324,6 @@ yarl==1.18.3 zipp==3.21.0 # via importlib-metadata zstandard==0.23.0 - # via trino + # via + # clickhouse-connect + # trino diff --git a/sdk/python/requirements/py3.11-requirements.txt b/sdk/python/requirements/py3.11-requirements.txt index 80478d5bad6..5d182e2fc09 100644 --- a/sdk/python/requirements/py3.11-requirements.txt +++ b/sdk/python/requirements/py3.11-requirements.txt @@ -6,7 +6,7 @@ anyio==4.8.0 # via # starlette # watchfiles -attrs==25.1.0 +attrs==25.2.0 # via # jsonschema # referencing @@ -31,10 +31,8 @@ dill==0.3.9 # via feast (setup.py) fastapi==0.115.11 # via feast (setup.py) -fsspec==2025.2.0 +fsspec==2025.3.0 # via dask -greenlet==3.1.1 - # via sqlalchemy gunicorn==23.0.0 # via # feast (setup.py) @@ -125,9 +123,9 @@ six==1.17.0 # via python-dateutil sniffio==1.3.1 # via anyio -sqlalchemy[mypy]==2.0.38 +sqlalchemy[mypy]==2.0.39 # via feast (setup.py) -starlette==0.46.0 +starlette==0.46.1 # via fastapi tabulate==0.9.0 # via feast (setup.py) diff --git a/sdk/python/requirements/py3.9-ci-requirements.txt b/sdk/python/requirements/py3.9-ci-requirements.txt index 3a564962973..c2d8966c5ea 100644 --- a/sdk/python/requirements/py3.9-ci-requirements.txt +++ b/sdk/python/requirements/py3.9-ci-requirements.txt @@ -2,7 +2,7 @@ # uv pip compile -p 3.9 --system --no-strip-extras setup.py --extra ci 
--output-file sdk/python/requirements/py3.9-ci-requirements.txt aiobotocore==2.21.1 # via feast (setup.py) -aiohappyeyeballs==2.5.0 +aiohappyeyeballs==2.6.1 # via aiohttp aiohttp==3.11.13 # via aiobotocore @@ -48,7 +48,7 @@ async-timeout==5.0.1 # redis atpublic==4.1.0 # via ibis-framework -attrs==25.1.0 +attrs==25.2.0 # via # aiohttp # jsonlines @@ -58,9 +58,9 @@ azure-core==1.32.0 # via # azure-identity # azure-storage-blob -azure-identity==1.20.0 +azure-identity==1.21.0 # via feast (setup.py) -azure-storage-blob==12.24.1 +azure-storage-blob==12.25.0 # via feast (setup.py) babel==2.17.0 # via @@ -98,6 +98,7 @@ cassandra-driver==3.29.2 # via feast (setup.py) certifi==2025.1.31 # via + # clickhouse-connect # docling # elastic-transport # httpcore @@ -128,6 +129,8 @@ click==8.1.8 # pip-tools # typer # uvicorn +clickhouse-connect==0.8.15 + # via feast (setup.py) cloudpickle==3.1.1 # via dask colorama==0.4.6 @@ -188,7 +191,7 @@ distlib==0.3.9 # via virtualenv docker==7.1.0 # via testcontainers -docling==2.25.2 +docling==2.26.0 # via feast (setup.py) docling-core[chunking]==2.21.2 # via @@ -253,7 +256,7 @@ fsspec==2024.9.0 # torch geomet==0.2.1.post1 # via cassandra-driver -google-api-core[grpc]==2.24.1 +google-api-core[grpc]==2.24.2 # via # feast (setup.py) # google-cloud-bigquery @@ -284,11 +287,11 @@ google-cloud-bigquery[pandas]==3.30.0 # via # feast (setup.py) # pandas-gbq -google-cloud-bigquery-storage==2.28.0 +google-cloud-bigquery-storage==2.29.0 # via feast (setup.py) google-cloud-bigtable==2.29.0 # via feast (setup.py) -google-cloud-core==2.4.2 +google-cloud-core==2.4.3 # via # google-cloud-bigquery # google-cloud-bigtable @@ -314,11 +317,9 @@ googleapis-common-protos[grpc]==1.69.1 # grpcio-status great-expectations==0.18.22 # via feast (setup.py) -greenlet==3.1.1 - # via sqlalchemy grpc-google-iam-v1==0.14.1 # via google-cloud-bigtable -grpcio==1.70.0 +grpcio==1.71.0 # via # feast (setup.py) # google-api-core @@ -333,17 +334,17 @@ grpcio==1.70.0 # ikvpy # pymilvus # qdrant-client -grpcio-health-checking==1.70.0 +grpcio-health-checking==1.71.0 # via feast (setup.py) -grpcio-reflection==1.70.0 +grpcio-reflection==1.71.0 # via feast (setup.py) -grpcio-status==1.70.0 +grpcio-status==1.71.0 # via # google-api-core # ikvpy -grpcio-testing==1.70.0 +grpcio-testing==1.71.0 # via feast (setup.py) -grpcio-tools==1.70.0 +grpcio-tools==1.71.0 # via # feast (setup.py) # qdrant-client @@ -375,7 +376,7 @@ httpx[http2]==0.27.2 # jupyterlab # python-keycloak # qdrant-client -huggingface-hub==0.29.2 +huggingface-hub==0.29.3 # via # docling # docling-ibm-models @@ -389,7 +390,7 @@ ibis-framework[duckdb, mssql]==9.0.0 # ibis-substrait ibis-substrait==4.0.1 # via feast (setup.py) -identify==2.6.8 +identify==2.6.9 # via pre-commit idna==3.10 # via @@ -526,7 +527,9 @@ lxml==5.3.1 # python-docx # python-pptx lz4==4.4.3 - # via trino + # via + # clickhouse-connect + # trino makefun==1.15.6 # via great-expectations markdown-it-py==3.0.0 @@ -744,7 +747,7 @@ propcache==0.3.0 # via # aiohttp # yarl -proto-plus==1.26.0 +proto-plus==1.26.1 # via # google-api-core # google-cloud-bigquery-storage @@ -931,6 +934,7 @@ python-pptx==1.0.2 # via docling pytz==2025.1 # via + # clickhouse-connect # great-expectations # ibis-framework # pandas @@ -950,7 +954,7 @@ pyyaml==6.0.2 # responses # transformers # uvicorn -pyzmq==26.2.1 +pyzmq==26.3.0 # via # ipykernel # jupyter-client @@ -999,7 +1003,7 @@ requests-oauthlib==2.0.0 # kubernetes requests-toolbelt==1.0.0 # via python-keycloak -responses==0.25.6 +responses==0.25.7 # via 
moto
 rfc3339-validator==0.1.4
     # via
@@ -1025,7 +1029,7 @@ ruamel-yaml==0.17.40
     # via great-expectations
 ruamel-yaml-clib==0.2.12
     # via ruamel-yaml
-ruff==0.9.9
+ruff==0.9.10
     # via feast (setup.py)
 s3transfer==0.11.3
     # via boto3
@@ -1045,7 +1049,7 @@ semchunk==2.2.2
     # via docling-core
 send2trash==1.8.3
     # via jupyter-server
-setuptools==75.8.2
+setuptools==76.0.0
     # via
     #   grpcio-tools
     #   jupyterlab
@@ -1099,7 +1103,7 @@ sphinxcontrib-qthelp==2.0.0
     # via sphinx
 sphinxcontrib-serializinghtml==2.0.0
     # via sphinx
-sqlalchemy[mypy]==2.0.38
+sqlalchemy[mypy]==2.0.39
     # via feast (setup.py)
 sqlglot==23.12.2
     # via ibis-framework
@@ -1109,7 +1113,7 @@ sqlparams==6.2.0
     # via singlestoredb
 stack-data==0.6.3
     # via ipython
-starlette==0.46.0
+starlette==0.46.1
     # via fastapi
 substrait==0.23.0
     # via ibis-substrait
@@ -1134,7 +1138,7 @@ tifffile==2024.8.30
     # via scikit-image
 tinycss2==1.4.0
     # via bleach
-tokenizers==0.19.1
+tokenizers==0.21.0
     # via transformers
 toml==0.10.2
     # via feast (setup.py)
@@ -1202,7 +1206,7 @@ traitlets==5.14.3
     #   nbclient
     #   nbconvert
     #   nbformat
-transformers==4.42.4
+transformers==4.49.0
     # via
     #   docling-core
     #   docling-ibm-models
@@ -1297,6 +1301,7 @@ urllib3==1.26.20
     # via
     #   feast (setup.py)
     #   botocore
+    #   clickhouse-connect
     #   docker
     #   elastic-transport
     #   great-expectations
@@ -1356,4 +1361,6 @@ yarl==1.18.3
 zipp==3.21.0
     # via importlib-metadata
 zstandard==0.23.0
-    # via trino
+    # via
+    #   clickhouse-connect
+    #   trino
diff --git a/sdk/python/requirements/py3.9-requirements.txt b/sdk/python/requirements/py3.9-requirements.txt
index 15aef6c3869..8daedb75c88 100644
--- a/sdk/python/requirements/py3.9-requirements.txt
+++ b/sdk/python/requirements/py3.9-requirements.txt
@@ -6,7 +6,7 @@ anyio==4.8.0
     # via
     #   starlette
     #   watchfiles
-attrs==25.1.0
+attrs==25.2.0
     # via
     #   jsonschema
     #   referencing
@@ -37,10 +37,8 @@ exceptiongroup==1.2.2
     # via anyio
 fastapi==0.115.11
     # via feast (setup.py)
-fsspec==2025.2.0
+fsspec==2025.3.0
     # via dask
-greenlet==3.1.1
-    # via sqlalchemy
 gunicorn==23.0.0
     # via
     #   feast (setup.py)
@@ -134,9 +132,9 @@ six==1.17.0
     # via python-dateutil
 sniffio==1.3.1
     # via anyio
-sqlalchemy[mypy]==2.0.38
+sqlalchemy[mypy]==2.0.39
     # via feast (setup.py)
-starlette==0.46.0
+starlette==0.46.1
     # via fastapi
 tabulate==0.9.0
     # via feast (setup.py)
diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py
index 9d0b620c083..5ba99b9d7f1 100644
--- a/sdk/python/tests/integration/registration/test_universal_types.py
+++ b/sdk/python/tests/integration/registration/test_universal_types.py
@@ -340,6 +340,14 @@ def offline_types_test_fixtures(request, environment):
     config: TypeTestConfig = request.param
     if environment.provider == "aws" and config.feature_is_list is True:
         pytest.skip("Redshift doesn't support list features")
+    if (
+        environment.data_source_creator.__class__.__name__
+        == "ClickhouseDataSourceCreator"
+        and config.feature_dtype in {"float", "datetime", "bool"}
+        and config.feature_is_list
+        and not config.has_empty_list
+    ):
+        pytest.skip("Clickhouse doesn't support Nullable(Array) type features")
     return get_fixtures(request, environment)


diff --git a/setup.py b/setup.py
index 2adaee75548..503329aa6d8 100644
--- a/setup.py
+++ b/setup.py
@@ -164,10 +164,12 @@
 MILVUS_REQUIRED = ["pymilvus"]

 TORCH_REQUIRED = [
-    "torch>=2.2.2",
+    "torch==2.2.2",
     "torchvision>=0.17.2",
 ]

+CLICKHOUSE_REQUIRED = ["clickhouse-connect>=0.7.19"]
+
 CI_REQUIRED = (
     [
         "build",
@@ -241,6 +243,7 @@
     + MILVUS_REQUIRED
     + DOCLING_REQUIRED
     + TORCH_REQUIRED
+    + CLICKHOUSE_REQUIRED
 )
 NLP_REQUIRED = (
     DOCLING_REQUIRED
@@ -323,6 +326,7 @@
         "docling": DOCLING_REQUIRED,
         "pytorch": TORCH_REQUIRED,
         "nlp": NLP_REQUIRED,
+        "clickhouse": CLICKHOUSE_REQUIRED,
     },
     include_package_data=True,
     license="Apache",
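Note on the skip added to `offline_types_test_fixtures`: it encodes a ClickHouse limitation rather than a Feast bug. ClickHouse accepts arrays whose *elements* are nullable but rejects a `Nullable(Array(...))` column type, which is what those list-feature test cases would require. A minimal sketch of the behaviour, assuming a locally running ClickHouse server reachable through the `clickhouse-connect` client introduced by this change (host, port, and table names are illustrative only):

```python
# Illustrative only: assumes a local ClickHouse server on the default HTTP port.
import clickhouse_connect

client = clickhouse_connect.get_client(host="localhost", port=8123)

# Arrays of nullable elements are a valid ClickHouse column type.
client.command(
    "CREATE TABLE feature_ok (val Array(Nullable(Float64))) ENGINE = Memory"
)

# A Nullable(Array(...)) column is rejected by the server, which is why the
# type test above skips list features that would need such a column.
try:
    client.command(
        "CREATE TABLE feature_bad (val Nullable(Array(Float64))) ENGINE = Memory"
    )
except Exception as exc:
    print(f"ClickHouse rejected Nullable(Array): {exc}")
```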