Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions docs/reference/type-system.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,25 @@ All primitive types (except `Map` and `Json`) have corresponding set types for s
- Set types are best suited for **online serving** use cases where feature values are written as Python sets and retrieved via `get_online_features`.
{% endhint %}

### Nested Collection Types

Feast supports arbitrarily nested collections using a recursive `VALUE_LIST` / `VALUE_SET` design. The outer container determines the proto enum (`VALUE_LIST` for `Array(…)`, `VALUE_SET` for `Set(…)`), while the full inner type structure is persisted via a mandatory `feast:nested_inner_type` Field tag.

| Feast Type | Python Type | ValueType | Description |
|------------|-------------|-----------|-------------|
| `Array(Array(T))` | `List[List[T]]` | `VALUE_LIST` | List of lists |
| `Array(Set(T))` | `List[List[T]]` | `VALUE_LIST` | List of sets |
| `Set(Array(T))` | `List[List[T]]` | `VALUE_SET` | Set of lists |
| `Set(Set(T))` | `List[List[T]]` | `VALUE_SET` | Set of sets |
| `Array(Array(Array(T)))` | `List[List[List[T]]]` | `VALUE_LIST` | 3-level nesting |

Where `T` is any supported primitive type (Int32, Int64, Float32, Float64, String, Bytes, Bool, UnixTimestamp) or another nested collection type.

**Notes:**
- Nesting depth is **unlimited**. `Array(Array(Array(T)))`, `Set(Array(Set(T)))`, etc. are all supported.
- Inner type information is preserved via Field tags (`feast:nested_inner_type`) and restored during deserialization. This tag is mandatory for nested collection types.
- Empty inner collections (`[]`) are stored as empty proto values and round-trip as `None`. For example, `[[1, 2], [], [3]]` becomes `[[1, 2], None, [3]]` after a write-read cycle.

### Map Types

Map types allow storing dictionary-like data structures:
Expand Down Expand Up @@ -233,6 +252,10 @@ user_features = FeatureView(
Field(name="metadata", dtype=Map),
Field(name="activity_log", dtype=Array(Map)),

# Nested collection types
Field(name="weekly_scores", dtype=Array(Array(Float64))),
Field(name="unique_tags_per_category", dtype=Array(Set(String))),

# JSON type
Field(name="raw_event", dtype=Json),

Expand Down Expand Up @@ -290,6 +313,30 @@ related_sessions = [uuid.uuid4(), uuid.uuid4(), uuid.uuid4()]
unique_devices = {uuid.uuid4(), uuid.uuid4()}
```

### Nested Collection Type Usage Examples

Nested collections allow storing multi-dimensional data with unlimited depth:

```python
# List of lists — e.g., weekly score history per user
weekly_scores = [[85.0, 90.5, 78.0], [92.0, 88.5], [95.0, 91.0, 87.5]]

# List of sets — e.g., unique tags assigned per category
unique_tags_per_category = [["python", "ml"], ["rust", "systems"], ["python", "web"]]

# 3-level nesting — e.g., multi-dimensional matrices
Field(name="tensor", dtype=Array(Array(Array(Float64))))

# Mixed nesting
Field(name="grouped_tags", dtype=Array(Set(Array(String))))
```

**Limitation:** Empty inner collections round-trip as `None`:
```python
# Input: [[1, 2], [], [3]]
# Output: [[1, 2], None, [3]] (empty [] becomes None after write-read cycle)
```

### Map Type Usage Examples

Maps can store complex nested data structures:
Expand Down
4 changes: 4 additions & 0 deletions protos/feast/types/Value.proto
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ message ValueType {
TIME_UUID_LIST = 39;
UUID_SET = 40;
TIME_UUID_SET = 41;
VALUE_LIST = 42;
VALUE_SET = 43;
}
}

Expand Down Expand Up @@ -108,6 +110,8 @@ message Value {
StringList time_uuid_list_val = 39;
StringSet uuid_set_val = 40;
StringSet time_uuid_set_val = 41;
RepeatedValue list_val = 42;
RepeatedValue set_val = 43;
}
}

Expand Down
38 changes: 31 additions & 7 deletions sdk/python/feast/field.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from feast.value_type import ValueType

STRUCT_SCHEMA_TAG = "feast:struct_schema"
NESTED_COLLECTION_INNER_TYPE_TAG = "feast:nested_inner_type"


@typechecked
Expand Down Expand Up @@ -118,7 +119,7 @@ def __str__(self):

def to_proto(self) -> FieldProto:
"""Converts a Field object to its protobuf representation."""
from feast.types import Array
from feast.types import Array, Set

value_type = self.dtype.to_value_type()
vector_search_metric = self.vector_search_metric or ""
Expand All @@ -128,6 +129,11 @@ def to_proto(self) -> FieldProto:
tags[STRUCT_SCHEMA_TAG] = _serialize_struct_schema(self.dtype)
elif isinstance(self.dtype, Array) and isinstance(self.dtype.base_type, Struct):
tags[STRUCT_SCHEMA_TAG] = _serialize_struct_schema(self.dtype.base_type)
# Persist nested collection type info in tags
if isinstance(self.dtype, (Array, Set)) and isinstance(
self.dtype.base_type, (Array, Set)
):
tags[NESTED_COLLECTION_INNER_TYPE_TAG] = _feast_type_to_str(self.dtype)
return FieldProto(
name=self.name,
value_type=value_type.value,
Expand Down Expand Up @@ -155,17 +161,24 @@ def from_proto(cls, field_proto: FieldProto):
# Reconstruct Struct type from persisted schema in tags
from feast.types import Array

internal_tags = {STRUCT_SCHEMA_TAG, NESTED_COLLECTION_INNER_TYPE_TAG}
dtype: FeastType
if value_type == ValueType.STRUCT and STRUCT_SCHEMA_TAG in tags:
dtype = _deserialize_struct_schema(tags[STRUCT_SCHEMA_TAG])
user_tags = {k: v for k, v in tags.items() if k != STRUCT_SCHEMA_TAG}
user_tags = {k: v for k, v in tags.items() if k not in internal_tags}
elif value_type == ValueType.STRUCT_LIST and STRUCT_SCHEMA_TAG in tags:
inner_struct = _deserialize_struct_schema(tags[STRUCT_SCHEMA_TAG])
dtype = Array(inner_struct)
user_tags = {k: v for k, v in tags.items() if k != STRUCT_SCHEMA_TAG}
user_tags = {k: v for k, v in tags.items() if k not in internal_tags}
elif (
value_type in (ValueType.VALUE_LIST, ValueType.VALUE_SET)
and NESTED_COLLECTION_INNER_TYPE_TAG in tags
):
dtype = _str_to_feast_type(tags[NESTED_COLLECTION_INNER_TYPE_TAG])
user_tags = {k: v for k, v in tags.items() if k not in internal_tags}
else:
dtype = from_value_type(value_type=value_type)
user_tags = tags
user_tags = {k: v for k, v in tags.items() if k not in internal_tags}

return cls(
name=field_proto.name,
Expand Down Expand Up @@ -198,6 +211,7 @@ def _feast_type_to_str(feast_type: FeastType) -> str:
from feast.types import (
Array,
PrimitiveFeastType,
Set,
)

if isinstance(feast_type, PrimitiveFeastType):
Expand All @@ -209,6 +223,8 @@ def _feast_type_to_str(feast_type: FeastType) -> str:
return json.dumps({"__struct__": nested})
elif isinstance(feast_type, Array):
return f"Array({_feast_type_to_str(feast_type.base_type)})"
elif isinstance(feast_type, Set):
return f"Set({_feast_type_to_str(feast_type.base_type)})"
else:
return str(feast_type)

Expand All @@ -218,6 +234,7 @@ def _str_to_feast_type(type_str: str) -> FeastType:
from feast.types import (
Array,
PrimitiveFeastType,
Set,
)

# Check if it's an Array type
Expand All @@ -226,6 +243,12 @@ def _str_to_feast_type(type_str: str) -> FeastType:
base_type = _str_to_feast_type(inner)
return Array(base_type)

# Check if it's a Set type
if type_str.startswith("Set(") and type_str.endswith(")"):
inner = type_str[4:-1]
base_type = _str_to_feast_type(inner)
return Set(base_type)

# Check if it's a nested Struct (JSON encoded)
if type_str.startswith("{"):
try:
Expand All @@ -243,9 +266,10 @@ def _str_to_feast_type(type_str: str) -> FeastType:
try:
return PrimitiveFeastType[type_str]
except KeyError:
from feast.types import String

return String
raise ValueError(
f"Unknown FeastType: {type_str!r}. "
f"Valid primitive types: {[t.name for t in PrimitiveFeastType]}"
)


def _serialize_struct_schema(struct_type: Struct) -> str:
Expand Down
48 changes: 39 additions & 9 deletions sdk/python/feast/infra/offline_stores/offline_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import uuid
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
Expand All @@ -21,6 +22,7 @@
from feast.repo_config import RepoConfig
from feast.type_map import feast_value_type_to_pa
from feast.utils import _get_requested_feature_views_to_features_dict, to_naive_utc
from feast.value_type import ValueType

DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL = "event_timestamp"

Expand Down Expand Up @@ -241,6 +243,37 @@ def get_offline_store_from_config(offline_store_config: Any) -> OfflineStore:
return offline_store_class()


_PA_BASIC_TYPES = {
"int32": pa.int32(),
"int64": pa.int64(),
"double": pa.float64(),
"float": pa.float32(),
"string": pa.string(),
"binary": pa.binary(),
"bool": pa.bool_(),
"large_string": pa.large_string(),
"null": pa.null(),
}


def _parse_pa_type_str(pa_type_str: str) -> pa.DataType:
"""Parse a PyArrow type string to preserve inner element types for nested lists."""
pa_type_str = pa_type_str.strip()
if pa_type_str.startswith("list<item: ") and pa_type_str.endswith(">"):
inner = pa_type_str[len("list<item: ") : -1]
return pa.list_(_parse_pa_type_str(inner))
if pa_type_str in _PA_BASIC_TYPES:
return _PA_BASIC_TYPES[pa_type_str]
if pa_type_str.startswith("timestamp"):
return pa.timestamp("us")
logger = logging.getLogger(__name__)
logger.warning(
"Unrecognized PyArrow type string '%s', falling back to pa.string()",
pa_type_str,
)
return pa.string()


def get_pyarrow_schema_from_batch_source(
config: RepoConfig, batch_source: DataSource, timestamp_unit: str = "us"
) -> Tuple[pa.Schema, List[str]]:
Expand All @@ -250,15 +283,12 @@ def get_pyarrow_schema_from_batch_source(
pa_schema = []
column_names = []
for column_name, column_type in column_names_and_types:
pa_schema.append(
(
column_name,
feast_value_type_to_pa(
batch_source.source_datatype_to_feast_value_type()(column_type),
timestamp_unit=timestamp_unit,
),
)
)
value_type = batch_source.source_datatype_to_feast_value_type()(column_type)
if value_type in (ValueType.VALUE_LIST, ValueType.VALUE_SET):
pa_type = _parse_pa_type_str(column_type)
else:
pa_type = feast_value_type_to_pa(value_type, timestamp_unit=timestamp_unit)
pa_schema.append((column_name, pa_type))
column_names.append(column_name)

return pa.schema(pa_schema), column_names
Expand Down
16 changes: 15 additions & 1 deletion sdk/python/feast/infra/online_stores/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ def _proto_value_to_transport_value(proto_value: ValueProto) -> Any:
if val_attr == "json_list_val":
return list(getattr(proto_value, val_attr).val)

# Nested collection types use feast_value_type_to_python_type
# which handles recursive conversion of RepeatedValue protos.
if val_attr in ("list_val", "set_val"):
return feast_value_type_to_python_type(proto_value)

# Map/Struct types are converted to Python dicts by
# feast_value_type_to_python_type. Serialise them to JSON strings
# so the server-side DataFrame gets VARCHAR columns instead of
Expand Down Expand Up @@ -204,6 +209,12 @@ def online_read(
logger.debug("Able to retrieve the online features from feature server.")
response_json = json.loads(response.text)
event_ts = self._get_event_ts(response_json)
# Build feature name -> ValueType mapping so we can reconstruct
# complex types (nested collections, sets, etc.) that cannot be
# inferred from raw JSON values alone.
feature_type_map: Dict[str, ValueType] = {
f.name: f.dtype.to_value_type() for f in table.features
}
# Iterating over results and converting the API results in column format to row format.
result_tuples: List[
Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]
Expand All @@ -223,13 +234,16 @@ def online_read(
]
== "PRESENT"
):
feature_value_type = feature_type_map.get(
feature_name, ValueType.UNKNOWN
)
message = python_values_to_proto_values(
[
response_json["results"][index]["values"][
feature_value_index
]
],
ValueType.UNKNOWN,
feature_value_type,
)
feature_values_dict[feature_name] = message[0]
else:
Expand Down
19 changes: 19 additions & 0 deletions sdk/python/feast/proto_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ def to_json_object(printer: _Printer, message: ProtoMessage) -> JsonObject:
# to JSON. The parse back result will be different from original message.
if which is None or which == "null_val":
return None
elif which in ("list_val", "set_val"):
# Nested collection: RepeatedValue containing Values
repeated = getattr(message, which)
value = [
printer._MessageToJsonObject(inner_val) for inner_val in repeated.val
]
elif "_list_" in which:
value = list(getattr(message, which).val)
else:
Expand All @@ -86,6 +92,19 @@ def from_json_object(
if len(value) == 0:
# Clear will mark the struct as modified so it will be created even if there are no values
message.int64_list_val.Clear()
elif isinstance(value[0], list) or (
value[0] is None and any(isinstance(v, list) for v in value)
):
# Nested collection (list of lists).
# Check any() to handle cases where the first element is None
# (empty inner collections round-trip through proto as None).
# Default to list_val since JSON transport loses the
# outer/inner set distinction.
rv = RepeatedValue()
for inner in value:
inner_val = rv.val.add()
from_json_object(parser, inner, inner_val)
message.list_val.CopyFrom(rv)
elif isinstance(value[0], bool):
message.bool_list_val.val.extend(value)
elif isinstance(value[0], str):
Expand Down
Loading
Loading