Skip to content

Commit 024737c

Browse files
authored
Use drop_duplicates() instead of groupby (about 1.5~2x faster) (#1617)
* Use drop_duplicates() instead of groupby (about 1.5~2x faster)
  Signed-off-by: rightx2 <rightx2@gmail.com>
* Lint
  Signed-off-by: rightx2 <rightx2@gmail.com>
1 parent 24dc3f4 commit 024737c

File tree

1 file changed

+9
-6
lines changed
  • sdk/python/feast/infra/offline_stores

1 file changed

+9
-6
lines changed

sdk/python/feast/infra/offline_stores/file.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,12 @@ def evaluate_historical_retrieval():
153153
]
154154

155155
df_to_join.sort_values(by=right_entity_key_sort_columns, inplace=True)
156-
df_to_join = df_to_join.groupby(by=right_entity_key_columns).last()
157-
df_to_join.reset_index(inplace=True)
156+
df_to_join.drop_duplicates(
157+
right_entity_key_sort_columns,
158+
keep="last",
159+
ignore_index=True,
160+
inplace=True,
161+
)
158162

159163
# Select only the columns we need to join from the feature dataframe
160164
df_to_join = df_to_join[right_entity_key_columns + feature_names]
@@ -231,10 +235,9 @@ def pull_latest_from_table_or_query(
231235
(source_df[event_timestamp_column] >= start_date)
232236
& (source_df[event_timestamp_column] < end_date)
233237
]
234-
last_values_df = filtered_df.groupby(by=join_key_columns).last()
235-
236-
# make driver_id a normal column again
237-
last_values_df.reset_index(inplace=True)
238+
last_values_df = filtered_df.drop_duplicates(
239+
join_key_columns, keep="last", ignore_index=True
240+
)
238241

239242
columns_to_extract = set(join_key_columns + feature_name_columns + ts_columns)
240243
table = pyarrow.Table.from_pandas(last_values_df[columns_to_extract])

0 commit comments

Comments
 (0)