diff --git a/.librarian/state.yaml b/.librarian/state.yaml index ebc26d98adf..0344252107e 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:160860d189ff1c2f7515638478823712fa5b243e27ccc33a2728669fa1e2ed0c libraries: - id: bigframes - version: 2.38.0 + version: 2.39.0 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fc39e48030..ab25756d9d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,30 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.39.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.38.0...v2.39.0) (2026-03-31) + + +### Documentation + +* Rename Blob column references to ObjectRef column (#2535) ([44e0ffd947e9db66ab612f92de6e31f1085e7968](https://github.com/googleapis/python-bigquery-dataframes/commit/44e0ffd947e9db66ab612f92de6e31f1085e7968)) +* gemini retouch of the index page for seo (#2514) ([2e5311e2242b039da4c8e37b7b48942fa8ed34c2](https://github.com/googleapis/python-bigquery-dataframes/commit/2e5311e2242b039da4c8e37b7b48942fa8ed34c2)) + + +### Features + +* expose DataFrame.bigquery in both pandas and bigframes DataFrames (#2533) ([69fe317612a69aa92f06f0c418c67aa1f9488bd2](https://github.com/googleapis/python-bigquery-dataframes/commit/69fe317612a69aa92f06f0c418c67aa1f9488bd2)) +* support full round-trip persistence for multimodal reference cols (#2511) ([494a0a113b1ba6dcdc9f9b85a4f750d093f5652f](https://github.com/googleapis/python-bigquery-dataframes/commit/494a0a113b1ba6dcdc9f9b85a4f750d093f5652f)) +* add `df.bigquery.ai.forecast` method to pandas dataframe accessor (#2518) ([1126cec9cdfcc1ec1062c60e5affbe1b60223767](https://github.com/googleapis/python-bigquery-dataframes/commit/1126cec9cdfcc1ec1062c60e5affbe1b60223767)) + + +### Bug Fixes + +* handle aggregate operations on empty selections (#2510) 
([34fb5daa93726d0d3ff364912a3c1de0fc535fb2](https://github.com/googleapis/python-bigquery-dataframes/commit/34fb5daa93726d0d3ff364912a3c1de0fc535fb2)) +* Localize BigQuery log suppression for gbq.py (#2541) ([af49ca29399aa2c63753d9045fd382e30334d134](https://github.com/googleapis/python-bigquery-dataframes/commit/af49ca29399aa2c63753d9045fd382e30334d134)) +* to_gbq may swap data columns when replace table (#2532) ([17ecc65e1c0397ef349fca4afcf5a77af72aa798](https://github.com/googleapis/python-bigquery-dataframes/commit/17ecc65e1c0397ef349fca4afcf5a77af72aa798)) +* Respect remote function config changes even if logic unchanged (#2512) ([b9524284ad3b457b15598f546bac04c76b3e27b8](https://github.com/googleapis/python-bigquery-dataframes/commit/b9524284ad3b457b15598f546bac04c76b3e27b8)) +* support melting empty DataFrames without crashing (#2509) ([e8c46032154e186042314d97aa813301413d8a13](https://github.com/googleapis/python-bigquery-dataframes/commit/e8c46032154e186042314d97aa813301413d8a13)) + ## [2.38.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.37.0...v2.38.0) (2026-03-16) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 14e0f315dcc..f0838870458 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -12,9 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""This module integrates BigQuery built-in functions for use with DataFrame objects, -such as array functions: -https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ +""" +Access BigQuery-specific operations and namespaces within BigQuery DataFrames. + +This module provides specialized functions and sub-modules that expose BigQuery's +advanced capabilities to DataFrames and Series. It acts as a bridge between the +pandas-compatible API and the full power of BigQuery SQL. 
+ +Key sub-modules include: + +* :mod:`bigframes.bigquery.ai`: Generative and predictive AI functions (Gemini, BQML). +* :mod:`bigframes.bigquery.ml`: Direct access to BigQuery ML model operations. +* :mod:`bigframes.bigquery.obj`: Support for BigQuery object tables. + +This module also provides direct access to optimized BigQuery functions for: + +* **JSON Processing:** High-performance functions like ``json_extract``, ``json_value``, + and ``parse_json`` for handling semi-structured data. +* **Geospatial Analysis:** Comprehensive geographic functions such as ``st_area``, + ``st_distance``, and ``st_centroid`` (``ST_`` prefixed functions). +* **Array Operations:** Tools for working with BigQuery arrays, including ``array_agg`` + and ``array_length``. +* **Vector Search:** Integration with BigQuery's vector search and indexing + capabilities for high-dimensional data. +* **Custom SQL:** The ``sql_scalar`` function allows embedding raw SQL snippets for + advanced operations not yet directly mapped in the API. + +By using these functions, you can leverage BigQuery's high-performance engine for +domain-specific tasks while maintaining a Python-centric development experience. + +For the full list of BigQuery standard SQL functions, see: +https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-reference +""" import sys diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index e578f4be4a7..bb73d02609b 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -880,6 +880,7 @@ def forecast( id_cols: Iterable[str] | None = None, horizon: int = 10, confidence_level: float = 0.95, + output_historical_time_series: bool = False, context_window: int | None = None, ) -> dataframe.DataFrame: """ @@ -892,6 +893,25 @@ def forecast( and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). 
+ **Examples:** + + Forecast using a pandas DataFrame: + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> df = pd.DataFrame({"value": [1, 2, 3], "time": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"])}) + >>> bpd.options.display.progress_bar = None # doctest: +SKIP + >>> forecasted_pandas_df = df.bigquery.ai.forecast(data_col="value", timestamp_col="time", horizon=2) # doctest: +SKIP + >>> type(forecasted_pandas_df) # doctest: +SKIP + + + Forecast using a BigFrames DataFrame: + + >>> bf_df = bpd.DataFrame({"value": [1, 2, 3], "time": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"])}) + >>> forecasted_bf_df = bf_df.bigquery.ai.forecast(data_col="value", timestamp_col="time", horizon=2) # doctest: +SKIP + >>> type(forecasted_bf_df) # doctest: +SKIP + + Args: df (DataFrame): The dataframe that contains the data that you want to forecast. It could be either a BigFrames Dataframe or @@ -914,6 +934,15 @@ def forecast( confidence_level (float, default 0.95): A FLOAT64 value that specifies the percentage of the future values that fall in the prediction interval. The default value is 0.95. The valid input range is [0, 1). + output_historical_time_series (bool, default False): + A BOOL value that determines whether the input data is returned + along with the forecasted data. Set this argument to TRUE to return + input data. The default value is FALSE. + + Returning the input data along with the forecasted data lets you + compare the historical value of the data column with the forecasted + value of the data column, or chart the change in the data column + values over time. context_window (int, optional): An int value that specifies the context window length used by BigQuery ML's built-in TimesFM model. The context window length determines how many of the most recent data points from the input time series are use by the model. 
@@ -945,6 +974,7 @@ def forecast( "timestamp_col": timestamp_col, "model": model, "horizon": horizon, + "output_historical_time_series": output_historical_time_series, "confidence_level": confidence_level, } if id_cols: diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index b65dfd2d16e..649c7364dd8 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -71,6 +71,25 @@ def sql_scalar( 2 4.000000000 dtype: decimal128(38, 9)[pyarrow] + You can also use the `.bigquery` DataFrame accessor to apply a SQL scalar function. + + Compute SQL scalar using a pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame({"x": [1, 2, 3]}) + >>> bpd.options.display.progress_bar = None # doctest: +SKIP + >>> pandas_s = df.bigquery.sql_scalar("POW({0}, 2)") # doctest: +SKIP + >>> type(pandas_s) # doctest: +SKIP + + + Compute SQL scalar using a BigFrames DataFrame: + + >>> bf_df = bpd.DataFrame({"x": [1, 2, 3]}) + >>> bf_s = bf_df.bigquery.sql_scalar("POW({0}, 2)") # doctest: +SKIP + >>> type(bf_s) # doctest: +SKIP + + + Args: sql_template (str): A SQL format string with Python-style {0} placeholders for each of diff --git a/bigframes/bigquery/_operations/table.py b/bigframes/bigquery/_operations/table.py index c90f88dcd6f..cad025412d5 100644 --- a/bigframes/bigquery/_operations/table.py +++ b/bigframes/bigquery/_operations/table.py @@ -19,8 +19,8 @@ import google.cloud.bigquery import pandas as pd +import bigframes.core.compile.sqlglot.sql as sg_sql import bigframes.core.logging.log_adapter as log_adapter -import bigframes.core.sql.table import bigframes.session @@ -80,14 +80,16 @@ def create_external_table( """ import bigframes.pandas as bpd - sql = bigframes.core.sql.table.create_external_table_ddl( - table_name=table_name, - replace=replace, - if_not_exists=if_not_exists, - columns=columns, - partition_columns=partition_columns, - connection_name=connection_name, - options=options, + sql 
= sg_sql.to_sql( + sg_sql.create_external_table( + table_name=table_name, + replace=replace, + if_not_exists=if_not_exists, + columns=columns, + partition_columns=partition_columns, + connection_name=connection_name, + options=options, + ) ) if session is None: diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index bb24d5dc33f..25a7df77812 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -12,9 +12,49 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""This module integrates BigQuery built-in AI functions for use with Series/DataFrame objects, -such as AI.GENERATE_BOOL: -https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool""" +""" +Integrate BigQuery built-in AI functions into your BigQuery DataFrames workflow. + +The ``bigframes.bigquery.ai`` module provides a Pythonic interface to leverage BigQuery ML's +generative AI and predictive functions directly on BigQuery DataFrames and Series objects. +These functions enable you to perform advanced AI tasks at scale without moving data +out of BigQuery. + +Key capabilities include: + +* **Generative AI:** Use :func:`bigframes.bigquery.ai.generate` (Gemini) to + perform text analysis, translation, or + content generation. Specialized versions like + :func:`~bigframes.bigquery.ai.generate_bool`, + :func:`~bigframes.bigquery.ai.generate_int`, and + :func:`~bigframes.bigquery.ai.generate_double` are available for structured + outputs. +* **Embeddings:** Generate vector embeddings for text using + :func:`~bigframes.bigquery.ai.generate_embedding`, which are essential for + semantic search and retrieval-augmented generation (RAG) workflows. +* **Classification and Scoring:** Apply machine learning models to your data for + predictive tasks with :func:`~bigframes.bigquery.ai.classify` and + :func:`~bigframes.bigquery.ai.score`. 
+* **Forecasting:** Predict future values in time-series data using + :func:`~bigframes.bigquery.ai.forecast`. + +**Example usage:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + + >>> df = bpd.DataFrame({ + ... "text_input": [ + ... "Is this a positive review? The food was terrible.", + ... ], + ... }) # doctest: +SKIP + + >>> # Assuming a Gemini model has been created in BigQuery as 'my_gemini_model' + >>> result = bq.ai.generate_text("my_gemini_model", df["text_input"]) # doctest: +SKIP + +For more information on the underlying BigQuery ML syntax, see: +https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool +""" from bigframes.bigquery._operations.ai import ( classify, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 239eedf6d3c..a15c83e82e4 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1822,9 +1822,9 @@ def melt( Arguments correspond to pandas.melt arguments. """ # TODO: Implement col_level and ignore_index - value_labels: pd.Index = pd.Index( - [self.col_id_to_label[col_id] for col_id in value_vars] - ) + value_labels: pd.Index = self.column_labels[ + [self.value_columns.index(col_id) for col_id in value_vars] + ] id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] unpivot_expr, (var_col_ids, unpivot_out, passthrough_cols) = unpivot( @@ -3417,6 +3417,7 @@ def unpivot( joined_array, (labels_mapping, column_mapping) = labels_array.relational_join( array_value, type="cross" ) + new_passthrough_cols = [column_mapping[col] for col in passthrough_columns] # Last column is offsets index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]] @@ -3426,20 +3427,24 @@ def unpivot( unpivot_exprs: List[ex.Expression] = [] # Supports producing multiple stacked ouput columns for stacking only part of hierarchical index for input_ids in unpivot_columns: - # row explode offset used to choose the input column - # we use 
offset instead of label as labels are not necessarily unique - cases = itertools.chain( - *( - ( - ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), - ex.deref(column_mapping[id_or_null]) - if (id_or_null is not None) - else ex.const(None), + col_expr: ex.Expression + if not input_ids: + col_expr = ex.const(None, dtype=bigframes.dtypes.INT_DTYPE) + else: + # row explode offset used to choose the input column + # we use offset instead of label as labels are not necessarily unique + cases = itertools.chain( + *( + ( + ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), + ex.deref(column_mapping[id_or_null]) + if (id_or_null is not None) + else ex.const(None), + ) + for i, id_or_null in enumerate(input_ids) ) - for i, id_or_null in enumerate(input_ids) ) - ) - col_expr = ops.case_when_op.as_expr(*cases) + col_expr = ops.case_when_op.as_expr(*cases) unpivot_exprs.append(col_expr) joined_array, unpivot_col_ids = joined_array.compute_values(unpivot_exprs) @@ -3457,19 +3462,43 @@ def _pd_index_to_array_value( Create an ArrayValue from a list of label tuples. The last column will be row offsets. 
""" + id_gen = bigframes.core.identifiers.standard_id_strings() + col_ids = [next(id_gen) for _ in range(index.nlevels)] + offset_id = next(id_gen) + rows = [] labels_as_tuples = utils.index_as_tuples(index) for row_offset in range(len(index)): - id_gen = bigframes.core.identifiers.standard_id_strings() row_label = labels_as_tuples[row_offset] row_label = (row_label,) if not isinstance(row_label, tuple) else row_label row = {} - for label_part, id in zip(row_label, id_gen): - row[id] = label_part if pd.notnull(label_part) else None - row[next(id_gen)] = row_offset + for label_part, col_id in zip(row_label, col_ids): + row[col_id] = label_part if pd.notnull(label_part) else None + row[offset_id] = row_offset rows.append(row) - return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session) + if not rows: + dtypes_list = getattr(index, "dtypes", None) + if dtypes_list is None: + dtypes_list = ( + [index.dtype] if hasattr(index, "dtype") else [pd.Float64Dtype()] + ) + + fields = [] + for col_id, dtype in zip(col_ids, dtypes_list): + try: + pa_type = bigframes.dtypes.bigframes_dtype_to_arrow_dtype(dtype) + except Exception: + pa_type = pa.string() + fields.append(pa.field(col_id, pa_type)) + fields.append(pa.field(offset_id, pa.int64())) + schema = pa.schema(fields) + pt = pa.Table.from_pylist([], schema=schema) + else: + pt = pa.Table.from_pylist(rows) + pt = pt.rename_columns([*col_ids, offset_id]) + + return core.ArrayValue.from_pyarrow(pt, session=session) def _resolve_index_col( diff --git a/bigframes/core/compile/ibis_compiler/ibis_compiler.py b/bigframes/core/compile/ibis_compiler/ibis_compiler.py index 8d40a9eb740..3802a57e02d 100644 --- a/bigframes/core/compile/ibis_compiler/ibis_compiler.py +++ b/bigframes/core/compile/ibis_compiler/ibis_compiler.py @@ -88,6 +88,7 @@ def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrites.rewrite_slice) node = nodes.bottom_up(node, rewrites.rewrite_timedelta_expressions) 
node = nodes.bottom_up(node, rewrites.rewrite_range_rolling) + node = nodes.bottom_up(node, rewrites.lower_udfs) return node diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index dd275874332..6a697a86579 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -663,7 +663,7 @@ def datetime_to_integer_label_non_fixed_frequency( .else_((x_int - first - 1) // us + 1) # type: ignore .end() ) - elif rule_code == "ME": # Monthly + elif rule_code in ("M", "ME"): # Monthly x_int = x.year() * 12 + x.month() - 1 # type: ignore first = y.year() * 12 + y.month() - 1 # type: ignore x_int_label = ( @@ -672,7 +672,7 @@ def datetime_to_integer_label_non_fixed_frequency( .else_((x_int - first - 1) // n + 1) # type: ignore .end() ) - elif rule_code == "QE-DEC": # Quarterly + elif rule_code in ("Q-DEC", "QE-DEC"): # Quarterly x_int = x.year() * 4 + x.quarter() - 1 # type: ignore first = y.year() * 4 + y.quarter() - 1 # type: ignore x_int_label = ( @@ -681,7 +681,7 @@ def datetime_to_integer_label_non_fixed_frequency( .else_((x_int - first - 1) // n + 1) # type: ignore .end() ) - elif rule_code == "YE-DEC": # Yearly + elif rule_code in ("A-DEC", "Y-DEC", "YE-DEC"): # Yearly x_int = x.year() # type: ignore first = y.year() # type: ignore x_int_label = ( @@ -749,7 +749,7 @@ def integer_label_to_datetime_op_non_fixed_frequency( .cast(ibis_dtypes.Timestamp(timezone="UTC")) .cast(y.type()) ) - elif rule_code == "ME": # Monthly + elif rule_code in ("M", "ME"): # Monthly one = ibis_types.literal(1) twelve = ibis_types.literal(12) first = y.year() * twelve + y.month() - one # type: ignore @@ -769,7 +769,7 @@ def integer_label_to_datetime_op_non_fixed_frequency( 0, ) x_label = next_month_date - ibis_api.interval(days=1) - elif rule_code == "QE-DEC": # Quarterly + elif rule_code in ("Q-DEC", "QE-DEC"): # Quarterly one = 
ibis_types.literal(1) three = ibis_types.literal(3) four = ibis_types.literal(4) @@ -792,7 +792,7 @@ def integer_label_to_datetime_op_non_fixed_frequency( ) x_label = next_month_date - ibis_api.interval(days=1) - elif rule_code == "YE-DEC": # Yearly + elif rule_code in ("A-DEC", "Y-DEC", "YE-DEC"): # Yearly one = ibis_types.literal(1) first = y.year() # type: ignore x = x * n + first # type: ignore @@ -1037,7 +1037,8 @@ def timedelta_floor_op_impl(x: ibis_types.NumericValue): @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + assert not udf_sig.is_virtual # should have been devirtualized in lowering pass + ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) @ibis_udf.scalar.builtin( name=str(op.function_def.routine_ref), signature=ibis_py_sig @@ -1056,7 +1057,8 @@ def binary_remote_function_op_impl( x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp ): udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + assert not udf_sig.is_virtual # should have been devirtualized in lowering pass + ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) @ibis_udf.scalar.builtin( name=str(op.function_def.routine_ref), signature=ibis_py_sig @@ -1073,8 +1075,9 @@ def nary_remote_function_op_impl( *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp ): udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) - arg_names = tuple(arg.name for arg in udf_sig.input_types) + assert not udf_sig.is_virtual # should have been devirtualized in lowering pass + ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) + arg_names = tuple(arg.name for arg in udf_sig.inputs) 
@ibis_udf.scalar.builtin( name=str(op.function_def.routine_ref), @@ -1153,6 +1156,13 @@ def array_reduce_op_impl(x: ibis_types.Value, op: ops.ArrayReduceOp): ) +@scalar_op_compiler.register_unary_op(ops.ArrayMapOp, pass_op=True) +def array_map_op_impl(x: ibis_types.Value, op: ops.ArrayMapOp): + return typing.cast(ibis_types.ArrayValue, x).map( + lambda arr_vals: scalar_op_compiler.compile_row_op(op.map_op, (arr_vals,)) + ) + + # JSON Ops @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): diff --git a/bigframes/core/compile/sqlglot/aggregate_compiler.py b/bigframes/core/compile/sqlglot/aggregate_compiler.py index f86e2af0dee..9f72e1c7943 100644 --- a/bigframes/core/compile/sqlglot/aggregate_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregate_compiler.py @@ -70,7 +70,5 @@ def compile_analytic( aggregate.arg.output_type, ) return unary_compiler.compile(aggregate.op, column, window) - elif isinstance(aggregate, agg_expressions.BinaryAggregation): - raise NotImplementedError("binary analytic operations not yet supported") else: raise ValueError(f"Unexpected analytic operation: {aggregate}") diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index a86a192a9e1..ce9ed6ce377 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -369,4 +369,5 @@ def compile_aggregate( def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) node = nodes.bottom_up(node, rewrite.rewrite_range_rolling) + node = nodes.bottom_up(node, rewrite.lower_udfs) return node diff --git a/bigframes/core/compile/sqlglot/expressions/array_ops.py b/bigframes/core/compile/sqlglot/expressions/array_ops.py index eb7582cb168..0ae5f3e846d 100644 --- a/bigframes/core/compile/sqlglot/expressions/array_ops.py +++ 
b/bigframes/core/compile/sqlglot/expressions/array_ops.py @@ -73,6 +73,28 @@ def _(expr: TypedExpr, op: ops.ArrayReduceOp) -> sge.Expression: ) +@register_unary_op(ops.ArrayMapOp, pass_op=True) +def _(expr: TypedExpr, op: ops.ArrayMapOp) -> sge.Expression: + sub_expr = sg.to_identifier("bf_arr_map_uid") + sub_type = dtypes.get_array_inner_type(expr.dtype) + + # TODO: Expression should be provided instead of invoking compiler manually + map_expr = expression_compiler.expression_compiler.compile_row_op( + op.map_op, (TypedExpr(sub_expr, sub_type),) + ) + + return sge.array( + sge.select(map_expr) + .from_( + sge.Unnest( + expressions=[expr.expr], + alias=sge.TableAlias(columns=[sub_expr]), + ) + ) + .subquery() + ) + + @register_unary_op(ops.ArraySliceOp, pass_op=True) def _(expr: TypedExpr, op: ops.ArraySliceOp) -> sge.Expression: if expr.dtype == dtypes.STRING_DTYPE: @@ -105,31 +127,6 @@ def _coerce_bool_to_int(typed_expr: TypedExpr) -> sge.Expression: return typed_expr.expr -def _string_slice(expr: TypedExpr, op: ops.ArraySliceOp) -> sge.Expression: - # local name for each element in the array - el = sg.to_identifier("el") - # local name for the index in the array - slice_idx = sg.to_identifier("slice_idx") - - conditions: typing.List[sge.Predicate] = [slice_idx >= op.start] - if op.stop is not None: - conditions.append(slice_idx < op.stop) - - selected_elements = ( - sge.select(el) - .from_( - sge.Unnest( - expressions=[expr.expr], - alias=sge.TableAlias(columns=[el]), - offset=slice_idx, - ) - ) - .where(*conditions) - ) - - return sge.array(selected_elements) - - def _array_slice(expr: TypedExpr, op: ops.ArraySliceOp) -> sge.Expression: # local name for each element in the array el = sg.to_identifier("el") diff --git a/bigframes/core/compile/sqlglot/expressions/bool_ops.py b/bigframes/core/compile/sqlglot/expressions/bool_ops.py index cd7f9da4084..3b4ecf54310 100644 --- a/bigframes/core/compile/sqlglot/expressions/bool_ops.py +++ 
b/bigframes/core/compile/sqlglot/expressions/bool_ops.py @@ -18,6 +18,7 @@ from bigframes import dtypes from bigframes import operations as ops +from bigframes.core.compile.sqlglot import sql import bigframes.core.compile.sqlglot.expression_compiler as expression_compiler from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr @@ -29,10 +30,10 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: # For AND, when we encounter a NULL value, we only know when the result is FALSE, # otherwise the result is unknown (NULL). See: truth table at # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR - if left.expr == sge.null(): + if sql.is_null_literal(left.expr): condition = sge.EQ(this=right.expr, expression=sge.convert(False)) return sge.If(this=condition, true=right.expr, false=sge.null()) - if right.expr == sge.null(): + if sql.is_null_literal(right.expr): condition = sge.EQ(this=left.expr, expression=sge.convert(False)) return sge.If(this=condition, true=left.expr, false=sge.null()) @@ -46,10 +47,10 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: # For OR, when we encounter a NULL value, we only know when the result is TRUE, # otherwise the result is unknown (NULL). See: truth table at # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR - if left.expr == sge.null(): + if sql.is_null_literal(left.expr): condition = sge.EQ(this=right.expr, expression=sge.convert(True)) return sge.If(this=condition, true=right.expr, false=sge.null()) - if right.expr == sge.null(): + if sql.is_null_literal(right.expr): condition = sge.EQ(this=left.expr, expression=sge.convert(True)) return sge.If(this=condition, true=left.expr, false=sge.null()) @@ -64,12 +65,12 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: # maintains the boolean data type. 
left_expr = left.expr left_dtype = left.dtype - if left_expr == sge.null(): + if sql.is_null_literal(left_expr): left_expr = sge.Cast(this=sge.convert(None), to="BOOLEAN") left_dtype = dtypes.BOOL_DTYPE right_expr = right.expr right_dtype = right.dtype - if right_expr == sge.null(): + if sql.is_null_literal(right_expr): right_expr = sge.Cast(this=sge.convert(None), to="BOOLEAN") right_dtype = dtypes.BOOL_DTYPE diff --git a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py index 7177f9de84b..82c264da505 100644 --- a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py @@ -102,7 +102,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.ge_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -112,7 +112,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.gt_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -122,7 +122,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.lt_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -132,7 +132,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.le_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or 
right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) diff --git a/bigframes/core/compile/sqlglot/expressions/datetime_ops.py b/bigframes/core/compile/sqlglot/expressions/datetime_ops.py index 21f8b39e7d6..4e0a75e6994 100644 --- a/bigframes/core/compile/sqlglot/expressions/datetime_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/datetime_ops.py @@ -125,7 +125,7 @@ def _datetime_to_integer_label_non_fixed_frequency( expression=sge.convert(1), ), ) - elif rule_code == "ME": # Monthly + elif rule_code in ("M", "ME"): # Monthly x_int = sge.Paren( # type: ignore this=sge.Add( this=sge.Mul( @@ -182,7 +182,7 @@ def _datetime_to_integer_label_non_fixed_frequency( expression=sge.convert(1), ), ) - elif rule_code == "QE-DEC": # Quarterly + elif rule_code in ("Q-DEC", "QE-DEC"): # Quarterly x_int = sge.Paren( # type: ignore this=sge.Add( this=sge.Mul( @@ -239,7 +239,7 @@ def _datetime_to_integer_label_non_fixed_frequency( expression=sge.convert(1), ), ) - elif rule_code == "YE-DEC": # Yearly + elif rule_code in ("A-DEC", "Y-DEC", "YE-DEC"): # Yearly x_int = sge.Extract(this=sge.Identifier(this="YEAR"), expression=x.expr) first = sge.Extract(this=sge.Identifier(this="YEAR"), expression=y.expr) return sge.Case( diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index d70ec2ef3f9..c5fdbe3c84a 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -19,6 +19,7 @@ from bigframes import dtypes from bigframes import operations as ops +from bigframes.core.compile.sqlglot import sql import bigframes.core.compile.sqlglot.expression_compiler as expression_compiler from bigframes.core.compile.sqlglot.expressions.common import round_towards_zero import bigframes.core.compile.sqlglot.expressions.constants as constants @@ 
-260,6 +261,9 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: def _int_pow_op( left_expr: sge.Expression, right_expr: sge.Expression ) -> sge.Expression: + if sql.is_null_literal(left_expr) or sql.is_null_literal(right_expr): + return sge.null() + overflow_cond = sge.and_( sge.NEQ(this=left_expr, expression=sge.convert(0)), sge.GT( @@ -292,6 +296,9 @@ def _int_pow_op( def _float_pow_op( left_expr: sge.Expression, right_expr: sge.Expression ) -> sge.Expression: + if sql.is_null_literal(left_expr) or sql.is_null_literal(right_expr): + return sge.null() + # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors. # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow overflow_cond = sge.and_( @@ -425,7 +432,7 @@ def _(expr: TypedExpr) -> sge.Expression: @register_binary_op(ops.add_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: @@ -463,6 +470,9 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.div_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): + return sge.null() + left_expr = _coerce_bool_to_int(left) right_expr = _coerce_bool_to_int(right) @@ -482,7 +492,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.floordiv_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -525,6 +535,9 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.mod_op) def _(left: 
TypedExpr, right: TypedExpr) -> sge.Expression: + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): + return sge.null() + # In BigQuery returned value has the same sign as X. In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) left_expr = _coerce_bool_to_int(left) right_expr = _coerce_bool_to_int(right) @@ -568,7 +581,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.mul_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -594,7 +607,7 @@ def _(expr: TypedExpr, n_digits: TypedExpr) -> sge.Expression: @register_binary_op(ops.sub_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): diff --git a/bigframes/core/compile/sqlglot/expressions/string_ops.py b/bigframes/core/compile/sqlglot/expressions/string_ops.py index 3bfec04b3e0..f8938b1486a 100644 --- a/bigframes/core/compile/sqlglot/expressions/string_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/string_ops.py @@ -366,10 +366,16 @@ def string_slice( column_length + sge.convert(start + 1), ] ) - length_expr = sge.convert(op_end) - sge.Greatest( + length_expr = sge.Greatest( expressions=[ sge.convert(0), - column_length + sge.convert(start), + sge.convert(op_end) + - sge.Greatest( + expressions=[ + sge.convert(0), + column_length + sge.convert(start), + ] + ), ] ) else: diff --git a/bigframes/core/compile/sqlglot/sql/__init__.py b/bigframes/core/compile/sqlglot/sql/__init__.py index 17c78ba379a..751c3cfc3a5 100644 --- a/bigframes/core/compile/sqlglot/sql/__init__.py +++ 
b/bigframes/core/compile/sqlglot/sql/__init__.py @@ -15,26 +15,25 @@ from bigframes.core.compile.sqlglot.sql.base import ( cast, - escape_chars, identifier, is_null_literal, literal, table, to_sql, ) -from bigframes.core.compile.sqlglot.sql.ddl import load_data +from bigframes.core.compile.sqlglot.sql.ddl import create_external_table, load_data from bigframes.core.compile.sqlglot.sql.dml import insert, replace __all__ = [ # From base.py "cast", - "escape_chars", "identifier", "is_null_literal", "literal", "table", "to_sql", # From ddl.py + "create_external_table", "load_data", # From dml.py "insert", diff --git a/bigframes/core/compile/sqlglot/sql/base.py b/bigframes/core/compile/sqlglot/sql/base.py index 6e888fdf5e8..d287b2cac9d 100644 --- a/bigframes/core/compile/sqlglot/sql/base.py +++ b/bigframes/core/compile/sqlglot/sql/base.py @@ -136,29 +136,6 @@ def table(table: bigquery.TableReference) -> sge.Table: ) -def escape_chars(value: str): - """Escapes all special characters""" - # TODO: Reuse literal's escaping logic instead of re-implementing it here. 
- # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals - trans_table = str.maketrans( - { - "\a": r"\a", - "\b": r"\b", - "\f": r"\f", - "\n": r"\n", - "\r": r"\r", - "\t": r"\t", - "\v": r"\v", - "\\": r"\\", - "?": r"\?", - '"': r"\"", - "'": r"\'", - "`": r"\`", - } - ) - return value.translate(trans_table) - - def is_null_literal(expr: sge.Expression) -> bool: """Checks if the given expression is a NULL literal.""" if isinstance(expr, sge.Null): diff --git a/bigframes/core/compile/sqlglot/sql/ddl.py b/bigframes/core/compile/sqlglot/sql/ddl.py index 911c63781b0..5134368b89e 100644 --- a/bigframes/core/compile/sqlglot/sql/ddl.py +++ b/bigframes/core/compile/sqlglot/sql/ddl.py @@ -22,51 +22,6 @@ from bigframes.core.compile.sqlglot.sql import base -def _loaddata_sql(self: sg.Generator, expression: sge.LoadData) -> str: - out = ["LOAD DATA"] - if expression.args.get("overwrite"): - out.append("OVERWRITE") - - out.append(f"INTO {self.sql(expression, 'this').strip()}") - - # We ignore inpath as it's just a dummy to satisfy sqlglot requirements - # but BigQuery uses FROM FILES instead. 
- - columns = self.sql(expression, "columns").strip() - if columns: - out.append(columns) - - partition_by = self.sql(expression, "partition_by").strip() - if partition_by: - out.append(partition_by) - - cluster_by = self.sql(expression, "cluster_by").strip() - if cluster_by: - out.append(cluster_by) - - options = self.sql(expression, "options").strip() - if options: - out.append(options) - - from_files = self.sql(expression, "from_files").strip() - if from_files: - out.append(f"FROM FILES {from_files}") - - with_partition_columns = self.sql(expression, "with_partition_columns").strip() - if with_partition_columns: - out.append(f"WITH PARTITION COLUMNS {with_partition_columns}") - - connection = self.sql(expression, "connection").strip() - if connection: - out.append(f"WITH CONNECTION {connection}") - - return " ".join(out) - - -# Register the transform for BigQuery generator -sg.dialects.bigquery.BigQuery.Generator.TRANSFORMS[sge.LoadData] = _loaddata_sql - - def load_data( table_name: str, *, @@ -84,21 +39,6 @@ def load_data( # Quoting is handled by the dialect. 
table_expr = sge.Table(this=base.identifier(table_name)) - sge_columns = ( - sge.Schema( - this=None, - expressions=[ - sge.ColumnDef( - this=base.identifier(name), - kind=sge.DataType.build(typ, dialect="bigquery"), - ) - for name, typ in columns.items() - ], - ) - if columns - else None - ) - sge_partition_by = ( sge.PartitionedByProperty( this=base.identifier(partition_by[0]) @@ -115,17 +55,6 @@ def load_data( else None ) - sge_table_options = ( - sge.Properties( - expressions=[ - sge.Property(this=base.identifier(k), value=base.literal(v)) - for k, v in table_options.items() - ] - ) - if table_options - else None - ) - sge_from_files = sge.Tuple( expressions=[ sge.Property(this=base.identifier(k), value=base.literal(v)) @@ -133,32 +62,159 @@ def load_data( ] ) - sge_with_partition_columns = ( - sge.Schema( - this=None, - expressions=[ - sge.ColumnDef( - this=base.identifier(name), - kind=sge.DataType.build(typ, dialect="bigquery"), - ) - for name, typ in with_partition_columns.items() - ], - ) - if with_partition_columns - else None - ) - sge_connection = base.identifier(connection_name) if connection_name else None return sge.LoadData( this=table_expr, overwrite=(write_disposition == "OVERWRITE"), inpath=sge.convert("fake"), # satisfy sqlglot's required inpath arg - columns=sge_columns, + columns=_get_sge_schema(columns), partition_by=sge_partition_by, cluster_by=sge_cluster_by, - options=sge_table_options, + options=_get_sge_properties(table_options), from_files=sge_from_files, - with_partition_columns=sge_with_partition_columns, + with_partition_columns=_get_sge_schema(with_partition_columns), + connection=sge_connection, + ) + + +def create_external_table( + table_name: str, + *, + replace: bool = False, + if_not_exists: bool = False, + columns: Optional[Mapping[str, str]] = None, + partition_columns: Optional[Mapping[str, str]] = None, + connection_name: Optional[str] = None, + options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None, +) 
-> sge.Create: + """Generates the CREATE EXTERNAL TABLE DDL statement.""" + sge_connection = base.identifier(connection_name) if connection_name else None + + table_expr = sge.Table(this=base.identifier(table_name)) + + # sqlglot.expressions.Create usually takes 'this' (Table or Schema) + sge_schema = _get_sge_schema(columns) + this: sge.Table | sge.Schema + if sge_schema: + sge_schema.set("this", table_expr) + this = sge_schema + else: + this = table_expr + + return sge.Create( + this=this, + kind="EXTERNAL TABLE", + replace=replace, + exists_ok=if_not_exists, + properties=_get_sge_properties(options), connection=sge_connection, + partition_columns=_get_sge_schema(partition_columns), ) + + +def _get_sge_schema( + columns: Optional[Mapping[str, str]] = None +) -> Optional[sge.Schema]: + if not columns: + return None + + return sge.Schema( + this=None, + expressions=[ + sge.ColumnDef( + this=base.identifier(name), + kind=sge.DataType.build(typ, dialect=base.DIALECT), + ) + for name, typ in columns.items() + ], + ) + + +def _get_sge_properties( + options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None +) -> Optional[sge.Properties]: + if not options: + return None + + return sge.Properties( + expressions=[ + sge.Property(this=base.identifier(k), value=base.literal(v)) + for k, v in options.items() + ] + ) + + +def _loaddata_sql(self: sg.Generator, expression: sge.LoadData) -> str: + out = ["LOAD DATA"] + if expression.args.get("overwrite"): + out.append("OVERWRITE") + + out.append(f"INTO {self.sql(expression, 'this').strip()}") + + # We ignore inpath as it's just a dummy to satisfy sqlglot requirements + # but BigQuery uses FROM FILES instead. 
+ + columns = self.sql(expression, "columns").strip() + if columns: + out.append(columns) + + partition_by = self.sql(expression, "partition_by").strip() + if partition_by: + out.append(partition_by) + + cluster_by = self.sql(expression, "cluster_by").strip() + if cluster_by: + out.append(cluster_by) + + options = self.sql(expression, "options").strip() + if options: + out.append(options) + + from_files = self.sql(expression, "from_files").strip() + if from_files: + out.append(f"FROM FILES {from_files}") + + with_partition_columns = self.sql(expression, "with_partition_columns").strip() + if with_partition_columns: + out.append(f"WITH PARTITION COLUMNS {with_partition_columns}") + + connection = self.sql(expression, "connection").strip() + if connection: + out.append(f"WITH CONNECTION {connection}") + + return " ".join(out) + + +def _create_sql(self: sg.Generator, expression: sge.Create) -> str: + kind = expression.args.get("kind") + if kind != "EXTERNAL TABLE": + return self.create_sql(expression) + + out = ["CREATE"] + if expression.args.get("replace"): + out.append("OR REPLACE") + out.append("EXTERNAL TABLE") + if expression.args.get("exists_ok"): + out.append("IF NOT EXISTS") + + out.append(self.sql(expression, "this")) + + connection = self.sql(expression, "connection").strip() + if connection: + out.append(f"WITH CONNECTION {connection}") + + partition_columns = self.sql(expression, "partition_columns").strip() + if partition_columns: + out.append(f"WITH PARTITION COLUMNS {partition_columns}") + + properties = self.sql(expression, "properties").strip() + if properties: + out.append(properties) + + return " ".join(out) + + +# Register the transform for BigQuery generator +base.DIALECT.Generator.TRANSFORMS[sge.LoadData] = _loaddata_sql +base.DIALECT.Generator.TRANSFORMS[sge.Create] = _create_sql diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index 5279418f5fb..6b00e9b2f12 100644 --- a/bigframes/core/rewrite/__init__.py +++ 
b/bigframes/core/rewrite/__init__.py @@ -27,6 +27,7 @@ from bigframes.core.rewrite.select_pullup import defer_selection from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions +from bigframes.core.rewrite.udfs import lower_udfs from bigframes.core.rewrite.windows import ( pull_out_window_order, rewrite_range_rolling, @@ -53,4 +54,5 @@ "pull_out_window_order", "defer_selection", "simplify_complex_windows", + "lower_udfs", ] diff --git a/bigframes/core/rewrite/udfs.py b/bigframes/core/rewrite/udfs.py new file mode 100644 index 00000000000..f9aa330247b --- /dev/null +++ b/bigframes/core/rewrite/udfs.py @@ -0,0 +1,87 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import dataclasses + +from bigframes.core import bigframe_node, expression +from bigframes.core.rewrite import op_lowering +import bigframes.functions.udf_def as udf_def +import bigframes.operations as ops + + +@dataclasses.dataclass +class LowerRemoteFunctionRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return ops.RemoteFunctionOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, ops.RemoteFunctionOp) + func_def = expr.op.function_def + devirtualized_expr = ops.RemoteFunctionOp( + func_def.with_devirtualize(), + apply_on_null=expr.op.apply_on_null, + ).as_expr(*expr.children) + if isinstance(func_def.signature.output, udf_def.VirtualListTypeV1): + return func_def.signature.output.out_expr(devirtualized_expr) + else: + return devirtualized_expr + + +@dataclasses.dataclass +class LowerBinaryRemoteFunctionRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return ops.BinaryRemoteFunctionOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, ops.BinaryRemoteFunctionOp) + func_def = expr.op.function_def + devirtualized_expr = ops.BinaryRemoteFunctionOp( + func_def.with_devirtualize(), + ).as_expr(*expr.children) + if isinstance(func_def.signature.output, udf_def.VirtualListTypeV1): + return func_def.signature.output.out_expr(devirtualized_expr) + else: + return devirtualized_expr + + +@dataclasses.dataclass +class LowerNaryRemoteFunctionRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return ops.NaryRemoteFunctionOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, ops.NaryRemoteFunctionOp) + func_def = expr.op.function_def + devirtualized_expr = ops.NaryRemoteFunctionOp( + func_def.with_devirtualize(), + ).as_expr(*expr.children) + if 
isinstance(func_def.signature.output, udf_def.VirtualListTypeV1): + return func_def.signature.output.out_expr(devirtualized_expr) + else: + return devirtualized_expr + + +UDF_LOWERING_RULES = ( + LowerRemoteFunctionRule(), + LowerBinaryRemoteFunctionRule(), + LowerNaryRemoteFunctionRule(), +) + + +def lower_udfs(root: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: + return op_lowering.lower_ops(root, rules=UDF_LOWERING_RULES) diff --git a/bigframes/core/sql/__init__.py b/bigframes/core/sql/__init__.py index 8c9a093802c..69a74b15ced 100644 --- a/bigframes/core/sql/__init__.py +++ b/bigframes/core/sql/__init__.py @@ -48,6 +48,35 @@ to_wkt = dumps +def identifier(name: str) -> str: + if len(name) > 256: + raise ValueError("Identifier must be less than 256 characters") + return f"`{escape_chars(name)}`" + + +def escape_chars(value: str): + """Escapes all special characters""" + # TODO: Reuse literal's escaping logic instead of re-implementing it here. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + trans_table = str.maketrans( + { + "\a": r"\a", + "\b": r"\b", + "\f": r"\f", + "\n": r"\n", + "\r": r"\r", + "\t": r"\t", + "\v": r"\v", + "\\": r"\\", + "?": r"\?", + '"': r"\"", + "'": r"\'", + "`": r"\`", + } + ) + return value.translate(trans_table) + + def multi_literal(*values: Any): literal_strings = [sql.to_sql(sql.literal(i)) for i in values] return "(" + ", ".join(literal_strings) + ")" diff --git a/bigframes/core/sql/table.py b/bigframes/core/sql/table.py deleted file mode 100644 index 24a97ed1598..00000000000 --- a/bigframes/core/sql/table.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from typing import Mapping, Optional, Union - - -def create_external_table_ddl( - table_name: str, - *, - replace: bool = False, - if_not_exists: bool = False, - columns: Optional[Mapping[str, str]] = None, - partition_columns: Optional[Mapping[str, str]] = None, - connection_name: Optional[str] = None, - options: Mapping[str, Union[str, int, float, bool, list]], -) -> str: - """Generates the CREATE EXTERNAL TABLE DDL statement.""" - statement = ["CREATE"] - if replace: - statement.append("OR REPLACE") - statement.append("EXTERNAL TABLE") - if if_not_exists: - statement.append("IF NOT EXISTS") - statement.append(table_name) - - if columns: - column_defs = ", ".join([f"{name} {typ}" for name, typ in columns.items()]) - statement.append(f"({column_defs})") - - if connection_name: - statement.append(f"WITH CONNECTION `{connection_name}`") - - if partition_columns: - part_defs = ", ".join( - [f"{name} {typ}" for name, typ in partition_columns.items()] - ) - statement.append(f"WITH PARTITION COLUMNS ({part_defs})") - - if options: - opts = [] - for key, value in options.items(): - if isinstance(value, str): - value_sql = repr(value) - opts.append(f"{key} = {value_sql}") - elif isinstance(value, bool): - opts.append(f"{key} = {str(value).upper()}") - elif isinstance(value, list): - list_str = ", ".join([repr(v) for v in value]) - opts.append(f"{key} = [{list_str}]") - else: - opts.append(f"{key} = {value}") - options_str = ", ".join(opts) - statement.append(f"OPTIONS ({options_str})") - - return " ".join(statement) diff 
--git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index baf4b12566b..5f713450f7f 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -15,10 +15,13 @@ import functools import itertools -from typing import Callable, Dict, Optional, Sequence +from typing import Callable, Dict, Optional, Sequence, TYPE_CHECKING import bigframes.core.nodes as nodes +if TYPE_CHECKING: + import bigframes.session.execution_cache as execution_cache + def is_trivially_executable(node: nodes.BigFrameNode) -> bool: if local_only(node): @@ -65,7 +68,7 @@ def select_cache_target( root: nodes.BigFrameNode, min_complexity: float, max_complexity: float, - cache: dict[nodes.BigFrameNode, nodes.BigFrameNode], + cache: execution_cache.ExecutionCache, heuristic: Callable[[int, int], float], ) -> Optional[nodes.BigFrameNode]: """Take tree, and return candidate nodes with (# of occurences, post-caching planning complexity). @@ -75,7 +78,7 @@ def select_cache_target( @functools.cache def _with_caching(subtree: nodes.BigFrameNode) -> nodes.BigFrameNode: - return nodes.top_down(subtree, lambda x: cache.get(x, x)) + return cache.subsitute_cached_subplans(subtree) def _combine_counts( left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int] @@ -106,6 +109,7 @@ def _node_counts_inner( if len(node_counts) == 0: raise ValueError("node counts should be non-zero") + # for each considered node, calculate heuristic value, and return node with max value return max( node_counts.keys(), key=lambda node: heuristic( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 25cedda8f4a..1ac80a4e6a1 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -37,6 +37,7 @@ overload, Sequence, Tuple, + TYPE_CHECKING, TypeVar, Union, ) @@ -47,6 +48,8 @@ import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import google.api_core.exceptions import google.cloud.bigquery as bigquery +import 
google.cloud.bigquery.job +import google.cloud.bigquery.table import numpy import pandas from pandas.api import extensions as pd_ext @@ -91,9 +94,10 @@ import bigframes.session._io.bigquery import bigframes.session.execution_spec as ex_spec -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from _typeshed import SupportsRichComparison + import bigframes.extensions.bigframes.dataframe_accessor as bigquery_accessor import bigframes.session SingleItemValue = Union[ @@ -144,7 +148,7 @@ def __init__( ): global bigframes - self._query_job: Optional[bigquery.QueryJob] = None + self._query_job: Optional[google.cloud.bigquery.job.QueryJob] = None if copy is not None and not copy: raise ValueError( @@ -376,6 +380,25 @@ def bqclient(self) -> bigframes.Session: def _session(self) -> bigframes.Session: return self._get_block().expr.session + @property + def bigquery( + self, + ) -> bigquery_accessor.BigframesBigQueryDataFrameAccessor: + """ + Accessor for BigQuery functionality. + + Returns: + bigframes.extensions.core.dataframe_accessor.BigQueryDataFrameAccessor: + Accessor that exposes BigQuery functionality on a DataFrame, + with method names closer to SQL. + """ + # Import the accessor here to avoid circular imports. + import bigframes.extensions.bigframes.dataframe_accessor + + return bigframes.extensions.bigframes.dataframe_accessor.BigframesBigQueryDataFrameAccessor( + self + ) + @property def _has_index(self) -> bool: return len(self._block.index_columns) > 0 @@ -438,7 +461,9 @@ def _should_sql_have_index(self) -> bool: self.index.name is not None or len(self.index.names) > 1 ) - def _to_placeholder_table(self, dry_run: bool = False) -> bigquery.TableReference: + def _to_placeholder_table( + self, dry_run: bool = False + ) -> google.cloud.bigquery.table.TableReference: """Compiles this DataFrame's expression tree to SQL and saves it to a (temporary) view or table (in the case of a dry run). 
""" @@ -488,11 +513,11 @@ def sql(self) -> str: ) from e @property - def query_job(self) -> Optional[bigquery.QueryJob]: + def query_job(self) -> Optional[google.cloud.bigquery.job.QueryJob]: """BigQuery job metadata for the most recent query. Returns: - None or google.cloud.bigquery.QueryJob: + None or google.cloud.bigquery.job.QueryJob: The most recent `QueryJob `_. """ @@ -606,7 +631,9 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ) return DataFrame(self._block.select_columns(selected_columns)) - def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]): + def _set_internal_query_job( + self, query_job: Optional[google.cloud.bigquery.job.QueryJob] + ): self._query_job = query_job def __getitem__( @@ -790,7 +817,7 @@ def __repr__(self) -> str: ) def _get_display_df_and_blob_cols(self) -> tuple[DataFrame, list[str]]: - """Process blob columns for display.""" + """Process ObjectRef columns for display.""" df = self blob_cols = [] if bigframes.options.display.blob_display: @@ -1782,7 +1809,7 @@ def _to_pandas_batches( allow_large_results=allow_large_results, ) - def _compute_dry_run(self) -> bigquery.QueryJob: + def _compute_dry_run(self) -> google.cloud.bigquery.job.QueryJob: _, query_job = self._block._compute_dry_run() return query_job @@ -4748,7 +4775,9 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # compatible with the data types of the input params. # 3. The order of the columns in the dataframe must correspond # to the order of the input params in the function. 
- udf_input_dtypes = func.udf_def.signature.bf_input_types + udf_input_dtypes = tuple( + arg.bf_type for arg in func.udf_def.signature.inputs + ) if not args and len(udf_input_dtypes) != len(self.columns): raise ValueError( f"Parameter count mismatch: BigFrames BigQuery function" @@ -4793,7 +4822,6 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) result_series.name = None - result_series = func._post_process_series(result_series) return result_series # At this point column-wise or element-wise bigquery function operation will diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 304428ef2fa..6601fe5ae51 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -772,6 +772,13 @@ def convert_schema_field( ) -> typing.Tuple[str, Dtype]: is_repeated = field.mode == "REPEATED" if field.field_type == "RECORD": + if field.description == OBJ_REF_DESCRIPTION_TAG: + bf_dtype = OBJ_REF_DTYPE # type: ignore + if is_repeated: + pa_type = pa.list_(bigframes_dtype_to_arrow_dtype(bf_dtype)) + bf_dtype = pd.ArrowDtype(pa_type) + return field.name, bf_dtype + mapped_fields = map(convert_schema_field, field.fields) fields = [] for name, dtype in mapped_fields: @@ -815,7 +822,11 @@ def convert_to_schema_field( ) inner_field = convert_to_schema_field(name, inner_type, overrides) return google.cloud.bigquery.SchemaField( - name, inner_field.field_type, mode="REPEATED", fields=inner_field.fields + name, + inner_field.field_type, + mode="REPEATED", + fields=inner_field.fields, + description=inner_field.description, ) if pa.types.is_struct(bigframes_dtype.pyarrow_dtype): inner_fields: list[google.cloud.bigquery.SchemaField] = [] @@ -827,6 +838,14 @@ def convert_to_schema_field( convert_to_schema_field(field.name, inner_bf_type, overrides) ) + if bigframes_dtype == OBJ_REF_DTYPE: + return google.cloud.bigquery.SchemaField( + name, + "RECORD", + fields=inner_fields, + description=OBJ_REF_DESCRIPTION_TAG, + ) + return google.cloud.bigquery.SchemaField( 
name, "RECORD", fields=inner_fields ) @@ -971,6 +990,7 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: TIMEDELTA_DESCRIPTION_TAG = "#microseconds" +OBJ_REF_DESCRIPTION_TAG = "bigframes_dtype: OBJ_REF_DTYPE" def contains_db_dtypes_json_arrow_type(type_): diff --git a/bigframes/extensions/bigframes/__init__.py b/bigframes/extensions/bigframes/__init__.py new file mode 100644 index 00000000000..859b51d71ca --- /dev/null +++ b/bigframes/extensions/bigframes/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.extensions.bigframes.dataframe_accessor import ( + BigframesAIAccessor, + BigframesBigQueryDataFrameAccessor, +) + +__all__ = ["BigframesAIAccessor", "BigframesBigQueryDataFrameAccessor"] diff --git a/bigframes/extensions/bigframes/dataframe_accessor.py b/bigframes/extensions/bigframes/dataframe_accessor.py new file mode 100644 index 00000000000..f58f0d48381 --- /dev/null +++ b/bigframes/extensions/bigframes/dataframe_accessor.py @@ -0,0 +1,71 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import cast, TypeVar + +from bigframes.core.logging import log_adapter +import bigframes.dataframe +import bigframes.extensions.core.dataframe_accessor as core_accessor +import bigframes.series + +T = TypeVar("T", bound="bigframes.dataframe.DataFrame") +S = TypeVar("S", bound="bigframes.series.Series") + + +@log_adapter.class_logger +class BigframesAIAccessor(core_accessor.AIAccessor[T, S]): + """ + BigFrames DataFrame accessor for BigQuery AI functions. + """ + + def __init__(self, bf_obj: T): + super().__init__(bf_obj) + + def _bf_from_dataframe( + self, session: bigframes.session.Session | None + ) -> bigframes.dataframe.DataFrame: + return self._obj + + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + return cast(T, bf_df) + + def _to_series(self, bf_series: bigframes.series.Series) -> S: + return cast(S, bf_series) + + +@log_adapter.class_logger +class BigframesBigQueryDataFrameAccessor(core_accessor.BigQueryDataFrameAccessor[T, S]): + """ + BigFrames DataFrame accessor for BigQuery DataFrames functionality. 
+ """ + + def __init__(self, bf_obj: T): + super().__init__(bf_obj) + + @property + def ai(self) -> BigframesAIAccessor: + return BigframesAIAccessor(self._obj) + + def _bf_from_dataframe( + self, session: bigframes.session.Session | None + ) -> bigframes.dataframe.DataFrame: + return self._obj + + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + return cast(T, bf_df) + + def _to_series(self, bf_series: bigframes.series.Series) -> S: + return cast(S, bf_series) diff --git a/bigframes/extensions/core/__init__.py b/bigframes/extensions/core/__init__.py new file mode 100644 index 00000000000..41b554c99ef --- /dev/null +++ b/bigframes/extensions/core/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.extensions.core.dataframe_accessor import ( + AIAccessor, + BigQueryDataFrameAccessor, +) + +__all__ = ["AIAccessor", "BigQueryDataFrameAccessor"] diff --git a/bigframes/extensions/core/dataframe_accessor.py b/bigframes/extensions/core/dataframe_accessor.py new file mode 100644 index 00000000000..02c13e4555a --- /dev/null +++ b/bigframes/extensions/core/dataframe_accessor.py @@ -0,0 +1,125 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import abc +from typing import Generic, Iterable, TYPE_CHECKING, TypeVar + +if TYPE_CHECKING: + import bigframes.dataframe + import bigframes.session + +T = TypeVar("T") +S = TypeVar("S") + + +class AbstractBigQueryDataFrameAccessor(abc.ABC, Generic[T, S]): + @abc.abstractmethod + def _bf_from_dataframe( + self, session: bigframes.session.Session | None + ) -> bigframes.dataframe.DataFrame: + """Convert the accessor's object to a BigFrames DataFrame.""" + + @abc.abstractmethod + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + """Convert a BigFrames DataFrame to the accessor's object type.""" + + @abc.abstractmethod + def _to_series(self, bf_series: bigframes.series.Series) -> S: + """Convert a BigFrames Series to the accessor's object type.""" + + +class AIAccessor(AbstractBigQueryDataFrameAccessor[T, S]): + """ + DataFrame accessor for BigQuery AI functions. + """ + + def __init__(self, obj: T): + self._obj = obj + + def forecast( + self, + *, + data_col: str, + timestamp_col: str, + model: str = "TimesFM 2.0", + id_cols: Iterable[str] | None = None, + horizon: int = 10, + confidence_level: float = 0.95, + context_window: int | None = None, + output_historical_time_series: bool = False, + session: bigframes.session.Session | None = None, + ) -> T: + """ + Forecast time series at future horizon using BigQuery AI.FORECAST. + + This is an accessor for :func:`bigframes.bigquery.ai.forecast`. See that + function's documentation for detailed parameter descriptions and examples. 
+ """ + import bigframes.bigquery.ai + + bf_df = self._bf_from_dataframe(session) + result = bigframes.bigquery.ai.forecast( + bf_df, + data_col=data_col, + timestamp_col=timestamp_col, + model=model, + id_cols=id_cols, + horizon=horizon, + confidence_level=confidence_level, + context_window=context_window, + output_historical_time_series=output_historical_time_series, + ) + return self._to_dataframe(result) + + +class BigQueryDataFrameAccessor(AbstractBigQueryDataFrameAccessor[T, S]): + """ + DataFrame accessor for BigQuery DataFrames functionality. + """ + + def __init__(self, obj: T): + self._obj = obj + + @property + @abc.abstractmethod + def ai(self) -> AIAccessor: + """ + Accessor for BigQuery AI functions. + + Returns: + AIAccessor: Accessor for BigQuery AI functions. + """ + + def sql_scalar( + self, + sql_template: str, + *, + output_dtype=None, + session: bigframes.session.Session | None = None, + ) -> S: + """ + Compute a new Series by applying a SQL scalar function to the DataFrame. + + This is an accessor for :func:`bigframes.bigquery.sql_scalar`. See that + function's documentation for detailed parameter descriptions and examples. + """ + import bigframes.bigquery + + bf_df = self._bf_from_dataframe(session) + result = bigframes.bigquery.sql_scalar( + sql_template, bf_df, output_dtype=output_dtype + ) + return self._to_series(result) diff --git a/bigframes/extensions/pandas/__init__.py b/bigframes/extensions/pandas/__init__.py index 58d482ea386..d47acd3b05e 100644 --- a/bigframes/extensions/pandas/__init__.py +++ b/bigframes/extensions/pandas/__init__.py @@ -11,3 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +""" +BigQuery DataFrames automatically registers a pandas extenstion when imported. +This allows you to use the power of the BigQuery engine with pandas objects +directly. 
+""" + +from bigframes.extensions.pandas.dataframe_accessor import ( + PandasBigQueryDataFrameAccessor, +) + +__all__ = ["PandasBigQueryDataFrameAccessor"] diff --git a/bigframes/extensions/pandas/dataframe_accessor.py b/bigframes/extensions/pandas/dataframe_accessor.py index 2cb44fe3c5e..3edb8ebe14c 100644 --- a/bigframes/extensions/pandas/dataframe_accessor.py +++ b/bigframes/extensions/pandas/dataframe_accessor.py @@ -12,56 +12,72 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import cast +from typing import cast, TypeVar import pandas import pandas.api.extensions import bigframes.core.global_session as bf_session +from bigframes.core.logging import log_adapter +import bigframes.dataframe +from bigframes.extensions.core.dataframe_accessor import ( + AIAccessor, + BigQueryDataFrameAccessor, +) import bigframes.pandas as bpd +T = TypeVar("T", bound="pandas.DataFrame") +S = TypeVar("S", bound="pandas.Series") + + +@log_adapter.class_logger +class PandasAIAccessor(AIAccessor[T, S]): + """ + Pandas DataFrame accessor for BigQuery AI functions. + """ + + def __init__(self, pandas_obj: T): + super().__init__(pandas_obj) + + def _bf_from_dataframe( + self, session: bigframes.session.Session | None + ) -> bigframes.dataframe.DataFrame: + if session is None: + session = bf_session.get_global_session() + + return cast(bpd.DataFrame, session.read_pandas(self._obj)) + + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + return cast(T, bf_df.to_pandas(ordered=True)) + + def _to_series(self, bf_series: bigframes.series.Series) -> S: + return cast(S, bf_series.to_pandas(ordered=True)) + @pandas.api.extensions.register_dataframe_accessor("bigquery") -class BigQueryDataFrameAccessor: +@log_adapter.class_logger +class PandasBigQueryDataFrameAccessor(BigQueryDataFrameAccessor[T, S]): """ Pandas DataFrame accessor for BigQuery DataFrames functionality. 
This accessor is registered under the ``bigquery`` namespace on pandas DataFrame objects. """ - def __init__(self, pandas_obj: pandas.DataFrame): - self._obj = pandas_obj - - def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None): - """ - Compute a new pandas Series by applying a SQL scalar function to the DataFrame. - - The DataFrame is converted to BigFrames by calling ``read_pandas``, then the SQL - template is applied using ``bigframes.bigquery.sql_scalar``, and the result is - converted back to a pandas Series using ``to_pandas``. - - Args: - sql_template (str): - A SQL format string with Python-style {0}, {1}, etc. placeholders for each of - the columns in the DataFrame (in the order they appear in ``df.columns``). - output_dtype (a BigQuery DataFrames compatible dtype, optional): - If provided, BigQuery DataFrames uses this to determine the output - of the returned Series. This avoids a dry run query. - session (bigframes.session.Session, optional): - The BigFrames session to use. If not provided, the default global session is used. - - Returns: - pandas.Series: - The result of the SQL scalar function as a pandas Series. 
- """ - # Import bigframes.bigquery here to avoid circular imports - import bigframes.bigquery + def __init__(self, pandas_obj: T): + super().__init__(pandas_obj) + @property + def ai(self) -> PandasAIAccessor: + return PandasAIAccessor(self._obj) + + def _bf_from_dataframe(self, session) -> bigframes.dataframe.DataFrame: if session is None: session = bf_session.get_global_session() - bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj)) - result = bigframes.bigquery.sql_scalar( - sql_template, bf_df, output_dtype=output_dtype - ) - return result.to_pandas(ordered=True) + return cast(bpd.DataFrame, session.read_pandas(self._obj)) + + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + return cast(T, bf_df.to_pandas(ordered=True)) + + def _to_series(self, bf_series: bigframes.series.Series) -> S: + return cast(S, bf_series.to_pandas(ordered=True)) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index be9ff0956ef..fc064653276 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -15,16 +15,16 @@ from __future__ import annotations -import inspect import logging import os import random +import re import shutil import string import tempfile import textwrap import types -from typing import Any, cast, Optional, Sequence, Tuple, TYPE_CHECKING +from typing import Any, cast, Optional, TYPE_CHECKING import warnings import requests @@ -32,6 +32,7 @@ import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting import bigframes.functions.function_template as bff_template +import bigframes.functions.udf_def as udf_def if TYPE_CHECKING: from bigframes.session import Session @@ -40,7 +41,12 @@ import google.api_core.retry from google.cloud import bigquery, functions_v2 -from . 
import _utils +from bigframes.functions import _utils +from bigframes.functions._utils import ( + _BIGFRAMES_FUNCTION_PREFIX, + _BQ_FUNCTION_NAME_SEPERATOR, + _GCF_FUNCTION_NAME_SEPERATOR, +) logger = logging.getLogger(__name__) @@ -80,7 +86,6 @@ def __init__( bq_location, bq_dataset, bq_client, - bq_connection_id, bq_connection_manager, cloud_function_region=None, cloud_functions_client=None, @@ -95,7 +100,6 @@ def __init__( self._bq_location = bq_location self._bq_dataset = bq_dataset self._bq_client = bq_client - self._bq_connection_id = bq_connection_id self._bq_connection_manager = bq_connection_manager self._session = session @@ -107,12 +111,12 @@ def __init__( self._cloud_function_docker_repository = cloud_function_docker_repository self._cloud_build_service_account = cloud_build_service_account - def _create_bq_connection(self) -> None: + def _create_bq_connection(self, connection_id: str) -> None: if self._bq_connection_manager: self._bq_connection_manager.create_bq_connection( self._gcp_project_id, self._bq_location, - self._bq_connection_id, + connection_id, "run.invoker", ) @@ -162,45 +166,40 @@ def _format_function_options(self, function_options: dict) -> str: def create_bq_remote_function( self, - input_args: Sequence[str], - input_types: Sequence[str], - output_type: str, - endpoint: str, - bq_function_name: str, - max_batching_rows: int, - metadata: str, + name: str, + udf_def: udf_def.RemoteFunctionConfig, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" - self._create_bq_connection() + self._create_bq_connection(udf_def.connection_id) # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 - bq_function_args = [] - bq_function_return_type = output_type - - # We are expecting the input type annotations to be 1:1 with the input args - for name, type_ in zip(input_args, 
input_types): - bq_function_args.append(f"{name} {type_}") remote_function_options = { - "endpoint": endpoint, - "max_batching_rows": max_batching_rows, + "endpoint": udf_def.endpoint, + "max_batching_rows": udf_def.max_batching_rows, } - if metadata: + if udf_def.bq_metadata: # We are using the description field to store this structured # bigframes specific metadata for the lack of a better option - remote_function_options["description"] = metadata + remote_function_options["description"] = udf_def.bq_metadata remote_function_options_str = self._format_function_options( remote_function_options ) + import bigframes.core.sql + import bigframes.core.utils + + # removes anything that isn't letter, number or underscore + _validate_routine_name(name) + bq_function_name_escaped = bigframes.core.sql.identifier(name) create_function_ddl = f""" - CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) - RETURNS {bq_function_return_type} - REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` + CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name_escaped}({udf_def.signature.to_sql_input_signature()}) + RETURNS {udf_def.signature.with_devirtualize().output.sql_type} + REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{udf_def.connection_id}` OPTIONS ({remote_function_options_str})""" logger.info(f"Creating BQ remote function: {create_function_ddl}") @@ -210,25 +209,15 @@ def create_bq_remote_function( def provision_bq_managed_function( self, - func, - input_types: Sequence[str], - output_type: str, name: Optional[str], - packages: Optional[Sequence[str]], - max_batching_rows: Optional[int], - container_cpu: Optional[float], - container_memory: Optional[str], - is_row_processor: bool, - bq_connection_id, - *, - capture_references: bool = False, + config: udf_def.ManagedFunctionConfig, ): """Create a BigQuery managed 
function.""" # TODO(b/406283812): Expose the capability to pass down # capture_references=True in the public udf API. if ( - capture_references + config.capture_references and (python_version := _utils.get_python_version()) != _MANAGED_FUNC_PYTHON_VERSION ): @@ -238,29 +227,27 @@ def provision_bq_managed_function( ) # Create BQ managed function. - bq_function_args = [] - bq_function_return_type = output_type - - input_args = inspect.getargs(func.__code__).args - # We expect the input type annotations to be 1:1 with the input args. - for name_, type_ in zip(input_args, input_types): - bq_function_args.append(f"{name_} {type_}") + bq_function_args = config.signature.to_sql_input_signature() + bq_function_return_type = config.signature.with_devirtualize().output.sql_type managed_function_options: dict[str, Any] = { "runtime_version": _MANAGED_FUNC_PYTHON_VERSION, "entry_point": "bigframes_handler", } - if max_batching_rows: - managed_function_options["max_batching_rows"] = max_batching_rows - if container_cpu: - managed_function_options["container_cpu"] = container_cpu - if container_memory: - managed_function_options["container_memory"] = container_memory + if config.max_batching_rows: + managed_function_options["max_batching_rows"] = config.max_batching_rows + if config.container_cpu: + managed_function_options["container_cpu"] = config.container_cpu + if config.container_memory: + managed_function_options["container_memory"] = config.container_memory # Augment user package requirements with any internal package # requirements. 
packages = _utils.get_updated_package_requirements( - packages, is_row_processor, capture_references, ignore_package_version=True + config.code.package_requirements or [], + config.signature.is_row_processor, + config.capture_references, + ignore_package_version=True, ) if packages: managed_function_options["packages"] = packages @@ -268,27 +255,22 @@ def provision_bq_managed_function( managed_function_options ) - session_id = None if name else self._session.session_id bq_function_name = name if not bq_function_name: - # Compute a unique hash representing the user code. - function_hash = _utils.get_hash(func, packages) - bq_function_name = _utils.get_bigframes_function_name( - function_hash, - session_id, + # Compute a unique hash representing the artifact definition. + bq_function_name = get_managed_function_name( + config, self._session.session_id ) persistent_func_id = ( f"`{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}" ) - udf_name = func.__name__ - with_connection_clause = ( ( - f"WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}`" + f"WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{config.bq_connection_id}`" ) - if bq_connection_id + if config.bq_connection_id else "" ) @@ -296,13 +278,13 @@ def provision_bq_managed_function( # including the user's function, necessary imports, and the BigQuery # handler wrapper. python_code_block = bff_template.generate_managed_function_code( - func, udf_name, is_row_processor, capture_references + config.code, config.signature, config.capture_references ) create_function_ddl = ( textwrap.dedent( f""" - CREATE OR REPLACE FUNCTION {persistent_func_id}({','.join(bq_function_args)}) + CREATE OR REPLACE FUNCTION {persistent_func_id}({bq_function_args}) RETURNS {bq_function_return_type} LANGUAGE python {with_connection_clause} @@ -337,7 +319,7 @@ def get_remote_function_fully_qualilfied_name(self, name): "Get the fully qualilfied name for a BQ remote function." 
return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" - def get_cloud_function_endpoint(self, name): + def get_cloud_function_endpoint(self, name) -> str | None: """Get the http endpoint of a cloud function if it exists.""" fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) try: @@ -351,29 +333,24 @@ def get_cloud_function_endpoint(self, name): def generate_cloud_function_code( self, - def_, + code_def: udf_def.CodeDef, directory, *, - input_types: Tuple[str], - output_type: str, - package_requirements=None, - is_row_processor=False, + udf_signature: udf_def.UdfSignature, ): """Generate the cloud function code for a given user defined function.""" # requirements.txt - if package_requirements: + if code_def.package_requirements: requirements_txt = os.path.join(directory, "requirements.txt") with open(requirements_txt, "w") as f: - f.write("\n".join(package_requirements)) + f.write("\n".join(code_def.package_requirements)) # main.py entry_point = bff_template.generate_cloud_function_main_code( - def_, + code_def, directory, - input_types=input_types, - output_type=output_type, - is_row_processor=is_row_processor, + udf_signature=udf_signature, ) return entry_point @@ -393,35 +370,19 @@ def _get_cloud_function_endpoint_with_retry(self, name): def create_cloud_function( self, - def_, - *, - random_name, - input_types: Tuple[str], - output_type: str, - package_requirements=None, - timeout_seconds=600, - max_instance_count=None, - is_row_processor=False, - vpc_connector=None, - vpc_connector_egress_settings="private-ranges-only", - memory_mib=None, - cpus=None, - ingress_settings="internal-only", - workers=None, - threads=None, - concurrency=None, - ): + name: str, + func_def: udf_def.CloudRunFunctionConfig, + ) -> str: """Create a cloud function from the given user defined function.""" + config = func_def + # Build and deploy folder structure containing cloud function with tempfile.TemporaryDirectory() as directory: entry_point = 
self.generate_cloud_function_code( - def_, + config.code, directory, - package_requirements=package_requirements, - input_types=input_types, - output_type=output_type, - is_row_processor=is_row_processor, + udf_signature=config.signature, ) archive_path = shutil.make_archive(directory, "zip", directory) @@ -461,9 +422,9 @@ def create_cloud_function( create_function_request.parent = ( self.get_cloud_function_fully_qualified_parent() ) - create_function_request.function_id = random_name + create_function_request.function_id = name function = functions_v2.Function() - function.name = self.get_cloud_function_fully_qualified_name(random_name) + function.name = self.get_cloud_function_fully_qualified_name(name) function.build_config = functions_v2.BuildConfig() function.build_config.runtime = python_version function.build_config.entry_point = entry_point @@ -490,33 +451,34 @@ def create_cloud_function( ) function.service_config = functions_v2.ServiceConfig() - if memory_mib is not None: - function.service_config.available_memory = f"{memory_mib}Mi" - if cpus is not None: - function.service_config.available_cpu = str(cpus) - if timeout_seconds is not None: - if timeout_seconds > 1200: + if config.memory_mib is not None: + function.service_config.available_memory = f"{config.memory_mib}Mi" + if config.cpus is not None: + function.service_config.available_cpu = str(config.cpus) + if config.timeout_seconds is not None: + if config.timeout_seconds > 1200: raise bf_formatting.create_exception_with_feedback_link( ValueError, "BigQuery remote function can wait only up to 20 minutes" ", see for more details " "https://cloud.google.com/bigquery/quotas#remote_function_limits.", ) - function.service_config.timeout_seconds = timeout_seconds - if max_instance_count is not None: - function.service_config.max_instance_count = max_instance_count - if vpc_connector is not None: - function.service_config.vpc_connector = vpc_connector - if vpc_connector_egress_settings is None: + 
function.service_config.timeout_seconds = config.timeout_seconds + if config.max_instance_count is not None: + function.service_config.max_instance_count = config.max_instance_count + if config.vpc_connector is not None: + function.service_config.vpc_connector = config.vpc_connector + vpc_connector_egress_settings = config.vpc_connector_egress_settings + if config.vpc_connector_egress_settings is None: msg = bfe.format_message( "The 'vpc_connector_egress_settings' was not specified. Defaulting to 'private-ranges-only'.", ) warnings.warn(msg, category=UserWarning) vpc_connector_egress_settings = "private-ranges-only" - if vpc_connector_egress_settings not in _VPC_EGRESS_SETTINGS_MAP: + if config.vpc_connector_egress_settings not in _VPC_EGRESS_SETTINGS_MAP: raise bf_formatting.create_exception_with_feedback_link( ValueError, - f"'{vpc_connector_egress_settings}' is not one of the supported vpc egress settings values: {list(_VPC_EGRESS_SETTINGS_MAP)}", + f"'{config.vpc_connector_egress_settings}' is not one of the supported vpc egress settings values: {list(_VPC_EGRESS_SETTINGS_MAP)}", ) function.service_config.vpc_connector_egress_settings = cast( functions_v2.ServiceConfig.VpcConnectorEgressSettings, @@ -525,28 +487,30 @@ def create_cloud_function( function.service_config.service_account_email = ( self._cloud_function_service_account ) - if concurrency: - function.service_config.max_instance_request_concurrency = concurrency + if config.concurrency: + function.service_config.max_instance_request_concurrency = ( + config.concurrency + ) # Functions framework use environment variables to pass config to gunicorn # See https://github.com/GoogleCloudPlatform/functions-framework-python/issues/241 # Code: https://github.com/GoogleCloudPlatform/functions-framework-python/blob/v3.10.1/src/functions_framework/_http/gunicorn.py#L37-L43 env_vars = {} - if workers: - env_vars["WORKERS"] = str(workers) - if threads: - env_vars["THREADS"] = str(threads) + if config.workers: + 
env_vars["WORKERS"] = str(config.workers) + if config.threads: + env_vars["THREADS"] = str(config.threads) if env_vars: function.service_config.environment_variables = env_vars - if ingress_settings not in _INGRESS_SETTINGS_MAP: + if config.ingress_settings not in _INGRESS_SETTINGS_MAP: raise bf_formatting.create_exception_with_feedback_link( ValueError, - f"'{ingress_settings}' not one of the supported ingress settings values: {list(_INGRESS_SETTINGS_MAP)}", + f"'{config.ingress_settings}' not one of the supported ingress settings values: {list(_INGRESS_SETTINGS_MAP)}", ) function.service_config.ingress_settings = cast( functions_v2.ServiceConfig.IngressSettings, - _INGRESS_SETTINGS_MAP[ingress_settings], + _INGRESS_SETTINGS_MAP[config.ingress_settings], ) function.kms_key_name = self._cloud_function_kms_key_name create_function_request.function = function @@ -577,68 +541,39 @@ def create_cloud_function( # Fetch the endpoint with retries if it wasn't returned by the operation if not endpoint: try: - endpoint = self._get_cloud_function_endpoint_with_retry(random_name) + endpoint = self._get_cloud_function_endpoint_with_retry(name) except Exception as e: raise bf_formatting.create_exception_with_feedback_link( ValueError, f"Couldn't fetch the http endpoint: {e}" ) - logger.info( - f"Successfully created cloud function {random_name} with uri ({endpoint})" - ) + logger.info(f"Successfully created cloud function {name} with uri ({endpoint})") return endpoint def provision_bq_remote_function( self, def_, - input_types, - output_type, - reuse, - name, - package_requirements, - max_batching_rows, - cloud_function_timeout, - cloud_function_max_instance_count, - is_row_processor, - cloud_function_vpc_connector, - cloud_function_vpc_connector_egress_settings, - cloud_function_memory_mib, - cloud_function_cpus, - cloud_function_ingress_settings, - bq_metadata, + func_signature: udf_def.UdfSignature, + reuse: bool, + name: str | None, + package_requirements: tuple[str, ...], + 
max_batching_rows: int | None, + cloud_function_timeout: int | None, + cloud_function_max_instance_count: int | None, + cloud_function_vpc_connector: str | None, + cloud_function_vpc_connector_egress_settings: str | None, + cloud_function_memory_mib: int | None, + cloud_function_cpus: float | None, + cloud_function_ingress_settings: str, + bq_connection_id: str, ): """Provision a BigQuery remote function.""" # Augment user package requirements with any internal package # requirements - package_requirements = _utils.get_updated_package_requirements( - package_requirements, is_row_processor + full_package_requirements = _utils.get_updated_package_requirements( + package_requirements, func_signature.is_row_processor ) - # Compute a unique hash representing the user code - function_hash = _utils.get_hash(def_, package_requirements) - - # If reuse of any existing function with the same name (indicated by the - # same hash of its source code) is not intended, then attach a unique - # suffix to the intended function name to make it unique. - uniq_suffix = None - if not reuse: - # use 4 digits as a unique suffix which should suffice for - # uniqueness per session - uniq_suffix = "".join( - random.choices(string.ascii_lowercase + string.digits, k=4) - ) - - # Derive the name of the cloud function underlying the intended BQ - # remote function. Use the session id to identify the GCF for unnamed - # functions. 
The named remote functions are treated as a persistant - # artifacts, so let's keep them independent of session id, which also - # makes their naming more stable for the same udf code - session_id = None if name else self._session.session_id - cloud_function_name = _utils.get_cloud_function_name( - function_hash, session_id, uniq_suffix - ) - cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) - if cloud_function_memory_mib is None: cloud_function_memory_mib = _DEFAULT_FUNCTION_MEMORY_MIB @@ -654,90 +589,144 @@ def provision_bq_remote_function( # max concurrency==1 for vcpus < 1 hard limit from cloud run concurrency = (workers * threads) if (expected_milli_cpus >= 1000) else 1 + cloud_func_spec = udf_def.CloudRunFunctionConfig( + code=udf_def.CodeDef.from_func(def_, full_package_requirements), + signature=func_signature, + timeout_seconds=cloud_function_timeout, + max_instance_count=cloud_function_max_instance_count, + vpc_connector=cloud_function_vpc_connector, + vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings + or "private-ranges-only", + memory_mib=cloud_function_memory_mib, + cpus=cloud_function_cpus, + ingress_settings=cloud_function_ingress_settings, + workers=workers, + threads=threads, + concurrency=concurrency, + ) + + # If reuse of any existing function with the same name (indicated by the + # same hash of its source code and config) is not intended, then attach a unique + # suffix to the intended function name to make it unique. + random_suffix = "".join( + random.choices(string.ascii_lowercase + string.digits, k=4) + ) + # Derive the name of the cloud function underlying the intended BQ + # remote function. Use the session id to identify the GCF for unnamed + # functions. 
The named remote functions are treated as a persistant + # artifacts, so let's keep them independent of session id, which also + # makes their naming more stable for the same udf code + cloud_function_name = get_cloud_function_name( + cloud_func_spec, + session_id=self._session.session_id if (name is None) else None, + uniq_suffix=random_suffix if (not reuse) else None, + ) + + cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) # Create the cloud function if it does not exist if not cf_endpoint: cf_endpoint = self.create_cloud_function( - def_, - random_name=cloud_function_name, - input_types=input_types, - output_type=output_type, - package_requirements=package_requirements, - timeout_seconds=cloud_function_timeout, - max_instance_count=cloud_function_max_instance_count, - is_row_processor=is_row_processor, - vpc_connector=cloud_function_vpc_connector, - vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, - memory_mib=cloud_function_memory_mib, - cpus=cloud_function_cpus, - ingress_settings=cloud_function_ingress_settings, - workers=workers, - threads=threads, - concurrency=concurrency, + cloud_function_name, cloud_func_spec ) else: logger.info(f"Cloud function {cloud_function_name} already exists.") - # Derive the name of the remote function - remote_function_name = name - if not remote_function_name: - remote_function_name = _utils.get_bigframes_function_name( - function_hash, self._session.session_id, uniq_suffix - ) - rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) - - # Create the BQ remote function in following circumstances: - # 1. It does not exist - # 2. 
It exists but the existing remote function has different - # configuration than intended - created_new = False - if not rf_endpoint or ( - rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id - ): - input_args = inspect.getargs(def_.__code__).args - if len(input_args) != len(input_types): - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "Exactly one type should be provided for every input arg.", - ) - self.create_bq_remote_function( - input_args, - input_types, - output_type, - cf_endpoint, - remote_function_name, - max_batching_rows, - bq_metadata, - ) + intended_rf_spec = udf_def.RemoteFunctionConfig( + endpoint=cf_endpoint, + connection_id=bq_connection_id, + max_batching_rows=max_batching_rows or 1000, + signature=func_signature, + bq_metadata=func_signature.protocol_metadata, + ) + remote_function_name = name or get_bigframes_function_name( + intended_rf_spec, + self._session.session_id, + random_suffix if (not reuse) else None, + ) - created_new = True + if reuse: + existing_rf_spec = self.get_remote_function_specs(remote_function_name) + # Create the BQ remote function in following circumstances: + # 1. It does not exist + # 2. 
It exists but the existing remote function has different + # configuration than intended + created_new = False + if not existing_rf_spec or (existing_rf_spec != intended_rf_spec): + self.create_bq_remote_function(remote_function_name, intended_rf_spec) + created_new = True + else: + logger.info(f"Remote function {remote_function_name} already exists.") + + return remote_function_name, cloud_function_name, created_new else: - logger.info(f"Remote function {remote_function_name} already exists.") - - return remote_function_name, cloud_function_name, created_new + self.create_bq_remote_function(remote_function_name, intended_rf_spec) + return remote_function_name, cloud_function_name, True - def get_remote_function_specs(self, remote_function_name): + def get_remote_function_specs( + self, remote_function_name: str + ) -> udf_def.RemoteFunctionConfig | None: """Check whether a remote function already exists for the udf.""" - http_endpoint = None - bq_connection = None - routines = self._bq_client.list_routines( - f"{self._gcp_project_id}.{self._bq_dataset}" - ) try: - for routine in routines: - routine = cast(bigquery.Routine, routine) - if routine.reference.routine_id == remote_function_name: - rf_options = routine.remote_function_options - if rf_options: - http_endpoint = rf_options.endpoint - bq_connection = rf_options.connection - if bq_connection: - bq_connection = os.path.basename(bq_connection) - break + routine = self._bq_client.get_routine( + f"{self._gcp_project_id}.{self._bq_dataset}.{remote_function_name}" + ) + if routine.reference.routine_id == remote_function_name: + try: + return udf_def.RemoteFunctionConfig.from_bq_routine(routine) + except udf_def.ReturnTypeMissingError: + # The remote function exists, but it's missing a return type. + # Something is wrong with the function, so we should replace it. + return None except google.api_core.exceptions.NotFound: - # The dataset might not exist, in which case the http_endpoint doesn't, either. 
+ # The dataset might not exist, in which case the remote function doesn't, either. # Note: list_routines doesn't make an API request until we iterate on the response object. pass - return (http_endpoint, bq_connection) + return None + + +def get_cloud_function_name( + function_def: udf_def.CloudRunFunctionConfig, session_id=None, uniq_suffix=None +): + "Get a name for the cloud function for the given user defined function." + parts = [_BIGFRAMES_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_def.stable_hash().hex()) + if uniq_suffix: + parts.append(uniq_suffix) + return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) + + +def get_bigframes_function_name( + function: udf_def.RemoteFunctionConfig, session_id, uniq_suffix=None +): + "Get a name for the bigframes function for the given user defined function." + parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function.stable_hash().hex()] + if uniq_suffix: + parts.append(uniq_suffix) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + +def get_managed_function_name( + function_def: udf_def.ManagedFunctionConfig, + session_id: str | None = None, +): + """Get a name for the bigframes managed function for the given user defined function.""" + parts = [_BIGFRAMES_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_def.stable_hash().hex()) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + +def _validate_routine_name(name: str) -> None: + """Validate that the given name is a valid BigQuery routine name.""" + # Routine IDs can contain only letters (a-z, A-Z), numbers (0-9), or underscores (_) + # must also start with a letter or underscore only + if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name): + raise ValueError( + "Routine ID can contain only letters (a-z, A-Z), numbers (0-9), or underscores (_)" + ) def _infer_milli_cpus_from_memory(memory_mib: int) -> int: diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py 
index 7541936ede3..fe7889e9556 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -24,7 +24,6 @@ Any, cast, Dict, - get_origin, Literal, Mapping, Optional, @@ -51,7 +50,6 @@ if TYPE_CHECKING: from bigframes.session import Session -import pandas from bigframes.functions import _function_client, _utils @@ -241,7 +239,7 @@ def remote_function( cloud_function_service_account: str, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, - max_batching_rows: Optional[int] = 1000, + max_batching_rows: Optional[int] = None, cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, @@ -558,41 +556,13 @@ def wrapper(func): func, **signature_kwargs, ) - if input_types is not None: - if not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - if _utils.has_conflict_input_type(py_sig, input_types): - msg = bfe.format_message( - "Conflicting input types detected, using the one from the decorator." - ) - warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) - py_sig = py_sig.replace( - parameters=[ - par.replace(annotation=itype) - for par, itype in zip(py_sig.parameters.values(), input_types) - ] - ) - if output_type: - if _utils.has_conflict_output_type(py_sig, output_type): - msg = bfe.format_message( - "Conflicting return type detected, using the one from the decorator." - ) - warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) - py_sig = py_sig.replace(return_annotation=output_type) - - # The function will actually be receiving a pandas Series, but allow - # both BigQuery DataFrames and pandas object types for compatibility. 
- is_row_processor = False - if new_sig := _convert_row_processor_sig(py_sig): - py_sig = new_sig - is_row_processor = True + py_sig = _resolve_signature(py_sig, input_types, output_type) remote_function_client = _function_client.FunctionClient( dataset_ref.project, bq_location, dataset_ref.dataset_id, bigquery_client, - bq_connection_id, bq_connection_manager, cloud_function_region, cloud_functions_client, @@ -605,25 +575,9 @@ def wrapper(func): session=session, # type: ignore ) - # resolve the output type that can be supported in the bigframes, - # ibis, BQ remote functions and cloud functions integration. - bqrf_metadata = None - post_process_routine = None - if get_origin(py_sig.return_annotation) is list: - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the cloud function - # and BQ level, and parse that to the intended output type at - # the bigframes level. - bqrf_metadata = _utils.get_bigframes_metadata( - python_output_type=py_sig.return_annotation - ) - post_process_routine = _utils.build_unnest_post_routine( - py_sig.return_annotation - ) - py_sig = py_sig.replace(return_annotation=str) - - udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) + udf_sig = udf_def.UdfSignature.from_py_signature( + py_sig + ).to_remote_function_compatible() ( rf_name, @@ -631,21 +585,19 @@ def wrapper(func): created_new, ) = remote_function_client.provision_bq_remote_function( func, - input_types=udf_sig.sql_input_types, - output_type=udf_sig.sql_output_type, - reuse=reuse, + func_signature=udf_sig, + reuse=reuse or False, name=name, - package_requirements=packages, - max_batching_rows=max_batching_rows, + package_requirements=tuple(packages) if packages else tuple(), + max_batching_rows=max_batching_rows or 1000, cloud_function_timeout=cloud_function_timeout, cloud_function_max_instance_count=cloud_function_max_instances, - 
is_row_processor=is_row_processor, cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_cpus=cloud_function_cpus, cloud_function_ingress_settings=cloud_function_ingress_settings, - bq_metadata=bqrf_metadata, + bq_connection_id=bq_connection_id, ) bigframes_cloud_function = ( @@ -676,12 +628,13 @@ def wrapper(func): signature=udf_sig, ) decorator = functools.wraps(func) - if is_row_processor: + if udf_sig.is_row_processor: + msg = bfe.format_message("input_types=Series is in preview.") + warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) return decorator( bq_functions.BigqueryCallableRowRoutine( udf_definition, session, - post_routine=post_process_routine, cloud_function_ref=bigframes_cloud_function, local_func=func, is_managed=False, @@ -692,7 +645,6 @@ def wrapper(func): bq_functions.BigqueryCallableRoutine( udf_definition, session, - post_routine=post_process_routine, cloud_function_ref=bigframes_cloud_function, local_func=func, is_managed=False, @@ -868,34 +820,10 @@ def wrapper(func): func, **signature_kwargs, ) - if input_types is not None: - if not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - if _utils.has_conflict_input_type(py_sig, input_types): - msg = bfe.format_message( - "Conflicting input types detected, using the one from the decorator." - ) - warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) - py_sig = py_sig.replace( - parameters=[ - par.replace(annotation=itype) - for par, itype in zip(py_sig.parameters.values(), input_types) - ] - ) - if output_type: - if _utils.has_conflict_output_type(py_sig, output_type): - msg = bfe.format_message( - "Conflicting return type detected, using the one from the decorator." 
- ) - warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) - py_sig = py_sig.replace(return_annotation=output_type) + py_sig = _resolve_signature(py_sig, input_types, output_type) # The function will actually be receiving a pandas Series, but allow # both BigQuery DataFrames and pandas object types for compatibility. - is_row_processor = False - if new_sig := _convert_row_processor_sig(py_sig): - py_sig = new_sig - is_row_processor = True udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) @@ -904,22 +832,22 @@ def wrapper(func): bq_location, dataset_ref.dataset_id, bigquery_client, - bq_connection_id, bq_connection_manager, session=session, # type: ignore ) - - bq_function_name = managed_function_client.provision_bq_managed_function( - func=func, - input_types=udf_sig.sql_input_types, - output_type=udf_sig.sql_output_type, - name=name, - packages=packages, + config = udf_def.ManagedFunctionConfig( + code=udf_def.CodeDef.from_func(func), + signature=udf_sig, max_batching_rows=max_batching_rows, container_cpu=container_cpu, container_memory=container_memory, - is_row_processor=is_row_processor, bq_connection_id=bq_connection_id, + capture_references=False, + ) + + bq_function_name = managed_function_client.provision_bq_managed_function( + name=name, + config=config, ) full_rf_name = ( managed_function_client.get_remote_function_fully_qualilfied_name( @@ -936,13 +864,17 @@ def wrapper(func): self._update_temp_artifacts(full_rf_name, "") decorator = functools.wraps(func) - if is_row_processor: + if udf_sig.is_row_processor: + msg = bfe.format_message("input_types=Series is in preview.") + warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) + assert session is not None # appease mypy return decorator( bq_functions.BigqueryCallableRowRoutine( udf_definition, session, local_func=func, is_managed=True ) ) else: + assert session is not None # appease mypy return decorator( bq_functions.BigqueryCallableRoutine( udf_definition, @@ -981,31 +913,31 
@@ def deploy_udf( return self.udf(**kwargs)(func) -def _convert_row_processor_sig( - signature: inspect.Signature, -) -> Optional[inspect.Signature]: - import bigframes.series as bf_series - - if len(signature.parameters) >= 1: - first_param = next(iter(signature.parameters.values())) - param_type = first_param.annotation - # Type hints for Series inputs should use pandas.Series because the - # underlying serialization process converts the input to a string - # representation of a pandas Series (not bigframes Series). Using - # bigframes Series will lead to TypeError when creating the function - # remotely. See more from b/445182819. - if param_type == bf_series.Series: - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "Argument type hint must be Pandas Series, not BigFrames Series.", +def _resolve_signature( + py_sig: inspect.Signature, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, +) -> inspect.Signature: + if input_types is not None: + if not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + if _utils.has_conflict_input_type(py_sig, input_types): + msg = bfe.format_message( + "Conflicting input types detected, using the one from the decorator." ) - if param_type == pandas.Series: - msg = bfe.format_message("input_types=Series is in preview.") - warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) - return signature.replace( - parameters=[ - p.replace(annotation=str) if i == 0 else p - for i, p in enumerate(signature.parameters.values()) - ] + warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) + py_sig = py_sig.replace( + parameters=[ + par.replace(annotation=itype) + for par, itype in zip(py_sig.parameters.values(), input_types) + ] + ) + if output_type: + if _utils.has_conflict_output_type(py_sig, output_type): + msg = bfe.format_message( + "Conflicting return type detected, using the one from the decorator." 
) - return None + warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) + py_sig = py_sig.replace(return_annotation=output_type) + + return py_sig diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py index b6dedeac504..e02cd94fb18 100644 --- a/bigframes/functions/_utils.py +++ b/bigframes/functions/_utils.py @@ -75,12 +75,12 @@ def _package_existed(package_requirements: list[str], package: str) -> bool: def get_updated_package_requirements( - package_requirements=None, - is_row_processor=False, - capture_references=True, - ignore_package_version=False, -): - requirements = [] + package_requirements: Sequence[str] = (), + is_row_processor: bool = False, + capture_references: bool = True, + ignore_package_version: bool = False, +) -> Sequence[str]: + requirements: list[str] = [] if capture_references: requirements.append(f"cloudpickle=={cloudpickle.__version__}") @@ -110,13 +110,12 @@ def get_updated_package_requirements( if not requirements: return package_requirements - if not package_requirements: - package_requirements = [] + result = list(package_requirements) for package in requirements: - if not _package_existed(package_requirements, package): - package_requirements.append(package) + if not _package_existed(result, package): + result.append(package) - return sorted(package_requirements) + return sorted(result) def clean_up_by_session_id( @@ -183,6 +182,11 @@ def clean_up_by_session_id( pass +def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: + return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" + + +# Deprecated: Use CodeDef.stable_hash() instead. def get_hash(def_, package_requirements=None): "Get hash (32 digits alphanumeric) of a function." 
# There is a known cell-id sensitivity of the cloudpickle serialization in @@ -208,46 +212,28 @@ def get_hash(def_, package_requirements=None): return hashlib.md5(def_repr).hexdigest() -def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: - return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" - - -def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): - "Get a name for the cloud function for the given user defined function." - parts = [_BIGFRAMES_FUNCTION_PREFIX] - if session_id: - parts.append(session_id) - parts.append(function_hash) - if uniq_suffix: - parts.append(uniq_suffix) - return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) - - -def get_bigframes_function_name(function_hash, session_id, uniq_suffix=None): - "Get a name for the bigframes function for the given user defined function." - parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function_hash] - if uniq_suffix: - parts.append(uniq_suffix) - return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) - - -def get_python_output_type_from_bigframes_metadata( +def get_python_output_type_str_from_bigframes_metadata( metadata_text: str, -) -> Optional[type]: +) -> Optional[str]: try: metadata_dict = json.loads(metadata_text) except (TypeError, json.decoder.JSONDecodeError): return None - try: - output_type = metadata_dict["value"]["python_array_output_type"] + return metadata_dict["value"]["python_array_output_type"] except KeyError: return None + +def get_python_output_type_from_bigframes_metadata( + metadata_text: str, +) -> Optional[type]: + output_type_str = get_python_output_type_str_from_bigframes_metadata(metadata_text) + for ( python_output_array_type ) in function_typing.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: - if python_output_array_type.__name__ == output_type: + if python_output_array_type.__name__ == output_type_str: return list[python_output_array_type] # type: ignore return None @@ -293,20 +279,6 @@ def 
get_python_version(is_compat: bool = False) -> str: return f"python{major}{minor}" if is_compat else f"python-{major}.{minor}" -def build_unnest_post_routine(py_list_type: type[list]): - sdk_type = function_typing.sdk_array_output_type_from_python_type(py_list_type) - assert sdk_type.array_element_type is not None - inner_sdk_type = sdk_type.array_element_type - result_dtype = function_typing.sdk_type_to_bf_type(inner_sdk_type) - - def post_process(input): - import bigframes.bigquery as bbq - - return bbq.json_extract_string_array(input, value_dtype=result_dtype) - - return post_process - - def has_conflict_input_type( signature: inspect.Signature, input_types: Sequence[Any], diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 4e06cb16633..18a000c722f 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -15,7 +15,7 @@ from __future__ import annotations import logging -from typing import Callable, cast, get_origin, Optional, TYPE_CHECKING +from typing import Callable, Optional, TYPE_CHECKING if TYPE_CHECKING: from bigframes.session import Session @@ -26,7 +26,7 @@ import bigframes.formatting_helpers as bf_formatting from bigframes.functions import _function_session as bff_session -from bigframes.functions import _utils, function_typing, udf_def +from bigframes.functions import function_typing, udf_def logger = logging.getLogger(__name__) @@ -82,39 +82,30 @@ def _try_import_routine( routine: bigquery.Routine, session: bigframes.Session ) -> BigqueryCallableRoutine: udf_def = _routine_as_udf_def(routine) - override_type = _get_output_type_override(routine) is_remote = ( hasattr(routine, "remote_function_options") and routine.remote_function_options ) - if override_type is not None: - return BigqueryCallableRoutine( - udf_def, - session, - post_routine=_utils.build_unnest_post_routine(override_type), - ) return BigqueryCallableRoutine(udf_def, session, is_managed=not is_remote) def 
_try_import_row_routine( routine: bigquery.Routine, session: bigframes.Session ) -> BigqueryCallableRowRoutine: - udf_def = _routine_as_udf_def(routine) - override_type = _get_output_type_override(routine) + udf_def = _routine_as_udf_def(routine, is_row_processor=True) + is_remote = ( hasattr(routine, "remote_function_options") and routine.remote_function_options ) - if override_type is not None: - return BigqueryCallableRowRoutine( - udf_def, - session, - post_routine=_utils.build_unnest_post_routine(override_type), - ) return BigqueryCallableRowRoutine(udf_def, session, is_managed=not is_remote) -def _routine_as_udf_def(routine: bigquery.Routine) -> udf_def.BigqueryUdf: +def _routine_as_udf_def( + routine: bigquery.Routine, is_row_processor: bool = False +) -> udf_def.BigqueryUdf: try: - return udf_def.BigqueryUdf.from_routine(routine) + return udf_def.BigqueryUdf.from_routine( + routine, is_row_processor=is_row_processor + ) except udf_def.ReturnTypeMissingError: raise bf_formatting.create_exception_with_feedback_link( ValueError, "Function return type must be specified." 
@@ -126,30 +117,6 @@ def _routine_as_udf_def(routine: bigquery.Routine) -> udf_def.BigqueryUdf: ) -def _get_output_type_override(routine: bigquery.Routine) -> Optional[type[list]]: - if routine.description is not None and isinstance(routine.description, str): - if python_output_type := _utils.get_python_output_type_from_bigframes_metadata( - routine.description - ): - bq_return_type = cast(bigquery.StandardSqlDataType, routine.return_type) - - if bq_return_type is None or bq_return_type.type_kind != "STRING": - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "An explicit output_type should be provided only for a BigQuery function with STRING output.", - ) - if get_origin(python_output_type) is list: - return python_output_type - else: - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "Currently only list of " - "a type is supported as python output type.", - ) - - return None - - # TODO(b/399894805): Support managed function. def read_gbq_function( function_name: str, @@ -178,6 +145,7 @@ def read_gbq_function( ValueError, f"Unknown function '{routine_ref}'." ) + # TODO(493293086): Deprecate is_row_processor. 
if is_row_processor: return _try_import_row_routine(routine, session) else: @@ -198,14 +166,10 @@ def __init__( *, local_func: Optional[Callable] = None, cloud_function_ref: Optional[str] = None, - post_routine: Optional[ - Callable[[bigframes.series.Series], bigframes.series.Series] - ] = None, is_managed: bool = False, ): self._udf_def = udf_def self._session = session - self._post_routine = post_routine self._local_fun = local_func self._cloud_function = cloud_function_ref self._is_managed = is_managed @@ -250,22 +214,15 @@ def bigframes_cloud_function(self) -> Optional[str]: @property def input_dtypes(self): - return self.udf_def.signature.bf_input_types + return tuple(arg.bf_type for arg in self.udf_def.signature.inputs) @property def output_dtype(self): - return self.udf_def.signature.bf_output_type + return self.udf_def.signature.output.bf_type @property def bigframes_bigquery_function_output_dtype(self): - return self.output_dtype - - def _post_process_series( - self, series: bigframes.series.Series - ) -> bigframes.series.Series: - if self._post_routine is not None: - return self._post_routine(series) - return series + return self.udf_def.signature.output.emulating_type.bf_type class BigqueryCallableRowRoutine: @@ -282,14 +239,11 @@ def __init__( *, local_func: Optional[Callable] = None, cloud_function_ref: Optional[str] = None, - post_routine: Optional[ - Callable[[bigframes.series.Series], bigframes.series.Series] - ] = None, is_managed: bool = False, ): + assert udf_def.signature.is_row_processor self._udf_def = udf_def self._session = session - self._post_routine = post_routine self._local_fun = local_func self._cloud_function = cloud_function_ref self._is_managed = is_managed @@ -334,19 +288,12 @@ def bigframes_cloud_function(self) -> Optional[str]: @property def input_dtypes(self): - return self.udf_def.signature.bf_input_types + return tuple(arg.bf_type for arg in self.udf_def.signature.inputs) @property def output_dtype(self): - return 
self.udf_def.signature.bf_output_type + return self.udf_def.signature.output.bf_type @property def bigframes_bigquery_function_output_dtype(self): - return self.output_dtype - - def _post_process_series( - self, series: bigframes.series.Series - ) -> bigframes.series.Series: - if self._post_routine is not None: - return self._post_routine(series) - return series + return self.udf_def.signature.output.emulating_type.bf_type diff --git a/bigframes/functions/function_template.py b/bigframes/functions/function_template.py index e48ffda8ed1..33a3688cf19 100644 --- a/bigframes/functions/function_template.py +++ b/bigframes/functions/function_template.py @@ -19,18 +19,12 @@ import os import re import textwrap -from typing import Tuple -import cloudpickle +from bigframes.functions import udf_def logger = logging.getLogger(__name__) -# Protocol version 4 is available in python version 3.4 and above -# https://docs.python.org/3/library/pickle.html#data-stream-format -_pickle_protocol_version = 4 - - # Placeholder variables for testing. 
input_types = ("STRING",) output_type = "STRING" @@ -228,38 +222,39 @@ def udf_http_row_processor(request): return jsonify({"errorMessage": traceback.format_exc()}), 400 -def generate_udf_code(def_, directory): +def generate_udf_code(code_def: udf_def.CodeDef, directory: str): """Generate serialized code using cloudpickle given a udf.""" udf_code_file_name = "udf.py" udf_pickle_file_name = "udf.cloudpickle" # original code, only for debugging purpose - udf_code = textwrap.dedent(inspect.getsource(def_)) - udf_code_file_path = os.path.join(directory, udf_code_file_name) - with open(udf_code_file_path, "w") as f: - f.write(udf_code) + if code_def.function_source: + udf_code_file_path = os.path.join(directory, udf_code_file_name) + with open(udf_code_file_path, "w") as f: + f.write(code_def.function_source) # serialized udf udf_pickle_file_path = os.path.join(directory, udf_pickle_file_name) # TODO(b/345433300): try io.BytesIO to avoid writing to the file system with open(udf_pickle_file_path, "wb") as f: - cloudpickle.dump(def_, f, protocol=_pickle_protocol_version) + f.write(code_def.pickled_code) return udf_code_file_name, udf_pickle_file_name def generate_cloud_function_main_code( - def_, - directory, + code_def: udf_def.CodeDef, + directory: str, *, - input_types: Tuple[str], - output_type: str, - is_row_processor=False, + udf_signature: udf_def.UdfSignature, ): """Get main.py code for the cloud function for the given user defined function.""" # Pickle the udf with all its dependencies - udf_code_file, udf_pickle_file = generate_udf_code(def_, directory) + udf_code_file, udf_pickle_file = generate_udf_code(code_def, directory) + + input_types = tuple(arg.sql_type for arg in udf_signature.inputs) + output_type = udf_signature.output.sql_type code_blocks = [ f"""\ @@ -278,7 +273,7 @@ def generate_cloud_function_main_code( # For converting scalar outputs to the correct type. 
code_blocks.append(inspect.getsource(convert_to_bq_json)) - if is_row_processor: + if udf_signature.is_row_processor: code_blocks.append(inspect.getsource(get_pd_series)) handler_func_name = "udf_http_row_processor" code_blocks.append(inspect.getsource(udf_http_row_processor)) @@ -297,37 +292,37 @@ def generate_cloud_function_main_code( def generate_managed_function_code( - def_, - udf_name: str, - is_row_processor: bool, + code_def: udf_def.CodeDef, + signature: udf_def.UdfSignature, capture_references: bool, ) -> str: """Generates the Python code block for managed Python UDF.""" + udf_name = "unpickled_udf" if capture_references: # This code path ensures that if the udf body contains any # references to variables and/or imports outside the body, they are # captured as well. - import cloudpickle - - pickled = cloudpickle.dumps(def_) func_code = textwrap.dedent( f""" import cloudpickle - {udf_name} = cloudpickle.loads({pickled}) + {udf_name} = cloudpickle.loads({code_def.pickled_code!r}) """ ) else: # This code path ensures that if the udf body is self contained, # i.e. there are no references to variables or imports outside the # body. 
- func_code = textwrap.dedent(inspect.getsource(def_)) + assert code_def.function_source is not None + assert code_def.entry_point is not None + func_code = code_def.function_source + udf_name = code_def.entry_point match = re.search(r"^def ", func_code, flags=re.MULTILINE) if match is None: raise ValueError("The UDF is not defined correctly.") func_code = func_code[match.start() :] - if is_row_processor: + if signature.is_row_processor: udf_code = textwrap.dedent(inspect.getsource(get_pd_series)) udf_code = udf_code[udf_code.index("def") :] bigframes_handler_code = textwrap.dedent( @@ -337,20 +332,19 @@ def bigframes_handler(str_arg): """ ) - sig = inspect.signature(def_) - params = list(sig.parameters.values()) + params = list(arg.name for arg in signature.inputs) additional_params = params[1:] # Build the parameter list for the new handler function definition. # e.g., "str_arg, y: bool, z" handler_def_parts = ["str_arg"] - handler_def_parts.extend(str(p) for p in additional_params) + handler_def_parts.extend(additional_params) handler_def_str = ", ".join(handler_def_parts) # Build the argument list for the call to the original UDF. # e.g., "get_pd_series(str_arg), y, z" udf_call_parts = [f"{get_pd_series.__name__}(str_arg)"] - udf_call_parts.extend(p.name for p in additional_params) + udf_call_parts.extend(additional_params) udf_call_str = ", ".join(udf_call_parts) bigframes_handler_code = textwrap.dedent( @@ -370,7 +364,7 @@ def bigframes_handler(*args): ) udf_code_block = [] - if not capture_references and is_row_processor: + if not capture_references and signature.is_row_processor: # Enable postponed evaluation of type annotations. 
This converts all # type hints to strings at runtime, which is necessary for correctly # handling the type annotation of pandas.Series after the UDF code is diff --git a/bigframes/functions/function_typing.py b/bigframes/functions/function_typing.py index 30804f317c4..43ccfe9b25b 100644 --- a/bigframes/functions/function_typing.py +++ b/bigframes/functions/function_typing.py @@ -81,7 +81,7 @@ def __init__(self, type_, supported_types): def sdk_type_from_python_type( - t: type, allow_lists: bool = False + t: type, allow_lists: bool = True ) -> bigquery.StandardSqlDataType: if (get_origin(t) is list) and allow_lists: return sdk_array_output_type_from_python_type(t) diff --git a/bigframes/functions/udf_def.py b/bigframes/functions/udf_def.py index 078e45f32d4..3ebf2eeb47a 100644 --- a/bigframes/functions/udf_def.py +++ b/bigframes/functions/udf_def.py @@ -14,160 +14,531 @@ from __future__ import annotations import dataclasses +import functools import inspect -from typing import cast, Optional +import io +import os +import textwrap +from typing import Any, cast, get_args, get_origin, Optional, Sequence, Type import warnings +import cloudpickle from google.cloud import bigquery +import google_crc32c +import pandas as pd import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting from bigframes.functions import function_typing +# Protocol version 4 is available in python version 3.4 and above +# https://docs.python.org/3/library/pickle.html#data-stream-format +_pickle_protocol_version = 4 + class ReturnTypeMissingError(ValueError): pass @dataclasses.dataclass(frozen=True) -class UdfField: +class UdfArg: name: str = dataclasses.field() - dtype: bigquery.StandardSqlDataType = dataclasses.field(hash=False, compare=False) + dtype: DirectScalarType | RowSeriesInputFieldV1 + + def __post_init__(self): + assert isinstance(self.name, str) + assert isinstance(self.dtype, (DirectScalarType, RowSeriesInputFieldV1)) + + 
@classmethod + def from_py_param(cls, param: inspect.Parameter) -> UdfArg: + if param.annotation == pd.Series: + return cls(param.name, RowSeriesInputFieldV1()) + return cls(param.name, DirectScalarType(param.annotation)) @classmethod - def from_sdk(cls, arg: bigquery.RoutineArgument) -> UdfField: + def from_sdk(cls, arg: bigquery.RoutineArgument) -> UdfArg: assert arg.name is not None - assert arg.data_type is not None - return cls(arg.name, arg.data_type) + + if arg.data_type is None: + msg = bfe.format_message( + "The function has one or more missing input data types. BigQuery DataFrames " + f"will assume default data type {function_typing.DEFAULT_RF_TYPE} for them." + ) + warnings.warn(msg, category=bfe.UnknownDataTypeWarning) + sdk_type = function_typing.DEFAULT_RF_TYPE + else: + sdk_type = arg.data_type + return cls(arg.name, DirectScalarType.from_sdk_type(sdk_type)) + + @property + def py_type(self) -> type: + return self.dtype.py_type + + @property + def bf_type(self) -> bigframes.dtypes.Dtype: + return self.dtype.bf_type + + @property + def sql_type(self) -> str: + return self.dtype.sql_type + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.name.encode()) + hash_val.update(self.dtype.stable_hash()) + return hash_val.digest() @dataclasses.dataclass(frozen=True) -class UdfSignature: - input_types: tuple[UdfField, ...] = dataclasses.field() - output_bq_type: bigquery.StandardSqlDataType = dataclasses.field( - hash=False, compare=False - ) +class DirectScalarType: + """ + Represents a scalar value that is passed directly to the remote function. + + For these values, BigQuery handles the serialization and deserialization without any additional processing. 
+ """ + + _py_type: type @property - def bf_input_types(self) -> tuple[bigframes.dtypes.Dtype, ...]: - return tuple( - function_typing.sdk_type_to_bf_type(arg.dtype) for arg in self.input_types + def py_type(self) -> type: + return self._py_type + + @property + def bf_type(self) -> bigframes.dtypes.Dtype: + return function_typing.sdk_type_to_bf_type( + function_typing.sdk_type_from_python_type(self._py_type) ) @property - def bf_output_type(self) -> bigframes.dtypes.Dtype: - return function_typing.sdk_type_to_bf_type(self.output_bq_type) + def sql_type(self) -> str: + sdk_type = function_typing.sdk_type_from_python_type(self._py_type) + return function_typing.sdk_type_to_sql_string(sdk_type) + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self._py_type.__name__.encode()) + return hash_val.digest() + + @classmethod + def from_sdk_type(cls, sdk_type: bigquery.StandardSqlDataType) -> DirectScalarType: + return cls(function_typing.sdk_type_to_py_type(sdk_type)) @property - def py_input_types(self) -> tuple[type, ...]: - return tuple( - function_typing.sdk_type_to_py_type(arg.dtype) for arg in self.input_types - ) + def emulating_type(self) -> DirectScalarType: + return self + + +@dataclasses.dataclass(frozen=True) +class VirtualListTypeV1: + """ + Represents a list of scalar values that is emulated as a JSON array string in the remote function. + + Only works as output paramter right now where array -> string in function runtime, and then string -> array in SQL post-processing (defined in out_expr()). 
+ """ + + _PROTOCOL_ID = "virtual_list_v1" + + inner_dtype: DirectScalarType + + @property + def py_type(self) -> Type[list[Any]]: + return list[self.inner_dtype.py_type] # type: ignore + + @property + def bf_type(self) -> bigframes.dtypes.Dtype: + return bigframes.dtypes.list_type(self.inner_dtype.bf_type) @property - def py_output_type(self) -> type: - return function_typing.sdk_type_to_py_type(self.output_bq_type) + def emulating_type(self) -> DirectScalarType: + # Regardless of list inner type, string is used to emulate the list in the remote function. + return DirectScalarType(str) + + def out_expr( + self, expr: bigframes.core.expression.Expression + ) -> bigframes.core.expression.Expression: + # essentially we are undoing json.dumps in sql + import bigframes.operations as ops + + as_str_list = ops.JSONValueArray(json_path="$").as_expr(expr) + if self.inner_dtype.py_type is str: + return as_str_list + elif self.inner_dtype.py_type is bool: + # hack so we don't need to make ArrayMap support general expressions yet + # with b/495513753 we can map the equality operator instead + return ops.ArrayMapOp(ops.IsInOp(values=("true",))).as_expr(as_str_list) + else: + return ops.ArrayMapOp(ops.AsTypeOp(self.inner_dtype.bf_type)).as_expr( + as_str_list + ) + + @property + def sql_type(self) -> str: + return f"ARRAY<{self.inner_dtype.sql_type}>" + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self._PROTOCOL_ID.encode()) + hash_val.update(self.inner_dtype.stable_hash()) + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class RowSeriesInputFieldV1: + """ + Used to handle functions that logically take a series as an input, but handled via a string protocol in the remote function. + + For these, the serialization is dependent on index metadata, which must be provided by the caller. 
+ """ + + _PROTOCOL_ID = "row_series_input_v1" + + @property + def py_type(self) -> type: + return pd.Series + + @property + def bf_type(self) -> bigframes.dtypes.Dtype: + # Code paths shouldn't hit this. + raise ValueError("Series does not have a corresponding BigFrames type.") + + @property + def sql_type(self) -> str: + return "STRING" @property - def sql_input_types(self) -> tuple[str, ...]: - return tuple( - function_typing.sdk_type_to_sql_string(arg.dtype) - for arg in self.input_types + def emulating_type(self) -> DirectScalarType: + # Regardless of list inner type, string is used to emulate the list in the remote function. + return DirectScalarType(str) + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self._PROTOCOL_ID.encode()) + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class UdfSignature: + """ + Represents the mapping of input types from bigframes to sql to python and back. + """ + + inputs: tuple[UdfArg, ...] = dataclasses.field() + output: DirectScalarType | VirtualListTypeV1 + + def __post_init__(self): + # Validate inputs and outputs are of the correct types. 
+ assert all(isinstance(arg, UdfArg) for arg in self.inputs) + assert isinstance(self.output, (DirectScalarType, VirtualListTypeV1)) + + def to_sql_input_signature(self) -> str: + return ",".join( + f"{field.name} {field.sql_type}" + for field in self.with_devirtualize().inputs ) @property - def sql_output_type(self) -> str: - return function_typing.sdk_type_to_sql_string(self.output_bq_type) + def protocol_metadata(self) -> str | None: + import bigframes.functions._utils + + if isinstance(self.output, VirtualListTypeV1): + return bigframes.functions._utils.get_bigframes_metadata( + python_output_type=self.output.py_type + ) + return None + + @property + def is_virtual(self) -> bool: + dtypes = (self.output,) + tuple(arg.dtype for arg in self.inputs) + return not all(isinstance(dtype, DirectScalarType) for dtype in dtypes) + + @property + def is_row_processor(self) -> bool: + return any(isinstance(arg.dtype, RowSeriesInputFieldV1) for arg in self.inputs) + + def with_devirtualize(self) -> UdfSignature: + return UdfSignature( + inputs=tuple( + UdfArg(arg.name, arg.dtype.emulating_type) for arg in self.inputs + ), + output=self.output.emulating_type, + ) + # TODO(493293086): Deprecate is_row_processor. @classmethod - def from_routine(cls, routine: bigquery.Routine) -> UdfSignature: + def from_routine( + cls, routine: bigquery.Routine, is_row_processor: bool = False + ) -> UdfSignature: + import bigframes.functions._utils + + ## Handle return type if routine.return_type is None: - raise ReturnTypeMissingError + raise ReturnTypeMissingError( + f"Routine {routine} has no return type. 
Routine properties: {routine._properties}" + ) + bq_return_type = cast(bigquery.StandardSqlDataType, routine.return_type) - if ( - bq_return_type.type_kind is None - or bq_return_type.type_kind - not in function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + return_type: DirectScalarType | VirtualListTypeV1 = ( + DirectScalarType.from_sdk_type(bq_return_type) + ) + if python_output_type := bigframes.functions._utils.get_python_output_type_from_bigframes_metadata( + routine.description ): - raise ValueError( - f"Remote function must have one of the following supported output types: {function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS}" - ) - - udf_fields = [] - for argument in routine.arguments: - if argument.data_type is None: - msg = bfe.format_message( - "The function has one or more missing input data types. BigQuery DataFrames " - f"will assume default data type {function_typing.DEFAULT_RF_TYPE} for them." - ) - warnings.warn(msg, category=bfe.UnknownDataTypeWarning) - assert argument.name is not None - udf_fields.append( - UdfField(argument.name, function_typing.DEFAULT_RF_TYPE) + if bq_return_type.type_kind != "STRING": + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "An explicit output_type should be provided only for a BigQuery function with STRING output.", ) + + if get_origin(python_output_type) is list: + inner_type = get_args(python_output_type)[0] + return_type = VirtualListTypeV1(DirectScalarType(inner_type)) else: - udf_fields.append(UdfField.from_sdk(argument)) + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "Currently only list of " + "a type is supported as python output type.", + ) + + ## Handle input types + udf_fields = [] + + for i, argument in enumerate(routine.arguments): + if is_row_processor and i == 0: + if argument.data_type.type_kind == "STRING": + udf_fields.append(UdfArg(argument.name, RowSeriesInputFieldV1())) + else: + raise ValueError( + "Row processor functions must have STRING input 
type as first argument." + ) + udf_fields.append(UdfArg.from_sdk(argument)) return cls( - input_types=tuple(udf_fields), - output_bq_type=bq_return_type, + inputs=tuple(udf_fields), + output=return_type, ) @classmethod def from_py_signature(cls, signature: inspect.Signature): - input_types: list[UdfField] = [] + import bigframes.series + + input_types: list[UdfArg] = [] for parameter in signature.parameters.values(): if parameter.annotation is inspect.Signature.empty: raise bf_formatting.create_exception_with_feedback_link( ValueError, "'input_types' was not set and parameter " f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function.", + "Types are required to use udfs.", + ) + if parameter.annotation is bigframes.series.Series: + raise TypeError( + "Argument type hint must be Pandas Series, not BigFrames Series." ) - bq_type = function_typing.sdk_type_from_python_type(parameter.annotation) - input_types.append(UdfField(parameter.name, bq_type)) + + input_types.append(UdfArg.from_py_param(parameter)) if signature.return_annotation is inspect.Signature.empty: raise bf_formatting.create_exception_with_feedback_link( ValueError, "'output_type' was not set and function is missing a " "return type annotation. 
Types are required to use " - "@remote_function.", + "udfs.", ) - output_bq_type = function_typing.sdk_type_from_python_type( - signature.return_annotation, - allow_lists=True, - ) - return cls(tuple(input_types), output_bq_type) + + output_type = DirectScalarType(signature.return_annotation) + return cls(tuple(input_types), output_type) + + def to_remote_function_compatible(self) -> UdfSignature: + # need to virtualize list outputs + if isinstance(self.output, DirectScalarType): + if get_origin(self.output.py_type) is list: + inner_py_type = get_args(self.output.py_type)[0] + return UdfSignature( + inputs=self.inputs, + output=VirtualListTypeV1(DirectScalarType(inner_py_type)), + ) + return self + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + for input_type in self.inputs: + hash_val.update(input_type.stable_hash()) + hash_val.update(self.output.stable_hash()) + return hash_val.digest() @dataclasses.dataclass(frozen=True) class BigqueryUdf: + """ + Represents the information needed to call a BigQuery remote function - not a full spec. 
+ """ + routine_ref: bigquery.RoutineReference = dataclasses.field() signature: UdfSignature - # Used to provide alternative interpretations of output bq type, eg interpret int as timestamp - output_type_override: Optional[bigframes.dtypes.Dtype] = dataclasses.field( - default=None - ) - @property - def bigframes_output_type(self) -> bigframes.dtypes.Dtype: - return self.output_type_override or function_typing.sdk_type_to_bf_type( - self.signature.output_bq_type + def with_devirtualize(self) -> BigqueryUdf: + if not self.signature.is_virtual: + return self + return BigqueryUdf( + routine_ref=self.routine_ref, + signature=self.signature.with_devirtualize(), ) @classmethod - def from_routine(cls, routine: bigquery.Routine) -> BigqueryUdf: - signature = UdfSignature.from_routine(routine) - - if ( - signature.output_bq_type.type_kind is None - or signature.output_bq_type.type_kind - not in function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS - ): - raise ValueError( - f"Remote function must have one of the following supported output types: {function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS}" - ) + def from_routine( + cls, routine: bigquery.Routine, is_row_processor: bool = False + ) -> BigqueryUdf: + signature = UdfSignature.from_routine( + routine, is_row_processor=is_row_processor + ) return cls(routine.reference, signature=signature) + + +@dataclasses.dataclass(frozen=True) +class CodeDef: + # Produced by cloudpickle, not compatible across python versions + pickled_code: bytes + # This is just the function itself, and does not include referenced objects/functions/modules + function_source: Optional[str] + entry_point: Optional[str] + package_requirements: tuple[str, ...] 
+ + @classmethod + def from_func(cls, func, package_requirements: Sequence[str] | None = None): + bytes_io = io.BytesIO() + cloudpickle.dump(func, bytes_io, protocol=_pickle_protocol_version) + source = None + entry_point = None + try: + # dedent is hacky, but works for some nested functions + source = textwrap.dedent(inspect.getsource(func)) + entry_point = func.__name__ + except OSError: + pass + return cls( + pickled_code=bytes_io.getvalue(), + function_source=source, + entry_point=entry_point, + package_requirements=tuple(package_requirements or []), + ) + + @functools.cache + def stable_hash(self) -> bytes: + # There is a known cell-id sensitivity of the cloudpickle serialization in + # notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of + # this, if a cell contains a udf decorated with @remote_function, a unique + # cloudpickle code is generated every time the cell is run, creating new + # cloud artifacts every time. This is slow and wasteful. + # A workaround of the same can be achieved by replacing the filename in the + # code object to a static value + # https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661. + # + # To respect the user code/environment let's make this modification on a + # copy of the udf, not on the original udf itself. 
+ def_copy = cloudpickle.loads(self.pickled_code) + def_copy.__code__ = def_copy.__code__.replace( + co_filename="bigframes_place_holder_filename" + ) + + normalized_pickled_code = cloudpickle.dumps( + def_copy, protocol=_pickle_protocol_version + ) + + hash_val = google_crc32c.Checksum() + hash_val.update(normalized_pickled_code) + + if self.package_requirements: + for p in sorted(self.package_requirements): + hash_val.update(p.encode()) + + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class ManagedFunctionConfig: + code: CodeDef + signature: UdfSignature + max_batching_rows: Optional[int] + container_cpu: Optional[float] + container_memory: Optional[str] + bq_connection_id: Optional[str] + # capture_refernces=True -> deploy as cloudpickle + # capture_references=False -> deploy as source + capture_references: bool = False + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.code.stable_hash()) + hash_val.update(self.signature.stable_hash()) + hash_val.update(str(self.max_batching_rows).encode()) + hash_val.update(str(self.container_cpu).encode()) + hash_val.update(str(self.container_memory).encode()) + hash_val.update(str(self.bq_connection_id).encode()) + hash_val.update(str(self.capture_references).encode()) + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class CloudRunFunctionConfig: + code: CodeDef + signature: UdfSignature + timeout_seconds: int | None + max_instance_count: int | None + vpc_connector: str | None + vpc_connector_egress_settings: str + memory_mib: int | None + cpus: float | None + ingress_settings: str + workers: int | None + threads: int | None + concurrency: int | None + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.code.stable_hash()) + hash_val.update(self.signature.stable_hash()) + hash_val.update(str(self.timeout_seconds).encode()) + hash_val.update(str(self.max_instance_count).encode()) + 
hash_val.update(str(self.vpc_connector).encode()) + hash_val.update(str(self.vpc_connector_egress_settings).encode()) + hash_val.update(str(self.memory_mib).encode()) + hash_val.update(str(self.cpus).encode()) + hash_val.update(str(self.ingress_settings).encode()) + hash_val.update(str(self.workers).encode()) + hash_val.update(str(self.threads).encode()) + hash_val.update(str(self.concurrency).encode()) + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class RemoteFunctionConfig: + """ + Represents the information needed to create a BigQuery remote function. + """ + + endpoint: str + signature: UdfSignature + connection_id: str + max_batching_rows: int + bq_metadata: str | None = None + + @classmethod + def from_bq_routine(cls, routine: bigquery.Routine) -> RemoteFunctionConfig: + return cls( + endpoint=routine.remote_function_options.endpoint, + connection_id=os.path.basename(routine.remote_function_options.connection), + signature=UdfSignature.from_routine(routine), + max_batching_rows=routine.remote_function_options.max_batching_rows, + bq_metadata=routine.description, + ) + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.endpoint.encode()) + hash_val.update(self.signature.stable_hash()) + hash_val.update(self.connection_id.encode()) + hash_val.update(str(self.max_batching_rows).encode()) + hash_val.update(str(self.bq_metadata).encode()) + return hash_val.digest() diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 68842961e3f..78f592a10e7 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -286,7 +286,7 @@ class MultimodalEmbeddingGenerator(base.RetriableRemotePredictor): """Multimodal embedding generator LLM model. .. 
note:: - BigFrames Blob is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + BigFrames ObjectRef is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). @@ -374,7 +374,7 @@ def predict( Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. - The content column must be of string type or BigFrames Blob of image or video. + The content column must be of string type or BigFrames `ObjectRef `_ of image or video. max_retries (int, default 0): Max number of retries if the prediction for any rows failed. Each try needs to make progress (i.e. has successfully predicted rows) to continue the retry. @@ -668,13 +668,13 @@ def predict( prompt (Iterable of str or bigframes.series.Series, or None, default None): .. note:: - BigFrames Blob is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + BigFrames ObjectRef is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). Construct a prompt struct column for prediction based on the input. 
The input must be an Iterable that can take string literals, - such as "summarize", string column(s) of X, such as X["str_col"], or blob column(s) of X, such as X["blob_col"]. + such as "summarize", string column(s) of X, such as X["str_col"], or `ObjectRef column(s) `_ of X, such as X["objectref_col"]. It creates a struct column of the items of the iterable, and use the concatenated result as the input prompt. No-op if set to None. output_schema (Mapping[str, str] or None, default None): The schema used to generate structured output as a bigframes DataFrame. The schema is a string key-value pair of :. diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a1c7754ab5c..9f585843b84 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -25,6 +25,7 @@ ) from bigframes.operations.array_ops import ( ArrayIndexOp, + ArrayMapOp, ArrayReduceOp, ArraySliceOp, ArrayToStringOp, @@ -440,4 +441,5 @@ "NUMPY_TO_OP", "ToArrayOp", "ArrayReduceOp", + "ArrayMapOp", ] diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 6921299acd8..456ef3ecda3 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -203,7 +203,7 @@ def map( has_blob_column = False for column in columns: if df[column].dtype == dtypes.OBJ_REF_DTYPE: - # Don't cast blob columns to string + # Don't cast ObjectRef columns to string has_blob_column = True continue @@ -612,8 +612,11 @@ def sim_join( >>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']}) >>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']}) - >>> df1.ai.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) - animal animal_1 + >>> res = df1.ai.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) + >>> print("---"); print(res) # doctest: +ELLIPSIS + --- + ... 
+ animal animal_1 0 monkey baboon 1 spider scorpion diff --git a/bigframes/operations/array_ops.py b/bigframes/operations/array_ops.py index 61ada59cc7b..c5694e50baa 100644 --- a/bigframes/operations/array_ops.py +++ b/bigframes/operations/array_ops.py @@ -88,3 +88,17 @@ def output_type(self, *input_types): assert dtypes.is_array_like(input_type) inner_type = dtypes.get_array_inner_type(input_type) return self.aggregation.output_type(inner_type) + + +@dataclasses.dataclass(frozen=True) +class ArrayMapOp(base_ops.UnaryOp): + name: typing.ClassVar[str] = "array_map" + # TODO(b/495513753): Generalize to chained expressions + map_op: base_ops.UnaryOp + + def output_type(self, *input_types): + input_type = input_types[0] + assert dtypes.is_array_like(input_type) + inner_type = dtypes.get_array_inner_type(input_type) + out_inner_type = self.map_op.output_type(inner_type) + return dtypes.list_type(out_inner_type) diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index e610ce61d6e..9c51210df0e 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -31,7 +31,7 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - return self.function_def.bigframes_output_type + return self.function_def.signature.output.bf_type @dataclasses.dataclass(frozen=True) @@ -44,7 +44,7 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - return self.function_def.bigframes_output_type + return self.function_def.signature.output.bf_type @dataclasses.dataclass(frozen=True) @@ -57,4 +57,4 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - return self.function_def.bigframes_output_type + return self.function_def.signature.output.bf_type diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index f237959d0d3..b4456986308 100644 --- a/bigframes/operations/semantics.py +++ 
b/bigframes/operations/semantics.py @@ -382,7 +382,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals has_blob_column = False for column in columns: if df[column].dtype == dtypes.OBJ_REF_DTYPE: - # Don't cast blob columns to string + # Don't cast ObjectRef columns to string has_blob_column = True continue @@ -501,7 +501,7 @@ def map( has_blob_column = False for column in columns: if df[column].dtype == dtypes.OBJ_REF_DTYPE: - # Don't cast blob columns to string + # Don't cast ObjectRef columns to string has_blob_column = True continue diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index fcb60bf7782..4db900e7761 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -12,7 +12,64 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""BigQuery DataFrames provides a DataFrame API backed by the BigQuery engine.""" +""" +The primary entry point for the BigQuery DataFrames (BigFrames) pandas-compatible API. + +**BigQuery DataFrames** provides a Pythonic DataFrame and machine learning (ML) API +powered by the BigQuery engine. The ``bigframes.pandas`` module implements a large +subset of the pandas API, allowing you to perform large-scale data analysis +using familiar pandas syntax while the computations are executed in the cloud. + +**Key Features:** + +* **Petabyte-Scale Scalability:** Handle datasets that exceed local memory by + offloading computation to the BigQuery distributed engine. +* **Pandas Compatibility:** Use common pandas methods like + :func:`~bigframes.pandas.DataFrame.groupby`, + :func:`~bigframes.pandas.DataFrame.merge`, + :func:`~bigframes.pandas.DataFrame.pivot_table`, and more on BigQuery-backed + :class:`~bigframes.pandas.DataFrame` objects. 
+* **Direct BigQuery Integration:** Read from and write to BigQuery tables and + queries with :func:`bigframes.pandas.read_gbq` and + :func:`bigframes.pandas.DataFrame.to_gbq`. +* **User-defined Functions (UDFs):** Effortlessly deploy Python functions + functions using the :func:`bigframes.pandas.remote_function` and + :func:`bigframes.pandas.udf` decorators. +* **Data Ingestion:** Support for various formats including CSV, Parquet, JSON, + and Arrow via :func:`bigframes.pandas.read_csv`, + :func:`bigframes.pandas.read_parquet`, etc., which are automatically uploaded + to BigQuery for processing. Convert any pandas DataFrame into a BigQuery + DataFrame using :func:`bigframes.pandas.read_pandas`. + +**Example usage:** + + >>> import bigframes.pandas as bpd + +Initialize session and set options. + + >>> bpd.options.bigquery.project = "your-project-id" # doctest: +SKIP + +Load data from a BigQuery public dataset. + + >>> df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") # doctest: +SKIP + +Perform familiar pandas operations that execute in the cloud. + + >>> top_names = ( + ... df.groupby("name") + ... .agg({"number": "sum"}) + ... .sort_values("number", ascending=False) + ... .head(10) + ... ) # doctest: +SKIP + +Bring the final, aggregated results back to local memory if needed. + + >>> local_df = top_names.to_pandas() # doctest: +SKIP + +BigQuery DataFrames is designed for data scientists and analysts who need the +power of BigQuery with the ease of use of pandas. It eliminates the "data +movement bottleneck" by keeping your data in BigQuery for processing. 
+""" from __future__ import annotations diff --git a/bigframes/series.py b/bigframes/series.py index 23799a0a43c..7eb30beb826 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2042,7 +2042,6 @@ def apply( result_series = self._apply_unary_op( ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True) ) - result_series = func._post_process_series(result_series) return result_series @@ -2095,7 +2094,6 @@ def combine( result_series = self._apply_binary_op( other, ops.BinaryRemoteFunctionOp(function_def=func.udf_def) ) - result_series = func._post_process_series(result_series) return result_series bf_op = python_ops.python_callable_to_op(func) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7ea6e999545..75be3022d7d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -265,15 +265,20 @@ def __init__( metrics=self._metrics, publisher=self._publisher, ) + + labels = {} + if not self._strictly_ordered: + labels["bigframes-mode"] = "unordered" + self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor( bqclient=self._clients_provider.bqclient, bqstoragereadclient=self._clients_provider.bqstoragereadclient, loader=self._loader, storage_manager=self._temp_storage_manager, - strictly_ordered=self._strictly_ordered, metrics=self._metrics, enable_polars_execution=context.enable_polars_execution, publisher=self._publisher, + labels=labels, ) def __del__(self): @@ -659,9 +664,11 @@ def read_gbq_query( ... WHERE year = 2016 ... GROUP BY pitcherFirstName, pitcherLastName ... ''', index_col="rowindex") - >>> df.head(2) + >>> print("START_OF_OUTPUT"); df.head(2) # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE + START_OF_OUTPUT + ... pitcherFirstName pitcherLastName averagePitchSpeed - rowindex + ... 
1 Albertin Chapman 96.514113 2 Zachary Britton 94.591039 @@ -2229,12 +2236,12 @@ def _create_temp_table( def from_glob_path( self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None ) -> dataframe.DataFrame: - r"""Create a BigFrames DataFrame that contains a BigFrames Blob column from a global wildcard path. + r"""Create a BigFrames DataFrame that contains a BigFrames `ObjectRef column `_ from a global wildcard path. This operation creates a temporary BQ Object Table under the hood and requires bigquery.connections.delegate permission or BigQuery Connection Admin role. If you have an existing BQ Object Table, use read_gbq_object_table(). .. note:: - BigFrames Blob is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + BigFrames ObjectRef is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). @@ -2247,7 +2254,7 @@ def from_glob_path( If None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach permission if the connection isn't fully set up. name (str): - The column name of the Blob column. + The column name of the ObjectRef column. Returns: bigframes.pandas.DataFrame: Result BigFrames DataFrame. @@ -2290,18 +2297,18 @@ def _create_bq_connection( def read_gbq_object_table( self, object_table: str, *, name: Optional[str] = None ) -> dataframe.DataFrame: - """Read an existing object table to create a BigFrames Blob DataFrame. Use the connection of the object table for the connection of the blob. + """Read an existing object table to create a BigFrames `ObjectRef `_ DataFrame. Use the connection of the object table for the connection of the ObjectRef. 
This function dosen't retrieve the object table data. If you want to read the data, use read_gbq() instead. .. note:: - BigFrames Blob is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + BigFrames ObjectRef is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). Args: object_table (str): name of the object table of form ... - name (str or None): the returned blob column name. + name (str or None): the returned ObjectRef column name. Returns: bigframes.pandas.DataFrame: diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 1e240a841c5..7cf9d9bd6dc 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -14,10 +14,10 @@ from __future__ import annotations +import concurrent.futures import math import threading from typing import Literal, Mapping, Optional, Sequence, Tuple -import weakref import google.api_core.exceptions from google.cloud import bigquery @@ -47,6 +47,7 @@ semi_executor, ) import bigframes.session._io.bigquery as bq_io +import bigframes.session.execution_cache as execution_cache import bigframes.session.execution_spec as ex_spec import bigframes.session.metrics import bigframes.session.planner @@ -59,58 +60,6 @@ _MAX_CLUSTER_COLUMNS = 4 MAX_SMALL_RESULT_BYTES = 10 * 1024 * 1024 * 1024 # 10G -SourceIdMapping = Mapping[str, str] - - -class ExecutionCache: - def __init__(self): - # current assumption is only 1 cache of a given node - # in future, might have multiple caches, with different layout, localities - self._cached_executions: weakref.WeakKeyDictionary[ - nodes.BigFrameNode, nodes.CachedTableNode - ] = 
weakref.WeakKeyDictionary() - self._uploaded_local_data: weakref.WeakKeyDictionary[ - local_data.ManagedArrowTable, - tuple[bq_data.BigqueryDataSource, SourceIdMapping], - ] = weakref.WeakKeyDictionary() - - @property - def mapping(self) -> Mapping[nodes.BigFrameNode, nodes.BigFrameNode]: - return self._cached_executions - - def cache_results_table( - self, - original_root: nodes.BigFrameNode, - data: bq_data.BigqueryDataSource, - ): - # Assumption: GBQ cached table uses field name as bq column name - scan_list = nodes.ScanList( - tuple( - nodes.ScanItem(field.id, field.id.sql) for field in original_root.fields - ) - ) - cached_replacement = nodes.CachedTableNode( - source=data, - scan_list=scan_list, - table_session=original_root.session, - original_node=original_root, - ) - assert original_root.schema == cached_replacement.schema - self._cached_executions[original_root] = cached_replacement - - def cache_remote_replacement( - self, - local_data: local_data.ManagedArrowTable, - bq_data: bq_data.BigqueryDataSource, - ): - # bq table has one extra column for offsets, those are implicit for local data - assert len(local_data.schema.items) + 1 == len(bq_data.table.physical_schema) - mapping = { - local_data.schema.items[i].column: bq_data.table.physical_schema[i].name - for i in range(len(local_data.schema)) - } - self._uploaded_local_data[local_data] = (bq_data, mapping) - class BigQueryCachingExecutor(executor.Executor): """Computes BigFrames values using BigQuery Engine. 
@@ -128,20 +77,20 @@ def __init__( bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, loader: loader.GbqDataLoader, *, - strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, enable_polars_execution: bool = False, publisher: bigframes.core.events.Publisher, + labels: Mapping[str, str] = {}, ): self.bqclient = bqclient self.storage_manager = storage_manager - self.strictly_ordered: bool = strictly_ordered - self.cache: ExecutionCache = ExecutionCache() + self.cache: execution_cache.ExecutionCache = execution_cache.ExecutionCache() self.metrics = metrics self.loader = loader self.bqstoragereadclient = bqstoragereadclient self._enable_polars_execution = enable_polars_execution self._publisher = publisher + self._labels = labels # TODO(tswast): Send events from semi-executors, too. self._semi_executors: Sequence[semi_executor.SemiExecutor] = ( @@ -298,7 +247,7 @@ def _export_gbq( ) sql = compiled.sql - if (existing_table is not None) and _if_schema_match( + if (existing_table is not None) and _is_schema_match( existing_table.schema, array_value.schema ): # b/409086472: Uses DML for table appends and replacements to avoid @@ -334,13 +283,14 @@ def _export_gbq( session=array_value.session, ) - has_timedelta_col = any( - t == bigframes.dtypes.TIMEDELTA_DTYPE for t in array_value.schema.dtypes + has_special_dtype_col = any( + t in (bigframes.dtypes.TIMEDELTA_DTYPE, bigframes.dtypes.OBJ_REF_DTYPE) + for t in array_value.schema.dtypes ) - if spec.if_exists != "append" and has_timedelta_col: + if spec.if_exists != "append" and has_special_dtype_col: # Only update schema if this is not modifying an existing table, and the - # new table contains timedelta columns. + # new table contains special columns (like timedelta or obj_ref). 
table = self.bqclient.get_table(spec.table) table.schema = array_value.schema.to_bigquery() self.bqclient.update_table(table, ["schema"]) @@ -409,8 +359,8 @@ def _run_execute_query( bigframes.options.compute.maximum_bytes_billed ) - if not self.strictly_ordered: - job_config.labels["bigframes-mode"] = "unordered" + if self._labels: + job_config.labels.update(self._labels) try: # Trick the type checker into thinking we got a literal. @@ -449,9 +399,6 @@ def _run_execute_query( else: raise - def replace_cached_subtrees(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: - return nodes.top_down(node, lambda x: self.cache.mapping.get(x, x)) - def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): """ Can the block be evaluated very cheaply? @@ -481,7 +428,7 @@ def prepare_plan( ): self._simplify_with_caching(plan) - plan = self.replace_cached_subtrees(plan) + plan = self.cache.subsitute_cached_subplans(plan) plan = rewrite.column_pruning(plan) plan = plan.top_down(rewrite.fold_row_counts) @@ -526,7 +473,7 @@ def _cache_with_session_awareness( self._cache_with_cluster_cols( bigframes.core.ArrayValue(target), cluster_cols_sql_names ) - elif self.strictly_ordered: + elif not target.order_ambiguous: self._cache_with_offsets(bigframes.core.ArrayValue(target)) else: self._cache_with_cluster_cols(bigframes.core.ArrayValue(target), []) @@ -551,7 +498,7 @@ def _cache_most_complex_subtree(self, node: nodes.BigFrameNode) -> bool: node, min_complexity=(QUERY_COMPLEXITY_LIMIT / 500), max_complexity=QUERY_COMPLEXITY_LIMIT, - cache=dict(self.cache.mapping), + cache=self.cache, # Heuristic: subtree_compleixty * (copies of subtree)^2 heuristic=lambda complexity, count: math.log(complexity) + 2 * math.log(count), @@ -568,49 +515,58 @@ def _substitute_large_local_sources(self, original_root: nodes.BigFrameNode): Replace large local sources with the uploaded version of those datasources. 
""" # Step 1: Upload all previously un-uploaded data + needs_upload = [] for leaf in original_root.unique_nodes(): if isinstance(leaf, nodes.ReadLocalNode): if ( leaf.local_data_source.metadata.total_bytes > bigframes.constants.MAX_INLINE_BYTES ): - self._upload_local_data(leaf.local_data_source) + needs_upload.append(leaf.local_data_source) + + futures: dict[concurrent.futures.Future, local_data.ManagedArrowTable] = dict() + for local_source in needs_upload: + future = self.loader.read_data_async( + local_source, bigframes.core.guid.generate_guid() + ) + futures[future] = local_source + try: + for future in concurrent.futures.as_completed(futures.keys()): + self.cache.cache_remote_replacement(futures[future], future.result()) + except Exception as e: + # cancel all futures + for future in futures: + future.cancel() + raise e # Step 2: Replace local scans with remote scans def map_local_scans(node: nodes.BigFrameNode): if not isinstance(node, nodes.ReadLocalNode): return node - if node.local_data_source not in self.cache._uploaded_local_data: - return node - bq_source, source_mapping = self.cache._uploaded_local_data[ + uploaded_local_data = self.cache.get_uploaded_local_data( node.local_data_source - ] - scan_list = node.scan_list.remap_source_ids(source_mapping) + ) + if uploaded_local_data is None: + return node + + scan_list = node.scan_list.remap_source_ids( + uploaded_local_data.source_mapping + ) # offsets_col isn't part of ReadTableNode, so emulate by adding to end of scan_list if node.offsets_col is not None: # Offsets are always implicitly the final column of uploaded data # See: Loader.load_data scan_list = scan_list.append( - bq_source.table.physical_schema[-1].name, + uploaded_local_data.bq_source.table.physical_schema[-1].name, bigframes.dtypes.INT_DTYPE, node.offsets_col, ) - return nodes.ReadTableNode(bq_source, scan_list, node.session) + return nodes.ReadTableNode( + uploaded_local_data.bq_source, scan_list, node.session + ) return 
original_root.bottom_up(map_local_scans) - def _upload_local_data(self, local_table: local_data.ManagedArrowTable): - if local_table in self.cache._uploaded_local_data: - return - # Lock prevents concurrent repeated work, but slows things down. - # Might be better as a queue and a worker thread - with self._upload_lock: - if local_table not in self.cache._uploaded_local_data: - uploaded = self.loader.load_data_or_write_data( - local_table, bigframes.core.guid.generate_guid() - ) - self.cache.cache_remote_replacement(local_table, uploaded) - def _execute_plan_gbq( self, plan: nodes.BigFrameNode, @@ -734,16 +690,16 @@ def _result_schema( ) -def _if_schema_match( - table_schema: Tuple[bigquery.SchemaField, ...], schema: schemata.ArraySchema +def _is_schema_match( + table_schema: Tuple[bigquery.SchemaField, ...], + schema: schemata.ArraySchema, ) -> bool: if len(table_schema) != len(schema.items): return False - for field in table_schema: - if field.name not in schema.names: + for field, schema_item in zip(table_schema, schema.items): + if field.name != schema_item.column: return False - if bigframes.dtypes.convert_schema_field(field)[1] != schema.get_type( - field.name - ): + _, field_dtype = bigframes.dtypes.convert_schema_field(field) + if field_dtype != schema_item.dtype: return False return True diff --git a/bigframes/session/execution_cache.py b/bigframes/session/execution_cache.py new file mode 100644 index 00000000000..782a1c5c4eb --- /dev/null +++ b/bigframes/session/execution_cache.py @@ -0,0 +1,88 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +from typing import Mapping, Optional +import weakref + +from bigframes.core import bq_data, local_data, nodes + +SourceIdMapping = Mapping[str, str] + + +@dataclasses.dataclass(frozen=True) +class UploadedLocalData: + bq_source: bq_data.BigqueryDataSource + source_mapping: SourceIdMapping + + +class ExecutionCache: + def __init__(self): + # effectively two separate caches that don't interact + self._cached_executions: weakref.WeakKeyDictionary[ + nodes.BigFrameNode, bq_data.BigqueryDataSource + ] = weakref.WeakKeyDictionary() + # This upload cache is entirely independent of the plan cache. 
+ self._uploaded_local_data: weakref.WeakKeyDictionary[ + local_data.ManagedArrowTable, + UploadedLocalData, + ] = weakref.WeakKeyDictionary() + + def subsitute_cached_subplans(self, root: nodes.BigFrameNode) -> nodes.BigFrameNode: + def replace_if_cached(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + if node not in self._cached_executions: + return node + # Assumption: GBQ cached table uses field name as bq column name + scan_list = nodes.ScanList( + tuple(nodes.ScanItem(field.id, field.id.sql) for field in node.fields) + ) + bq_data = self._cached_executions[node] + cached_replacement = nodes.CachedTableNode( + source=bq_data, + scan_list=scan_list, + table_session=node.session, + original_node=node, + ) + assert node.schema == cached_replacement.schema + return cached_replacement + + return nodes.top_down(root, replace_if_cached) + + def cache_results_table( + self, + original_root: nodes.BigFrameNode, + data: bq_data.BigqueryDataSource, + ): + self._cached_executions[original_root] = data + + ## Local data upload caching + def cache_remote_replacement( + self, + local_data: local_data.ManagedArrowTable, + bq_data: bq_data.BigqueryDataSource, + ): + # bq table has one extra column for offsets, those are implicit for local data + assert len(local_data.schema.items) + 1 == len(bq_data.table.physical_schema) + mapping = { + local_data.schema.items[i].column: bq_data.table.physical_schema[i].name + for i in range(len(local_data.schema)) + } + self._uploaded_local_data[local_data] = UploadedLocalData(bq_data, mapping) + + def get_uploaded_local_data( + self, local_data: local_data.ManagedArrowTable + ) -> Optional[UploadedLocalData]: + return self._uploaded_local_data.get(local_data) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 0944c0dab6f..7b5d1bcaf1f 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -300,6 +300,17 @@ def __init__( self._session = session self._clock = 
session_time.BigQuerySyncedClock(bqclient) self._clock.sync() + self._threadpool = concurrent.futures.ThreadPoolExecutor( + max_workers=1, thread_name_prefix="bigframes-loader" + ) + + def read_data_async( + self, local_data: local_data.ManagedArrowTable, offsets_col: str + ) -> concurrent.futures.Future[bq_data.BigqueryDataSource]: + future = self._threadpool.submit( + self._load_data_or_write_data, local_data, offsets_col + ) + return future def read_pandas( self, @@ -350,7 +361,7 @@ def read_managed_data( session=self._session, ) - def load_data_or_write_data( + def _load_data_or_write_data( self, data: local_data.ManagedArrowTable, offsets_col: str, diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index 26a944d760a..bd2fa41c5e9 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -508,20 +508,6 @@ def cleanup_function_assets( pass -def get_function_name(func, package_requirements=None, is_row_processor=False): - """Get a bigframes function name for testing given a udf.""" - # Augment user package requirements with any internal package - # requirements. - package_requirements = bff_utils.get_updated_package_requirements( - package_requirements, is_row_processor - ) - - # Compute a unique hash representing the user code. - function_hash = bff_utils.get_hash(func, package_requirements) - - return f"bigframes_{function_hash}" - - def _apply_ops_to_sql( obj: bpd.DataFrame, ops_list: Sequence[ex.Expression], diff --git a/bigframes/version.py b/bigframes/version.py index 4928dd5c209..8352be131df 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.38.0" +__version__ = "2.39.0" # {x-release-please-start-date} -__release_date__ = "2026-03-16" +__release_date__ = "2026-03-31" # {x-release-please-end} diff --git a/docs/index.rst b/docs/index.rst index 00c59a6745e..19b05bc1b68 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,46 +1,52 @@ .. BigQuery DataFrames documentation main file -Welcome to BigQuery DataFrames -============================== +Scalable Python Data Analysis with BigQuery DataFrames (BigFrames) +================================================================== -**BigQuery DataFrames** (``bigframes``) provides a Pythonic interface for data analysis that scales to petabytes. It gives you the best of both worlds: the familiar API of **pandas** and **scikit-learn**, powered by the distributed computing engine of **BigQuery**. +.. meta:: + :description: BigQuery DataFrames (BigFrames) provides a scalable, pandas-compatible Python API for data analysis and machine learning on petabyte-scale datasets using the BigQuery engine. -BigQuery DataFrames consists of three main components: +**BigQuery DataFrames** (``bigframes``) is an open-source Python library that brings the power of **distributed computing** to your data science workflow. By providing a familiar **pandas** and **scikit-learn** compatible API, BigFrames allows you to analyze and model massive datasets where they live—directly in **BigQuery**. -* **bigframes.pandas**: A pandas-compatible API for data exploration and transformation. -* **bigframes.ml**: A scikit-learn-like interface for BigQuery ML, including integration with Gemini. -* **bigframes.bigquery**: Specialized functions for managing BigQuery resources and deploying custom logic. +Why Choose BigQuery DataFrames? +------------------------------- -Why BigQuery DataFrames? ------------------------- +BigFrames eliminates the "data movement bottleneck." 
Instead of downloading large datasets to a local environment, BigFrames translates your Python code into optimized SQL, executing complex transformations across the BigQuery fleet. -BigFrames allows you to process data where it lives. Instead of downloading massive datasets to your local machine, BigFrames translates your Python code into SQL and executes it across the BigQuery fleet. +* **Petabyte-Scale Scalability:** Effortlessly process datasets that far exceed local memory limits. +* **Familiar Python Ecosystem:** Use the same ``read_gbq``, ``groupby``, ``merge``, and ``pivot_table`` functions you already know from pandas. +* **Integrated Machine Learning:** Access BigQuery ML's powerful algorithms via a scikit-learn-like interface (``bigframes.ml``), including seamless **Gemini AI** integration. +* **Enterprise-Grade Security:** Maintain data governance and security by keeping your data within the BigQuery perimeter. +* **Hybrid Flexibility:** Easily move between distributed BigQuery processing and local pandas analysis with ``to_pandas()``. -* **Scalability:** Work with datasets that exceed local memory limits without complex refactoring. -* **Collaboration & Extensibility:** Bridge the gap between Python and SQL. Deploy custom Python functions to BigQuery, making your logic accessible to SQL-based teammates and data analysts. -* **Production-Ready Pipelines:** Move seamlessly from interactive notebooks to production. BigFrames simplifies data engineering by integrating with tools like **dbt** and **Airflow**, offering a simpler operational model than Spark. -* **Security & Governance:** Keep your data within the BigQuery perimeter. Benefit from enterprise-grade security, auditing, and data governance throughout your entire Python workflow. -* **Familiarity:** Use ``read_gbq``, ``merge``, ``groupby``, and ``pivot_table`` just like you do in pandas. 
+Core Components of BigFrames +---------------------------- -Quickstart ----------- +BigQuery DataFrames is organized into specialized modules designed for the modern data stack: -Install the library via pip: +1. :mod:`bigframes.pandas`: A high-performance, pandas-compatible API for scalable data exploration, cleaning, and transformation. +2. :mod:`bigframes.bigquery`: Specialized utilities for direct BigQuery resource management, including integrations with Gemini and other AI models in the :mod:`bigframes.bigquery.ai` submodule. + + +Quickstart: Scalable Data Analysis in Seconds +--------------------------------------------- + +Install BigQuery DataFrames via pip: .. code-block:: bash pip install --upgrade bigframes -Load and aggregate a public dataset in just a few lines: +The following example demonstrates how to perform a distributed aggregation on a public dataset with millions of rows using just a few lines of Python: .. code-block:: python import bigframes.pandas as bpd - # Load data from BigQuery + # Initialize BigFrames and load a public dataset df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") - # Perform familiar pandas operations at scale + # Perform familiar pandas operations that execute in the cloud top_names = ( df.groupby("name") .agg({"number": "sum"}) @@ -48,32 +54,28 @@ Load and aggregate a public dataset in just a few lines: .head(10) ) + # Bring the final, aggregated results back to local memory if needed print(top_names.to_pandas()) -User Guide ----------- +Explore the Documentation +------------------------- .. toctree:: :maxdepth: 2 + :caption: User Documentation user_guide/index -API reference -------------- - .. toctree:: - :maxdepth: 3 + :maxdepth: 2 + :caption: API Reference reference/index supported_pandas_apis -Changelog ---------- - -For a list of all BigQuery DataFrames releases: - .. 
toctree:: - :maxdepth: 2 + :maxdepth: 1 + :caption: Community & Updates changelog diff --git a/docs/reference/index.rst b/docs/reference/index.rst index cb295a43099..0ddfa5f0e3f 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -27,7 +27,8 @@ BigQuery DataFrames provides extensions to pandas DataFrame objects. .. autosummary:: :toctree: api - bigframes.extensions.pandas.dataframe_accessor.BigQueryDataFrameAccessor + bigframes.extensions.core.dataframe_accessor.BigQueryDataFrameAccessor + bigframes.extensions.core.dataframe_accessor.AIAccessor ML APIs ~~~~~~~ diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index af09616e055..a9695cf8c7a 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -44,6 +44,7 @@ User Guide :maxdepth: 1 AI Functions <../notebooks/generative_ai/ai_functions.ipynb> + AI Functions for Poster Analysis <../notebooks/generative_ai/ai_movie_poster.ipynb> AI Forecast <../notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb> LLM Code Generation <../notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb> LLM KMeans <../notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb> diff --git a/notebooks/generative_ai/ai_movie_poster.ipynb b/notebooks/generative_ai/ai_movie_poster.ipynb new file mode 100644 index 00000000000..b25e2b556e6 --- /dev/null +++ b/notebooks/generative_ai/ai_movie_poster.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "XZpKUoHjXw3_" + }, + "outputs": [], + "source": [ + "# Copyright 2026 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is 
distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEKzWP6jW9Oj" + }, + "source": [ + "# Analyzing movie posters with BigQuery Dataframe AI functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c9CCKXG5XTb-" + }, + "source": [ + "BigQuery Dataframe provides a Pythonic way to use AI functions directly with your dataframes. In this notebook, you will use these functions to analyze old\n", + "movie posters. These posters are images stored in a public Google Cloud Storage bucket: `gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CUJDa_7MPbL9" + }, + "source": [ + "## Set up" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D3iYtBSkYpCK" + }, + "source": [ + "Before you begin, you need to\n", + "\n", + "* Set up your permissions for generative AI functions with [these instructions](https://docs.cloud.google.com/bigquery/docs/permissions-for-ai-functions)\n", + "* Set up your Cloud Resource connection by following [these instructions](https://docs.cloud.google.com/bigquery/docs/create-cloud-resource-connection)\n", + "\n", + "Once you have the permissions set up, import the `bigframes.pandas` package, and\n", + "set your cloud project ID." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6nqoRHYbPAx3" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "MY_PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n", + "\n", + "bpd.options.bigquery.project = MY_PROJECT_ID" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2XHcNHtvPhNW" + }, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eS-9A7DijfoQ" + }, + "source": [ + "First, you load the data from the GCS bucket to a BigQuery DataFrame with the `from_glob_path` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "ZNPzFjCyPap0", + "outputId": "346d20b2-d615-4094-d24e-2d40e5c90ee2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in a moment of slot time. 
[Job bigframes-dev:US.48a27954-7a4a-4b9e-8176-ea227fd188ad details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in a minute of slot time. [Job bigframes-dev:US.09c48ecb-e041-4c18-a390-ca5a36fd07c3 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
poster
0
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]" + ], + "text/plain": [ + " poster\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0...\n", + "\n", + "[1 rows x 1 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Replace with your own connection name.\n", + "MY_CONNECTION = 'bigframes-default-connection' # @param {type:\"string\"}\n", + "\n", + "movies = bpd.from_glob_path(\n", + " \"gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/*\",\n", + " connection = MY_CONNECTION,\n", + " name='poster')\n", + "movies.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EfkdDH08QnYw" + }, + "source": [ + "## Extract titles from posters" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "6CoZZ5tSQm1r", + "outputId": "1b3915ce-eb83-4be9-b1c1-d9a326dc9408" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. 
Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in 2 minutes of slot time. [Job bigframes-dev:US.4a08a15f-5a2f-463b-bba8-734858ec992b details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitle
0Der Student von Prag
\n", + "

1 rows × 2 columns

\n", + "
[1 rows x 2 columns in total]" + ], + "text/plain": [ + " poster title\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Der Student von Prag\n", + "\n", + "[1 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import bigframes.bigquery as bbq,\n", + "\n", + "movies['title'] = bbq.ai.generate(\n", + " (\"What is the movie title for this poster? Name only\", movies['poster']),\n", + " endpoint='gemini-2.5-pro'\n", + ").struct.field(\"result\")\n", + "movies.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cFQHQ9S2lr6t" + }, + "source": [ + "Notice that `ai.generate()` has a `struct` return type, which holds not only the LLM response, but also the status. If you do not provide a field name for your answer, `\"result\"` will be the default name. You can access LLM response content with the struct accessor (e.g. `my_response.struct.field(\"result\")`)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R8kkUhgoS5Xz" + }, + "source": [ + "## Get movie release year\n", + "\n", + "In the example below, you will use `ai.generate_int()` to find the release year for each movie poster:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 976 + }, + "id": "cKZdHq0XS1iW", + "outputId": "72cbad57-4518-4e1e-97bb-333d424dba73" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: 
ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in 4 minutes of slot time. [Job bigframes-dev:US.b60a151a-6cbc-405e-9c40-8a7461981a00 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitleyear
0Der Student von Prag1913
\n", + "

1 rows × 3 columns

\n", + "
[1 rows x 3 columns in total]" + ], + "text/plain": [ + " poster title \\\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Der Student von Prag \n", + "\n", + " year \n", + "0 1913 \n", + "\n", + "[1 rows x 3 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "movies['year'] = bbq.ai.generate_int(\n", + " (\"What is the release year for this movie?\", movies['title']),\n", + " endpoint='gemini-2.5-pro'\n", + ").struct.field(\"result\")\n", + "\n", + "movies.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 250 + }, + "id": "yqRiNRY8_8fs", + "outputId": "efa60107-6883-4f5c-8e40-43c7287ea7fb" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
posterstruct<uri: string, version: string, authorize...
titlestring[pyarrow]
yearInt64
\n", + "

" + ], + "text/plain": [ + "poster structJob bigframes-dev:US.c9bb23f0-5ceb-4d6c-8241-960c496274ae details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitleyear
8Shoulder Arms1918
\n", + "

1 rows × 3 columns

\n", + "
[1 rows x 3 columns in total]" + ], + "text/plain": [ + " poster title year\n", + "8 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Shoulder Arms 1918\n", + "\n", + "[1 rows x 3 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "us_movies = movies[bbq.ai.if_(\n", + " (\"The movie \", movies['title'], \" was made in US\")\n", + ")]\n", + "us_movies.head(1)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/setup.py b/setup.py index 2179fe3e964..4ff5ec45872 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ # please keep these in sync with the minimum versions in testing/constraints-3.10.txt "cloudpickle >= 2.0.0", "fsspec >=2023.3.0", - "gcsfs >=2023.3.0, !=2025.5.0, !=2026.2.0", + "gcsfs >=2023.3.0, !=2025.5.0, !=2026.2.0, !=2026.3.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", "google-cloud-bigquery[bqstorage,pandas] >=3.36.0", @@ -46,6 +46,7 @@ "google-cloud-bigquery-connection >=1.12.0", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", + "google-crc32c >=1.0.0,<2.0.0", "grpc-google-iam-v1 >= 0.14.2", "numpy >=1.24.0", "pandas >=1.5.3", diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 114b600d9de..9848d360968 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -20,6 +20,7 @@ import shutil import tempfile import textwrap +import uuid import warnings import google.api_core.exceptions @@ -32,7 +33,6 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.exceptions -import bigframes.functions._utils as bff_utils import bigframes.pandas as bpd import bigframes.series from bigframes.testing.utils 
import ( @@ -526,24 +526,6 @@ def add_one(x): # Make a unique udf add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one) - # Expected cloud function name for the unique udf - package_requirements = bff_utils.get_updated_package_requirements() - add_one_uniq_hash = bff_utils.get_hash(add_one_uniq, package_requirements) - add_one_uniq_cf_name = bff_utils.get_cloud_function_name( - add_one_uniq_hash, session.session_id - ) - - # There should be no cloud function yet for the unique udf - cloud_functions = list( - get_cloud_functions( - session.cloudfunctionsclient, - session.bqclient.project, - session.bqclient.location, - name=add_one_uniq_cf_name, - ) - ) - assert len(cloud_functions) == 0 - # The first time both the cloud function and the bq remote function don't # exist and would be created remote_add_one = session.remote_function( @@ -555,6 +537,9 @@ def add_one(x): cloud_function_service_account="default", )(add_one_uniq) + assert remote_add_one.bigframes_cloud_function is not None + add_one_uniq_cf_name = remote_add_one.bigframes_cloud_function.split("/")[-1] + # There should have been excactly one cloud function created at this point cloud_functions = list( get_cloud_functions( @@ -1230,7 +1215,7 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_custom_sa(scalars_dfs): +def test_remote_function_via_session_custom_sa(scalars_pandas_df_index): # TODO(shobs): Automate the following set-up during testing in the test project. 
# # For upfront convenience, the following set up has been statically created @@ -1249,14 +1234,13 @@ def test_remote_function_via_session_custom_sa(scalars_dfs): rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) try: - # TODO(shobs): Figure out why the default ingress setting - # (internal-only) does not work here + @rf_session.remote_function( input_types=[int], output_type=int, reuse=False, cloud_function_service_account=gcf_service_account, - cloud_function_ingress_settings="all", + cloud_function_ingress_settings="internal-and-gclb", ) def double_num(x): if x is None: @@ -1270,13 +1254,12 @@ def double_num(x): assert gcf.service_config.service_account_email == gcf_service_account # assert that the function works as expected on data - scalars_df, scalars_pandas_df = scalars_dfs - bf_int64_col = scalars_df["int64_col"] + bf_int64_col = rf_session.read_pandas(scalars_pandas_df_index.int64_col) bf_result_col = bf_int64_col.apply(double_num) bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() - pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col = scalars_pandas_df_index.int64_col pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x + x) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) @@ -1303,7 +1286,7 @@ def double_num(x): ) @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_via_session_custom_build_sa( - scalars_dfs, set_build_service_account + set_build_service_account, scalars_pandas_df_index ): # TODO(shobs): Automate the following set-up during testing in the test project. 
# @@ -1321,15 +1304,14 @@ def test_remote_function_via_session_custom_build_sa( rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) try: - # TODO(shobs): Figure out why the default ingress setting - # (internal-only) does not work here + @rf_session.remote_function( input_types=[int], output_type=int, reuse=False, cloud_function_service_account="default", cloud_build_service_account=set_build_service_account, - cloud_function_ingress_settings="all", + cloud_function_ingress_settings="internal-and-gclb", ) def double_num(x): if x is None: @@ -1342,14 +1324,11 @@ def double_num(x): ) assert gcf.build_config.service_account == expected_build_service_account - # assert that the function works as expected on data - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] + bf_int64_col = rf_session.read_pandas(scalars_pandas_df_index.int64_col) bf_result_col = bf_int64_col.apply(double_num) bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() - pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col = scalars_pandas_df_index.int64_col pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x + x) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) @@ -1436,7 +1415,7 @@ def square_num(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_vpc(scalars_dfs): +def test_remote_function_via_session_vpc(scalars_pandas_df_index): # TODO(shobs): Automate the following set-up during testing in the test project. 
# # For upfront convenience, the following set up has been statically created @@ -1466,8 +1445,6 @@ def double_num(x): return x return x + x - # TODO(shobs): See if the test vpc can be configured to make this flow - # work with the default ingress setting (internal-only) double_num_remote = rf_session.remote_function( input_types=[int], output_type=int, @@ -1475,7 +1452,7 @@ def double_num(x): cloud_function_service_account="default", cloud_function_vpc_connector=gcf_vpc_connector, cloud_function_vpc_connector_egress_settings="all", - cloud_function_ingress_settings="all", + cloud_function_ingress_settings="internal-and-gclb", )(double_num) gcf = rf_session.cloudfunctionsclient.get_function( @@ -1489,15 +1466,12 @@ def double_num(x): # cloud_function_vpc_connector_egress_settings="all" earlier. assert gcf.service_config.vpc_connector_egress_settings == 2 - # assert that the function works as expected on data - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] + bf_int64_col = rf_session.read_pandas(scalars_pandas_df_index.int64_col) bf_result_col = bf_int64_col.apply(double_num_remote) bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() - pd_int64_col = scalars_pandas_df["int64_col"] - pd_result_col = pd_int64_col.apply(double_num).astype("Int64") + pd_int64_col = scalars_pandas_df_index.int64_col + pd_result_col = pd_int64_col.apply(double_num) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_frame_equal(bf_result, pd_result, check_dtype=False) @@ -1573,7 +1547,9 @@ def square(x): bq_routine = session.bqclient.get_routine( square_remote.bigframes_bigquery_function ) - assert bq_routine.remote_function_options.max_batching_rows == max_batching_rows + assert bq_routine.remote_function_options.max_batching_rows == ( + max_batching_rows or 1000 + ) scalars_df, scalars_pandas_df = scalars_dfs @@ -1693,6 +1669,51 @@ def square(x): ) +@pytest.mark.flaky(retries=2, delay=120) +def 
test_remote_function_reflects_config_change_with_reuse(session): + square_remote = None + square_remote_2 = None + try: + + def square(x): + return x * x + + # random alphanumeric name starting with a letter + deploy_name = "a" + str(uuid.uuid4().hex) + square_remote = session.remote_function( + input_types=[int], + name=deploy_name, + output_type=int, + reuse=True, + cloud_function_service_account="default", + cloud_function_cpus=1, + )(square) + square_remote_2 = session.remote_function( + input_types=[int], + name=deploy_name, + output_type=int, + reuse=True, + cloud_function_service_account="default", + cloud_function_cpus=2, + )(square) + + # Assert that the GCF is created with the intended max instance count + gcf = session.cloudfunctionsclient.get_function( + name=square_remote_2.bigframes_cloud_function + ) + assert float(gcf.service_config.available_cpu) == 2.0 + finally: + # clean up the gcp assets created for the remote function + if square_remote is not None: + cleanup_function_assets( + square_remote, session.bqclient, session.cloudfunctionsclient + ) + if square_remote_2 is not None: + cleanup_function_assets( + square_remote_2, session.bqclient, session.cloudfunctionsclient + ) + + @pytest.mark.flaky(retries=2, delay=120) def test_df_apply_axis_1(session, scalars_dfs): columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] @@ -2693,25 +2714,6 @@ def square(x: int) -> int: ) -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_ingress_settings_w_all(session): - ingress_settings_args = {"cloud_function_ingress_settings": "all"} - - with pytest.raises( - google.api_core.exceptions.FailedPrecondition, - match="400.*allowedIngress violated", - ): - - def square(x: int) -> int: - return x * x - - session.remote_function( - reuse=False, - cloud_function_service_account="default", - **ingress_settings_args, - )(square) - - @pytest.mark.flaky(retries=2, delay=120) def 
test_remote_function_ingress_settings_unsupported(session): with pytest.raises( diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index c2e9036eed7..eabd36ab387 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -155,7 +155,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): ) -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_xgbclassifier_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index 72a0ee469b5..8500ad9d5f1 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -88,6 +88,7 @@ def test_arima_plus_model_fit_score( result, columns=expected_columns, index=2 if id_col_name else 1, + col_exact=False, ) # save, load to ensure configuration was kept diff --git a/tests/system/large/ml/test_llm.py b/tests/system/large/ml/test_llm.py index 6e2695b1b53..dba5dc8e4d3 100644 --- a/tests/system/large/ml/test_llm.py +++ b/tests/system/large/ml/test_llm.py @@ -63,7 +63,7 @@ def test_create_load_gemini_text_generator_model( "gemini-2.5-flash-lite", ), ) -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_default_params_success( llm_text_df, model_name, session, bq_connection ): @@ -198,6 +198,7 @@ def test_llm_gemini_score(llm_fine_tune_df_default_index, model_name): "evaluation_status", ], index=1, + col_exact=False, ) @@ -226,6 +227,7 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name) "label", "evaluation_status", ], + col_exact=False, ) diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 1ee60dafd66..0a9875a989f 100644 --- a/tests/system/small/functions/test_remote_function.py +++ 
b/tests/system/small/functions/test_remote_function.py @@ -34,11 +34,25 @@ from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff import bigframes.session._io.bigquery -from bigframes.testing.utils import assert_frame_equal, get_function_name +from bigframes.testing.utils import assert_frame_equal, assert_series_equal _prefixer = test_utils.prefixer.Prefixer("bigframes", "") +def get_function_name(func, package_requirements=None, is_row_processor=False): + """Get a bigframes function name for testing given a udf.""" + # Augment user package requirements with any internal package + # requirements. + package_requirements = bff_utils.get_updated_package_requirements( + package_requirements or [], is_row_processor + ) + + # Compute a unique hash representing the user code. + function_hash = bff_utils.get_hash(func, package_requirements) + + return f"bigframes_{function_hash}" + + @pytest.fixture(scope="module") def bq_cf_connection() -> str: """Pre-created BQ connection in the test project in US location, used to @@ -102,7 +116,7 @@ def get_bq_connection_id_path_format(connection_id_dot_format): return f"projects/{fields[0]}/locations/{fields[1]}/connections/{fields[2]}" -# @pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, bigqueryconnection_client, @@ -617,7 +631,7 @@ def bytes_to_hex(mybytes: bytes) -> bytes: )(bytes_to_hex) bf_result = scalars_df.bytes_col.map(remote_bytes_to_hex).to_pandas() - pd.testing.assert_series_equal( + assert_series_equal( bf_result, pd_result, ) @@ -785,7 +799,7 @@ def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_un pd_result = pd_s.apply(func) bf_result = bf_s.apply(func) assert bigframes.dtypes.is_array_string_like(bf_result.dtype) - pd.testing.assert_series_equal( + assert_series_equal( pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False ) @@ 
-826,7 +840,7 @@ def test_read_gbq_function_runs_existing_udf_2_params_array_output( pd_result = pd_df["col0"].combine(pd_df["col1"], func) bf_result = bf_df["col0"].combine(bf_df["col1"], func) assert bigframes.dtypes.is_array_string_like(bf_result.dtype) - pd.testing.assert_series_equal( + assert_series_equal( pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False ) @@ -881,7 +895,7 @@ def test_read_gbq_function_runs_existing_udf_4_params_array_output( ) bf_result = bf_df.apply(func, axis=1) assert bigframes.dtypes.is_array_string_like(bf_result.dtype) - pd.testing.assert_series_equal( + assert_series_equal( pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False ) @@ -1060,9 +1074,7 @@ def test_read_gbq_function_respects_python_output_type( actual = s.apply(func).to_pandas() # ignore type disparities, e.g. "int64" in pandas v/s "Int64" in bigframes - pd.testing.assert_series_equal( - expected, actual, check_dtype=False, check_index_type=False - ) + assert_series_equal(expected, actual, check_dtype=False, check_index_type=False) @pytest.mark.parametrize( @@ -1200,9 +1212,7 @@ def add_ints(row: pandas.Series) -> int: # bf_result.to_numpy() produces an array of numpy.float64's # (in system_prerelease tests), while pd_result.to_numpy() produces an # array of ints, ignore this mismatch by using check_exact=False. - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_exact=False - ) + assert_series_equal(pd_result, bf_result, check_dtype=False, check_exact=False) # Read back the deployed BQ remote function using read_gbq_function. 
func_ref = session.read_gbq_function( @@ -1215,9 +1225,7 @@ def add_ints(row: pandas.Series) -> int: assert func_ref.bigframes_remote_function == func_ref.bigframes_bigquery_function # type: ignore bf_result_gbq = scalars_df[columns].apply(func_ref, axis=1).to_pandas() - pd.testing.assert_series_equal( - pd_result, bf_result_gbq, check_dtype=False, check_exact=False - ) + assert_series_equal(pd_result, bf_result_gbq, check_dtype=False, check_exact=False) @pytest.mark.flaky(retries=2, delay=120) @@ -1253,9 +1261,7 @@ def add_ints(row: pandas.Series) -> int: # bf_result.to_numpy() produces an array of numpy.float64's # (in system_prerelease tests), while pd_result.to_numpy() produces an # array of ints, ignore this mismatch by using check_exact=False. - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_exact=False - ) + assert_series_equal(pd_result, bf_result, check_dtype=False, check_exact=False) @pytest.mark.flaky(retries=2, delay=120) @@ -1286,9 +1292,7 @@ def add_numbers(row): # bf_result.index[0].dtype is 'string[pyarrow]' while # pd_result.index[0].dtype is 'object', ignore this mismatch by using # check_index_type=False. 
- pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_apply_axis_1_unsupported_callable(scalars_dfs): @@ -1452,7 +1456,7 @@ def is_odd(x: int) -> bool: bf_result = bf_method(is_odd_remote).to_pandas() # ignore any dtype difference - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.flaky(retries=2, delay=120) @@ -1501,7 +1505,7 @@ def add(x: int, y: int) -> int: ) # ignore any dtype difference - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.flaky(retries=2, delay=120) @@ -1563,7 +1567,7 @@ def add_pandas(s: pd.Series) -> float: bf_result = bf_df[bf_filter].apply(add_remote, axis=1).to_pandas() # ignore any dtype difference - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index 134f82e96e9..23487983ee3 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -493,7 +493,7 @@ def test_arima_plus_score( dtype="Float64", ) pd.testing.assert_frame_equal( - result, + result[expected.columns], expected, rtol=0.1, check_index_type=False, @@ -594,7 +594,7 @@ def test_arima_plus_score_series( dtype="Float64", ) pd.testing.assert_frame_equal( - result, + result[expected.columns], expected, rtol=0.1, check_index_type=False, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9683a8bc52d..db8842bd323 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5902,6 +5902,19 @@ def 
test_to_gbq_table_labels(scalars_df_index): assert table.labels["test"] == "labels" +def test_to_gbq_obj_ref_persists(session): + # Test that saving and loading an Object Reference retains its dtype + bdf = session.from_glob_path( + "gs://cloud-samples-data/vision/ocr/*.jpg", name="uris" + ).head(1) + + destination_table = "bigframes-dev.bigframes_tests_sys.test_obj_ref_persistence" + bdf.to_gbq(destination_table, if_exists="replace") + + loaded_df = session.read_gbq(destination_table) + assert loaded_df["uris"].dtype == dtypes.OBJ_REF_DTYPE + + @pytest.mark.parametrize( ("col_names", "ignore_index"), [ @@ -6283,3 +6296,20 @@ def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): with pytest.raises(KeyError): bf_df.agg(agg_funcs) + + +def test_empty_agg_projection_succeeds(): + # Tests that the compiler generates a SELECT 1 fallback for empty aggregations, + # protecting against BigQuery syntax errors when both groups and metrics are empty. + import importlib + + bq = importlib.import_module( + "bigframes_vendored.ibis.backends.sql.compilers.bigquery" + ) + sg = importlib.import_module("bigframes_vendored.sqlglot") + + compiler = bq.BigQueryCompiler() + res = compiler.visit_Aggregate( + "op", parent=sg.table("parent_table"), groups=[], metrics=[] + ) + assert "SELECT 1" in res.sql() diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index fece679d061..b40dcca7d76 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -631,6 +631,13 @@ def test_to_gbq_if_exists_is_replace(scalars_dfs, dataset_id): assert len(gcs_df) == len(scalars_pandas_df) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) + # When replacing a table with same schema but different column order + reordered_df = scalars_df[scalars_df.columns[::-1]] + reordered_df.to_gbq(destination_table, if_exists="replace") + gcs_df = pandas_gbq.read_gbq(destination_table, 
index_col="rowindex") + assert len(gcs_df) == len(scalars_pandas_df) + pd.testing.assert_index_equal(gcs_df.columns, reordered_df.columns) + # When replacing a table with different schema partitial_scalars_df = scalars_df.drop(columns=["string_col"]) partitial_scalars_df.to_gbq(destination_table, if_exists="replace") @@ -1002,6 +1009,28 @@ def test_to_gbq_timedelta_tag_ignored_when_appending(bigquery_client, dataset_id assert table.schema[0].description is None +def test_to_gbq_obj_ref(session, dataset_id: str, bigquery_client): + destination_table = f"{dataset_id}.test_to_gbq_obj_ref" + sql = """ + SELECT + 'gs://cloud-samples-data/vision/ocr/sign.jpg' AS uri_col + """ + df = session.read_gbq(sql) + df["obj_ref_col"] = df["uri_col"].str.to_blob() + df = df.drop(columns=["uri_col"]) + + df.to_gbq(destination_table) + + table = bigquery_client.get_table(destination_table) + obj_ref_field = next(f for f in table.schema if f.name == "obj_ref_col") + assert obj_ref_field.field_type == "RECORD" + assert obj_ref_field.description == "bigframes_dtype: OBJ_REF_DTYPE" + + reloaded_df = session.read_gbq(destination_table) + assert reloaded_df["obj_ref_col"].dtype == dtypes.OBJ_REF_DTYPE + assert len(reloaded_df) == 1 + + @pytest.mark.parametrize( ("index"), [True, False], diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 522e8db9e45..18368fc5126 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1490,3 +1490,34 @@ def test_multiindex_eq_const(scalars_df_index, scalars_pandas_df_index): bigframes.testing.utils.assert_index_equal( pandas.Index(pd_result, dtype="boolean"), bf_result.to_pandas() ) + + +def test_count_empty_multiindex_columns(session): + df = pandas.DataFrame( + [], index=[1, 2], columns=pandas.MultiIndex.from_tuples([], names=["a", "b"]) + ) + bdf = session.read_pandas(df) + + # count() operation unpivots columns, triggering the empty MultiIndex bug internally + 
count_df = bdf.count() + + # The local fix ensures that empty unpivoted columns generate properly typed NULLs + # rather than failing syntax validation downstream in BigQuery. + # We compile to `.sql` to verify it succeeds locally without evaluating on BigQuery natively. + _ = count_df.to_frame().sql + + # Assert structural layout is correct + assert count_df.index.nlevels == 2 + assert list(count_df.index.names) == ["a", "b"] + + +def test_dataframe_melt_multiindex(session): + # Tests that `melt` operations via count do not cause MultiIndex drops in Arrow + df = pandas.DataFrame({"A": [1], "B": ["string"], "C": [3]}) + df.columns = pandas.MultiIndex.from_tuples( + [("Group1", "A"), ("Group2", "B"), ("Group1", "C")] + ) + bdf = session.read_pandas(df) + + count_df = bdf.count().to_pandas() + assert count_df.shape[0] == 3 diff --git a/tests/unit/bigquery/test_table.py b/tests/unit/bigquery/test_table.py deleted file mode 100644 index badce5e5e23..00000000000 --- a/tests/unit/bigquery/test_table.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License""); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from unittest import mock - -import pytest - -import bigframes.bigquery -import bigframes.core.sql.table -import bigframes.session - - -@pytest.fixture -def mock_session(): - return mock.create_autospec(spec=bigframes.session.Session) - - -def test_create_external_table_ddl(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -def test_create_external_table_ddl_replace(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - replace=True, - columns={"col1": "INT64", "col2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE OR REPLACE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -def test_create_external_table_ddl_if_not_exists(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - if_not_exists=True, - columns={"col1": "INT64", "col2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE EXTERNAL TABLE IF NOT EXISTS my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -def test_create_external_table_ddl_partition_columns(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - partition_columns={"part1": "DATE", "part2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 
INT64, col2 STRING) WITH PARTITION COLUMNS (part1 DATE, part2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -def test_create_external_table_ddl_connection(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - connection_name="my-connection", - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) WITH CONNECTION `my-connection` OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -@mock.patch("bigframes.bigquery._operations.table._get_table_metadata") -def test_create_external_table(get_table_metadata_mock, mock_session): - bigframes.bigquery.create_external_table( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - session=mock_session, - ) - mock_session.read_gbq_query.assert_called_once() - generated_sql = mock_session.read_gbq_query.call_args[0][0] - expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert generated_sql == expected - get_table_metadata_mock.assert_called_once() diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py b/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py index c6c1c211510..7d4f53254db 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py @@ -42,3 +42,23 @@ def test_func(input: sge.Expression) -> sge.Expression: ValueError, match=r".*first parameter must be a window operator.*" ): test_func(sge.to_identifier("A")) + + +def test_register_already_registered_raise_error(): + reg = op_registration.OpRegistration() + + 
@reg.register(agg_ops.SizeOp) + def test_func1(op, input): + return input + + with pytest.raises(ValueError, match=r".*is already registered.*"): + + @reg.register(agg_ops.SizeOp) + def test_func2(op, input): + return input + + +def test_getitem_not_registered_raise_error(): + reg = op_registration.OpRegistration() + with pytest.raises(ValueError, match=r".*is not registered.*"): + _ = reg[agg_ops.SizeOp()] diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_index/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_index/out.sql index 4200470b655..a1f089424a1 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_index/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_index/out.sql @@ -1,3 +1,4 @@ SELECT - `string_list_col`[SAFE_OFFSET(1)] AS `string_list_col` -FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` AS `bft_0` \ No newline at end of file + IF(SUBSTRING(`string_col`, 2, 1) <> '', SUBSTRING(`string_col`, 2, 1), NULL) AS `string_index`, + [`int64_col`, `int64_too`][SAFE_OFFSET(1)] AS `array_index` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql index 26fc32f68dc..1053ec1c2c6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql @@ -18,5 +18,10 @@ SELECT SELECT COALESCE(LOGICAL_OR(bf_arr_reduce_uid), FALSE) FROM UNNEST(`bool_list_col`) AS bf_arr_reduce_uid - ) AS `any_bool` + ) AS `any_bool`, + ( + SELECT + ARRAY_AGG(bf_arr_reduce_uid IGNORE NULLS) + FROM UNNEST(`string_list_col`) AS 
bf_arr_reduce_uid + ) AS `array_agg_str` FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice/out.sql new file mode 100644 index 00000000000..ffec3b8e934 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice/out.sql @@ -0,0 +1,17 @@ +SELECT + SUBSTRING(`string_col`, 2, 4) AS `string_slice`, + ARRAY( + SELECT + el + FROM UNNEST([`int64_col`, `int64_too`]) AS el WITH OFFSET AS slice_idx + WHERE + slice_idx >= 1 + ) AS `slice_only_start`, + ARRAY( + SELECT + el + FROM UNNEST([`int64_col`, `int64_too`]) AS el WITH OFFSET AS slice_idx + WHERE + slice_idx >= 1 AND slice_idx < 5 + ) AS `slice_start_stop` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_only_start/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_only_start/out.sql deleted file mode 100644 index c37e27b2cf4..00000000000 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_only_start/out.sql +++ /dev/null @@ -1,9 +0,0 @@ -SELECT - ARRAY( - SELECT - el - FROM UNNEST(`string_list_col`) AS el WITH OFFSET AS slice_idx - WHERE - slice_idx >= 1 - ) AS `string_list_col` -FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_start_and_stop/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_start_and_stop/out.sql deleted file mode 100644 index 70417daf5c8..00000000000 --- 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_start_and_stop/out.sql +++ /dev/null @@ -1,9 +0,0 @@ -SELECT - ARRAY( - SELECT - el - FROM UNNEST(`string_list_col`) AS el WITH OFFSET AS slice_idx - WHERE - slice_idx >= 1 AND slice_idx < 5 - ) AS `string_list_col` -FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url_with_duration/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url_with_duration/out.sql new file mode 100644 index 00000000000..2e8b60230fa --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url_with_duration/out.sql @@ -0,0 +1,3 @@ +SELECT + OBJ.GET_ACCESS_URL(`string_col`, 'READ', INTERVAL 3600 MICROSECOND) AS `string_col` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref_json/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref_json/out.sql new file mode 100644 index 00000000000..dc84b3bec12 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref_json/out.sql @@ -0,0 +1,3 @@ +SELECT + OBJ.MAKE_REF(`string_col`) AS `string_col` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_and_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_and_op/out.sql index 7afe926ab41..d6f6587ead9 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_and_op/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_and_op/out.sql @@ 
-4,5 +4,6 @@ SELECT `int64_col`, `int64_col` & `int64_col` AS `int_and_int`, `bool_col` AND `bool_col` AS `bool_and_bool`, - IF(`bool_col` = FALSE, `bool_col`, NULL) AS `bool_and_null` + IF(`bool_col` = FALSE, `bool_col`, NULL) AS `bool_and_null`, + IF(`bool_col` = FALSE, `bool_col`, NULL) AS `null_and_bool` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_or_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_or_op/out.sql index 89a80b05a8a..dad4cee9d0b 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_or_op/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_or_op/out.sql @@ -4,5 +4,6 @@ SELECT `int64_col`, `int64_col` | `int64_col` AS `int_and_int`, `bool_col` OR `bool_col` AS `bool_and_bool`, - IF(`bool_col` = TRUE, `bool_col`, NULL) AS `bool_and_null` + IF(`bool_col` = TRUE, `bool_col`, NULL) AS `bool_and_null`, + IF(`bool_col` = TRUE, `bool_col`, NULL) AS `null_and_bool` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_xor_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_xor_op/out.sql index 74a8e810817..4be3b9f94ad 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_xor_op/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_xor_op/out.sql @@ -13,5 +13,11 @@ SELECT ) OR ( NOT `bool_col` AND CAST(NULL AS BOOLEAN) - ) AS `bool_and_null` + ) AS `bool_and_null`, + ( + `bool_col` AND NOT CAST(NULL AS BOOLEAN) + ) + OR ( + NOT `bool_col` AND CAST(NULL AS BOOLEAN) + ) AS `null_and_bool` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql index 37554c77e06..7827731881e 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql @@ -5,6 +5,7 @@ SELECT `int64_col` = `int64_col` AS `int_eq_int`, `int64_col` = 1 AS `int_eq_1`, `int64_col` IS NULL AS `int_eq_null`, + `int64_col` IS NULL AS `null_eq_int`, `int64_col` = CAST(`bool_col` AS INT64) AS `int_eq_bool`, CAST(`bool_col` AS INT64) = `int64_col` AS `bool_eq_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql index f66e8435ebf..5903cf03699 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` >= `int64_col` AS `int_ge_int`, `int64_col` >= 1 AS `int_ge_1`, + NULL AS `null_ge_int`, `int64_col` >= CAST(`bool_col` AS INT64) AS `int_ge_bool`, CAST(`bool_col` AS INT64) >= `int64_col` AS `bool_ge_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql index d97f9d1d423..42bf029240f 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql +++ 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` > `int64_col` AS `int_gt_int`, `int64_col` > 1 AS `int_gt_1`, + NULL AS `null_gt_int`, `int64_col` > CAST(`bool_col` AS INT64) AS `int_gt_bool`, CAST(`bool_col` AS INT64) > `int64_col` AS `bool_gt_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql index d1af7c57ae9..b6d860d4723 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql @@ -10,5 +10,10 @@ SELECT COALESCE(`int64_col` IN (123456), FALSE) AS `ints_wo_match_nulls`, ( `float64_col` IS NULL - ) OR `float64_col` IN (1, 2, 3) AS `float_in_ints` + ) OR `float64_col` IN (1, 2, 3) AS `float_in_ints`, + ( + `int64_col` IS NULL + ) OR `int64_col` IN (2) AS `mixed_with_null`, + COALESCE(CAST(`bool_col` AS INT64) IN (1, 2.5), FALSE) AS `bool_in_mixed`, + `int64_col` IS NULL AS `only_null_match` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql index e4e542d1c58..c6c86510102 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` <= `int64_col` AS `int_le_int`, `int64_col` <= 1 AS `int_le_1`, + NULL AS `null_le_int`, `int64_col` <= 
CAST(`bool_col` AS INT64) AS `int_le_bool`, CAST(`bool_col` AS INT64) <= `int64_col` AS `bool_le_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql index d616aecc8c2..ec5c317a8e5 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` < `int64_col` AS `int_lt_int`, `int64_col` < 1 AS `int_lt_1`, + NULL AS `null_lt_int`, `int64_col` < CAST(`bool_col` AS INT64) AS `int_lt_bool`, CAST(`bool_col` AS INT64) < `int64_col` AS `bool_lt_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql index abef6f93d62..448a6146294 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql @@ -7,6 +7,9 @@ SELECT ( `int64_col` ) IS NOT NULL AS `int_ne_null`, + ( + `int64_col` + ) IS NOT NULL AS `null_ne_int`, `int64_col` <> CAST(`bool_col` AS INT64) AS `int_ne_bool`, CAST(`bool_col` AS INT64) <> `int64_col` AS `bool_ne_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_datetime_to_integer_label/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_datetime_to_integer_label/out.sql index 8654f942707..4b0696386c1 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_datetime_to_integer_label/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_datetime_to_integer_label/out.sql @@ -5,6 +5,13 @@ SELECT 86400000000 ) ) AS INT64) AS `fixed_freq`, + CAST(FLOOR(IEEE_DIVIDE(UNIX_MICROS(CAST(`datetime_col` AS TIMESTAMP)) - 0, 86400000000)) AS INT64) AS `origin_epoch`, + CAST(FLOOR( + IEEE_DIVIDE( + UNIX_MICROS(CAST(`datetime_col` AS TIMESTAMP)) - UNIX_MICROS(CAST(CAST(`timestamp_col` AS DATE) AS TIMESTAMP)), + 86400000000 + ) + ) AS INT64) AS `origin_start_day`, CASE WHEN UNIX_MICROS( CAST(TIMESTAMP_TRUNC(`datetime_col`, WEEK(MONDAY)) + INTERVAL 6 DAY AS TIMESTAMP) @@ -22,5 +29,48 @@ SELECT 604800000000 ) ) AS INT64) + 1 - END AS `non_fixed_freq_weekly` + END AS `non_fixed_freq_weekly`, + CASE + WHEN ( + EXTRACT(YEAR FROM `datetime_col`) * 12 + EXTRACT(MONTH FROM `datetime_col`) - 1 + ) = ( + EXTRACT(YEAR FROM `timestamp_col`) * 12 + EXTRACT(MONTH FROM `timestamp_col`) - 1 + ) + THEN 0 + ELSE CAST(FLOOR( + IEEE_DIVIDE( + ( + EXTRACT(YEAR FROM `datetime_col`) * 12 + EXTRACT(MONTH FROM `datetime_col`) - 1 + ) - ( + EXTRACT(YEAR FROM `timestamp_col`) * 12 + EXTRACT(MONTH FROM `timestamp_col`) - 1 + ) - 1, + 1 + ) + ) AS INT64) + 1 + END AS `non_fixed_freq_monthly`, + CASE + WHEN ( + EXTRACT(YEAR FROM `datetime_col`) * 4 + EXTRACT(QUARTER FROM `datetime_col`) - 1 + ) = ( + EXTRACT(YEAR FROM `timestamp_col`) * 4 + EXTRACT(QUARTER FROM `timestamp_col`) - 1 + ) + THEN 0 + ELSE CAST(FLOOR( + IEEE_DIVIDE( + ( + EXTRACT(YEAR FROM `datetime_col`) * 4 + EXTRACT(QUARTER FROM `datetime_col`) - 1 + ) - ( + EXTRACT(YEAR FROM `timestamp_col`) * 4 + EXTRACT(QUARTER FROM `timestamp_col`) - 1 + ) - 1, + 1 + ) + ) AS INT64) + 1 + END AS `non_fixed_freq_quarterly`, + CASE + WHEN 
EXTRACT(YEAR FROM `datetime_col`) = EXTRACT(YEAR FROM `timestamp_col`) + THEN 0 + ELSE CAST(FLOOR( + IEEE_DIVIDE(EXTRACT(YEAR FROM `datetime_col`) - EXTRACT(YEAR FROM `timestamp_col`) - 1, 1) + ) AS INT64) + 1 + END AS `non_fixed_freq_yearly` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql index 3d0b8213b6e..5d98e445cc1 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql @@ -2,5 +2,6 @@ SELECT CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS DATETIME) AS `int64_col`, SAFE_CAST(`string_col` AS DATETIME), CAST(TIMESTAMP_MICROS(CAST(TRUNC(`float64_col` * 0.001) AS INT64)) AS DATETIME) AS `float64_col`, - SAFE_CAST(`timestamp_col` AS DATETIME) + SAFE_CAST(`timestamp_col` AS DATETIME), + CAST(PARSE_TIMESTAMP('%Y-%m-%d', `string_col`, 'UTC') AS DATETIME) AS `string_col_fmt` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql index 1e8910fad7c..e0fb530cc6d 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql @@ -5,5 +5,6 @@ SELECT CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 1000) AS INT64)) AS TIMESTAMP) AS `int64_col_ms`, CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col`) AS INT64)) AS TIMESTAMP) AS `int64_col_us`, 
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS TIMESTAMP) AS `int64_col_ns`, - TIMESTAMP(`datetime_col`) AS `datetime_col` + TIMESTAMP(`datetime_col`) AS `datetime_col`, + PARSE_TIMESTAMP('%Y-%m-%d', `string_col`, 'UTC') AS `string_col_fmt` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_value_array/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_value_array/out.sql new file mode 100644 index 00000000000..8250c02934e --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_value_array/out.sql @@ -0,0 +1,3 @@ +SELECT + JSON_VALUE_ARRAY(`json_col`, '$') AS `json_col` +FROM `bigframes-dev`.`sqlglot_test`.`json_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql index 111684acd0c..3aa06fe16e3 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` + `int64_col` AS `int_add_int`, `int64_col` + 1 AS `int_add_1`, + NULL AS `int_add_null`, `int64_col` + CAST(`bool_col` AS INT64) AS `int_add_bool`, CAST(`bool_col` AS INT64) + `int64_col` AS `bool_add_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql index 1b8166684cf..e2ccf96410a 100644 --- 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql @@ -6,6 +6,7 @@ SELECT IEEE_DIVIDE(`int64_col`, `int64_col`) AS `int_div_int`, IEEE_DIVIDE(`int64_col`, 1) AS `int_div_1`, IEEE_DIVIDE(`int64_col`, 0.0) AS `int_div_0`, + NULL AS `int_div_null`, IEEE_DIVIDE(`int64_col`, `float64_col`) AS `int_div_float`, IEEE_DIVIDE(`float64_col`, `int64_col`) AS `float_div_int`, IEEE_DIVIDE(`float64_col`, 0.0) AS `float_div_0`, diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql new file mode 100644 index 00000000000..8307b1b8ada --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql @@ -0,0 +1,48 @@ +SELECT + `rowindex`, + `int64_col`, + `bool_col`, + `float64_col`, + CASE + WHEN `int64_col` = CAST(0 AS INT64) + THEN CAST(0 AS INT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, `int64_col`)) AS INT64) + END AS `int_div_int`, + CASE + WHEN 1 = CAST(0 AS INT64) + THEN CAST(0 AS INT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, 1)) AS INT64) + END AS `int_div_1`, + CASE + WHEN 0.0 = CAST(0 AS INT64) + THEN CAST('Infinity' AS FLOAT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, 0.0)) AS INT64) + END AS `int_div_0`, + NULL AS `int_div_null`, + CASE + WHEN `float64_col` = CAST(0 AS INT64) + THEN CAST('Infinity' AS FLOAT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, `float64_col`)) AS INT64) + END AS `int_div_float`, + CASE + WHEN `int64_col` = CAST(0 AS INT64) + THEN CAST('Infinity' AS FLOAT64) * `float64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`float64_col`, `int64_col`)) AS INT64) + END AS `float_div_int`, + CASE + WHEN 0.0 = CAST(0 AS INT64) + THEN 
CAST('Infinity' AS FLOAT64) * `float64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`float64_col`, 0.0)) AS INT64) + END AS `float_div_0`, + NULL AS `float_div_null`, + CASE + WHEN CAST(`bool_col` AS INT64) = CAST(0 AS INT64) + THEN CAST(0 AS INT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, CAST(`bool_col` AS INT64))) AS INT64) + END AS `int_div_bool`, + CASE + WHEN `int64_col` = CAST(0 AS INT64) + THEN CAST(0 AS INT64) * CAST(`bool_col` AS INT64) + ELSE CAST(FLOOR(IEEE_DIVIDE(CAST(`bool_col` AS INT64), `int64_col`)) AS INT64) + END AS `bool_div_int` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql index 2a79820635c..78107415b43 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql @@ -189,5 +189,6 @@ SELECT MOD(CAST(`float64_col` AS BIGNUMERIC), CAST(0 AS BIGNUMERIC)) ) ELSE MOD(CAST(`float64_col` AS BIGNUMERIC), CAST(0 AS BIGNUMERIC)) - END AS `float_mod_0` + END AS `float_mod_0`, + NULL AS `float_mod_null` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql index 57aff081589..ebe8d571d65 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` * `int64_col` AS `int_mul_int`, `int64_col` * 1 AS `int_mul_1`, + NULL AS 
`int_mul_null`, `int64_col` * CAST(`bool_col` AS INT64) AS `int_mul_bool`, CAST(`bool_col` AS INT64) * `int64_col` AS `bool_mul_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_pow/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_pow/out.sql index 8f72522262c..7202903ebe3 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_pow/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_pow/out.sql @@ -241,5 +241,7 @@ SELECT ELSE 1 END ) - END AS `float_pow_1` + END AS `float_pow_1`, + NULL AS `float_pow_null`, + NULL AS `null_pow_float` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql index e1ca93d1363..c1d0350a664 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql @@ -2,8 +2,9 @@ SELECT `rowindex`, `int64_col`, `bool_col`, - `int64_col` - `int64_col` AS `int_add_int`, - `int64_col` - 1 AS `int_add_1`, - `int64_col` - CAST(`bool_col` AS INT64) AS `int_add_bool`, - CAST(`bool_col` AS INT64) - `int64_col` AS `bool_add_int` + `int64_col` - `int64_col` AS `int_sub_int`, + `int64_col` - 1 AS `int_sub_1`, + NULL AS `int_sub_null`, + `int64_col` - CAST(`bool_col` AS INT64) AS `int_sub_bool`, + CAST(`bool_col` AS INT64) - `int64_col` AS `bool_sub_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql index df4dc689f70..f011480ad30 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql @@ -1,3 +1,18 @@ SELECT - SUBSTRING(`string_col`, 2, 2) AS `string_col` + SUBSTRING(`string_col`, 2, 2) AS `1_3`, + SUBSTRING(`string_col`, 1, 3) AS `none_3`, + SUBSTRING(`string_col`, 2) AS `1_none`, + SUBSTRING(`string_col`, -3) AS `m3_none`, + SUBSTRING(`string_col`, 1, GREATEST(0, LENGTH(`string_col`) + -3)) AS `none_m3`, + SUBSTRING( + `string_col`, + GREATEST(1, LENGTH(`string_col`) + -4), + GREATEST(0, LENGTH(`string_col`) + -3) - GREATEST(0, LENGTH(`string_col`) + -5) + ) AS `m5_m3`, + SUBSTRING(`string_col`, 2, GREATEST(0, LENGTH(`string_col`) + -4)) AS `1_m3`, + SUBSTRING( + `string_col`, + GREATEST(1, LENGTH(`string_col`) + -2), + GREATEST(0, 5 - GREATEST(0, LENGTH(`string_col`) + -3)) + ) AS `m3_5` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py index 67c8bb0e5ca..4075e1c2784 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py @@ -1,13 +1,13 @@ # Copyright 2025 Google LLC # -# Licensed under the Apache License, Version 2.0 (the "License"); +# Licensed under the Apache License, Version 2.0 (the \"License\"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, +# distributed under the License is distributed on an \"AS IS\" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. @@ -16,7 +16,6 @@ from bigframes import operations as ops from bigframes.core import expression -from bigframes.operations._op_converters import convert_index, convert_slice import bigframes.operations.aggregations as agg_ops import bigframes.pandas as bpd from bigframes.testing import utils @@ -34,13 +33,18 @@ def test_array_to_string(repeated_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") -def test_array_index(repeated_types_df: bpd.DataFrame, snapshot): - col_name = "string_list_col" - bf_df = repeated_types_df[[col_name]] +def test_array_index(scalar_types_df: bpd.DataFrame, snapshot): + ops_map = { + "string_index": ops.ArrayIndexOp(index=1).as_expr("string_col"), + "array_index": expression.OpExpression( + ops.ArrayIndexOp(index=1), + (ops.ToArrayOp().as_expr("int64_col", "int64_too"),), + ), + } + sql = utils._apply_ops_to_sql( - bf_df, [convert_index(1).as_expr(col_name)], [col_name] + scalar_types_df, list(ops_map.values()), list(ops_map.keys()) ) - snapshot.assert_match(sql, "out.sql") @@ -50,6 +54,9 @@ def test_array_reduce_op(repeated_types_df: bpd.DataFrame, snapshot): "std_float": ops.ArrayReduceOp(agg_ops.StdOp()).as_expr("float_list_col"), "count_str": ops.ArrayReduceOp(agg_ops.CountOp()).as_expr("string_list_col"), "any_bool": ops.ArrayReduceOp(agg_ops.AnyOp()).as_expr("bool_list_col"), + "array_agg_str": ops.ArrayReduceOp(agg_ops.ArrayAggOp()).as_expr( + "string_list_col" + ), } sql = utils._apply_ops_to_sql( @@ -58,23 +65,23 @@ def 
test_array_reduce_op(repeated_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") -def test_array_slice_with_only_start(repeated_types_df: bpd.DataFrame, snapshot): - col_name = "string_list_col" - bf_df = repeated_types_df[[col_name]] - sql = utils._apply_ops_to_sql( - bf_df, [convert_slice(slice(1, None)).as_expr(col_name)], [col_name] - ) - - snapshot.assert_match(sql, "out.sql") - +def test_array_slice(scalar_types_df: bpd.DataFrame, snapshot): + array_expr = ops.ToArrayOp().as_expr("int64_col", "int64_too") + ops_map = { + "string_slice": ops.ArraySliceOp(start=1, stop=5).as_expr("string_col"), + "slice_only_start": expression.OpExpression( + ops.ArraySliceOp(start=1, stop=None), + (array_expr,), + ), + "slice_start_stop": expression.OpExpression( + ops.ArraySliceOp(start=1, stop=5), + (array_expr,), + ), + } -def test_array_slice_with_start_and_stop(repeated_types_df: bpd.DataFrame, snapshot): - col_name = "string_list_col" - bf_df = repeated_types_df[[col_name]] sql = utils._apply_ops_to_sql( - bf_df, [convert_slice(slice(1, 5)).as_expr(col_name)], [col_name] + scalar_types_df, list(ops_map.values()), list(ops_map.keys()) ) - snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py index 80aa22aaac9..ac032f46e68 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py @@ -14,7 +14,9 @@ import pytest +from bigframes import operations as ops import bigframes.pandas as bpd +from bigframes.testing import utils pytest.importorskip("pytest_snapshot") @@ -31,6 +33,28 @@ def test_obj_get_access_url(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_obj_get_access_url_with_duration(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "string_col" + bf_df = scalar_types_df[[col_name]] + sql = 
utils._apply_ops_to_sql( + bf_df, + [ops.ObjGetAccessUrl(mode="READ", duration=3600).as_expr(col_name)], + [col_name], + ) + snapshot.assert_match(sql, "out.sql") + + def test_obj_make_ref(scalar_types_df: bpd.DataFrame, snapshot): blob_df = scalar_types_df["string_col"].str.to_blob() snapshot.assert_match(blob_df.to_frame().sql, "out.sql") + + +def test_obj_make_ref_json(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "string_col" + bf_df = scalar_types_df[[col_name]] + sql = utils._apply_ops_to_sql( + bf_df, + [ops.obj_make_ref_json_op.as_expr(col_name)], + [col_name], + ) + snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_bool_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_bool_ops.py index 601fd86e4e9..bd51ea905a2 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_bool_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_bool_ops.py @@ -26,6 +26,7 @@ def test_and_op(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_and_int"] = bf_df["int64_col"] & bf_df["int64_col"] bf_df["bool_and_bool"] = bf_df["bool_col"] & bf_df["bool_col"] bf_df["bool_and_null"] = bf_df["bool_col"] & pd.NA # type: ignore + bf_df["null_and_bool"] = pd.NA & bf_df["bool_col"] # type: ignore snapshot.assert_match(bf_df.sql, "out.sql") @@ -35,6 +36,7 @@ def test_or_op(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_and_int"] = bf_df["int64_col"] | bf_df["int64_col"] bf_df["bool_and_bool"] = bf_df["bool_col"] | bf_df["bool_col"] bf_df["bool_and_null"] = bf_df["bool_col"] | pd.NA # type: ignore + bf_df["null_and_bool"] = pd.NA | bf_df["bool_col"] # type: ignore snapshot.assert_match(bf_df.sql, "out.sql") @@ -44,4 +46,5 @@ def test_xor_op(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_and_int"] = bf_df["int64_col"] ^ bf_df["int64_col"] bf_df["bool_and_bool"] = bf_df["bool_col"] ^ bf_df["bool_col"] bf_df["bool_and_null"] = bf_df["bool_col"] ^ pd.NA # type: ignore + bf_df["null_and_bool"] = 
pd.NA ^ bf_df["bool_col"] # type: ignore snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py index 3c13bc798bc..05fa1b54345 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py @@ -44,6 +44,13 @@ def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): values=(None, 123456), match_nulls=False ).as_expr(int_col), "float_in_ints": ops.IsInOp(values=(1, 2, 3, None)).as_expr(float_col), + "mixed_with_null": ops.IsInOp( + values=("1.0", 2, None), match_nulls=True + ).as_expr(int_col), + "bool_in_mixed": ops.IsInOp(values=(1, 2.5)).as_expr(bool_col), + "only_null_match": ops.IsInOp(values=(None,), match_nulls=True).as_expr( + int_col + ), } sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) @@ -62,6 +69,7 @@ def test_eq_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_eq_int"] = bf_df["int64_col"] == bf_df["int64_col"] bf_df["int_eq_1"] = bf_df["int64_col"] == 1 bf_df["int_eq_null"] = bf_df["int64_col"] == pd.NA + bf_df["null_eq_int"] = pd.NA == bf_df["int64_col"] bf_df["int_eq_bool"] = bf_df["int64_col"] == bf_df["bool_col"] bf_df["bool_eq_int"] = bf_df["bool_col"] == bf_df["int64_col"] @@ -74,6 +82,7 @@ def test_gt_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_gt_int"] = bf_df["int64_col"] > bf_df["int64_col"] bf_df["int_gt_1"] = bf_df["int64_col"] > 1 + bf_df["null_gt_int"] = pd.NA > bf_df["int64_col"] bf_df["int_gt_bool"] = bf_df["int64_col"] > bf_df["bool_col"] bf_df["bool_gt_int"] = bf_df["bool_col"] > bf_df["int64_col"] @@ -86,6 +95,7 @@ def test_ge_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_ge_int"] = bf_df["int64_col"] >= bf_df["int64_col"] bf_df["int_ge_1"] = bf_df["int64_col"] >= 1 + bf_df["null_ge_int"] = pd.NA >= bf_df["int64_col"] 
bf_df["int_ge_bool"] = bf_df["int64_col"] >= bf_df["bool_col"] bf_df["bool_ge_int"] = bf_df["bool_col"] >= bf_df["int64_col"] @@ -98,6 +108,7 @@ def test_lt_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_lt_int"] = bf_df["int64_col"] < bf_df["int64_col"] bf_df["int_lt_1"] = bf_df["int64_col"] < 1 + bf_df["null_lt_int"] = pd.NA < bf_df["int64_col"] bf_df["int_lt_bool"] = bf_df["int64_col"] < bf_df["bool_col"] bf_df["bool_lt_int"] = bf_df["bool_col"] < bf_df["int64_col"] @@ -110,6 +121,7 @@ def test_le_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_le_int"] = bf_df["int64_col"] <= bf_df["int64_col"] bf_df["int_le_1"] = bf_df["int64_col"] <= 1 + bf_df["null_le_int"] = pd.NA <= bf_df["int64_col"] bf_df["int_le_bool"] = bf_df["int64_col"] <= bf_df["bool_col"] bf_df["bool_le_int"] = bf_df["bool_col"] <= bf_df["int64_col"] @@ -137,6 +149,7 @@ def test_ne_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_ne_int"] = bf_df["int64_col"] != bf_df["int64_col"] bf_df["int_ne_1"] = bf_df["int64_col"] != 1 bf_df["int_ne_null"] = bf_df["int64_col"] != pd.NA + bf_df["null_ne_int"] = pd.NA != bf_df["int64_col"] bf_df["int_ne_bool"] = bf_df["int64_col"] != bf_df["bool_col"] bf_df["bool_ne_int"] = bf_df["bool_col"] != bf_df["int64_col"] diff --git a/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py index 76966d3c9bb..1d6ea99d340 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py @@ -64,9 +64,24 @@ def test_datetime_to_integer_label(scalar_types_df: bpd.DataFrame, snapshot): "fixed_freq": ops.DatetimeToIntegerLabelOp( freq=pd.tseries.offsets.Day(), origin="start", closed="left" # type: ignore ).as_expr("datetime_col", "timestamp_col"), + "origin_epoch": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.Day(), origin="epoch", closed="left" # type: ignore + 
).as_expr("datetime_col", "timestamp_col"), + "origin_start_day": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.Day(), origin="start_day", closed="left" # type: ignore + ).as_expr("datetime_col", "timestamp_col"), "non_fixed_freq_weekly": ops.DatetimeToIntegerLabelOp( freq=pd.tseries.offsets.Week(weekday=6), origin="start", closed="left" # type: ignore ).as_expr("datetime_col", "timestamp_col"), + "non_fixed_freq_monthly": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.MonthEnd(), origin="start", closed="left" # type: ignore + ).as_expr("datetime_col", "timestamp_col"), + "non_fixed_freq_quarterly": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.QuarterEnd(startingMonth=12), origin="start", closed="left" # type: ignore + ).as_expr("datetime_col", "timestamp_col"), + "non_fixed_freq_yearly": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.YearEnd(), origin="start", closed="left" # type: ignore + ).as_expr("datetime_col", "timestamp_col"), } sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) @@ -183,6 +198,9 @@ def test_to_datetime(scalar_types_df: bpd.DataFrame, snapshot): col_names = ["int64_col", "string_col", "float64_col", "timestamp_col"] bf_df = scalar_types_df[col_names] ops_map = {col_name: ops.ToDatetimeOp().as_expr(col_name) for col_name in col_names} + ops_map["string_col_fmt"] = ops.ToDatetimeOp(format="%Y-%m-%d").as_expr( + "string_col" + ) sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) snapshot.assert_match(sql, "out.sql") @@ -198,6 +216,7 @@ def test_to_timestamp(scalar_types_df: bpd.DataFrame, snapshot): "int64_col_us": ops.ToTimestampOp(unit="us").as_expr("int64_col"), "int64_col_ns": ops.ToTimestampOp(unit="ns").as_expr("int64_col"), "datetime_col": ops.ToTimestampOp().as_expr("datetime_col"), + "string_col_fmt": ops.ToTimestampOp(format="%Y-%m-%d").as_expr("string_col"), } sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), 
list(ops_map.keys())) diff --git a/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py index 2667e482c88..da5baea5248 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py @@ -178,17 +178,13 @@ def test_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): "my_project.my_dataset.my_routine" ), signature=udf_def.UdfSignature( - input_types=( - udf_def.UdfField( + inputs=( + udf_def.UdfArg( "x", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.INT64 - ), + udf_def.DirectScalarType(int), ), ), - output_bq_type=bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + output=udf_def.DirectScalarType(float), ), ) ops_map = { @@ -211,23 +207,17 @@ def test_binary_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): "my_project.my_dataset.my_routine" ), signature=udf_def.UdfSignature( - input_types=( - udf_def.UdfField( + inputs=( + udf_def.UdfArg( "x", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.INT64 - ), + udf_def.DirectScalarType(int), ), - udf_def.UdfField( + udf_def.UdfArg( "y", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + udf_def.DirectScalarType(float), ), ), - output_bq_type=bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + output=udf_def.DirectScalarType(float), ), ) ) @@ -244,29 +234,21 @@ def test_nary_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): "my_project.my_dataset.my_routine" ), signature=udf_def.UdfSignature( - input_types=( - udf_def.UdfField( + inputs=( + udf_def.UdfArg( "x", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.INT64 - ), + udf_def.DirectScalarType(int), ), - udf_def.UdfField( + udf_def.UdfArg( "y", - bigquery.StandardSqlDataType( - 
type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + udf_def.DirectScalarType(float), ), - udf_def.UdfField( + udf_def.UdfArg( "z", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.STRING - ), + udf_def.DirectScalarType(str), ), ), - output_bq_type=bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + output=udf_def.DirectScalarType(float), ), ) ) diff --git a/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py index 1c5894fc966..fa6d6d546fd 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py @@ -95,6 +95,16 @@ def test_json_value(json_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_json_value_array(json_types_df: bpd.DataFrame, snapshot): + col_name = "json_col" + bf_df = json_types_df[[col_name]] + sql = utils._apply_ops_to_sql( + bf_df, [ops.JSONValueArray(json_path="$").as_expr(col_name)], [col_name] + ) + + snapshot.assert_match(sql, "out.sql") + + def test_parse_json(scalar_types_df: bpd.DataFrame, snapshot): col_name = "string_col" bf_df = scalar_types_df[[col_name]] diff --git a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py index f0237159bc7..1d2f0a5b44b 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py @@ -220,6 +220,9 @@ def test_pow(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_pow_1"] = bf_df["int64_col"] ** 1 bf_df["float_pow_1"] = bf_df["float64_col"] ** 1 + bf_df["float_pow_null"] = bf_df["float64_col"] ** pd.NA + bf_df["null_pow_float"] = pd.NA ** bf_df["float64_col"] + snapshot.assert_match(bf_df.sql, "out.sql") @@ -282,6 +285,7 @@ def test_add_numeric(scalar_types_df: bpd.DataFrame, snapshot): 
bf_df["int_add_int"] = bf_df["int64_col"] + bf_df["int64_col"] bf_df["int_add_1"] = bf_df["int64_col"] + 1 + bf_df["int_add_null"] = bf_df["int64_col"] + pd.NA bf_df["int_add_bool"] = bf_df["int64_col"] + bf_df["bool_col"] bf_df["bool_add_int"] = bf_df["bool_col"] + bf_df["int64_col"] @@ -323,6 +327,7 @@ def test_div_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_div_int"] = bf_df["int64_col"] / bf_df["int64_col"] bf_df["int_div_1"] = bf_df["int64_col"] / 1 bf_df["int_div_0"] = bf_df["int64_col"] / 0.0 + bf_df["int_div_null"] = bf_df["int64_col"] / pd.NA bf_df["int_div_float"] = bf_df["int64_col"] / bf_df["float64_col"] bf_df["float_div_int"] = bf_df["float64_col"] / bf_df["int64_col"] @@ -363,14 +368,18 @@ def test_floordiv_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_div_int"] = bf_df["int64_col"] // bf_df["int64_col"] bf_df["int_div_1"] = bf_df["int64_col"] // 1 bf_df["int_div_0"] = bf_df["int64_col"] // 0.0 + bf_df["int_div_null"] = bf_df["int64_col"] // pd.NA bf_df["int_div_float"] = bf_df["int64_col"] // bf_df["float64_col"] bf_df["float_div_int"] = bf_df["float64_col"] // bf_df["int64_col"] bf_df["float_div_0"] = bf_df["float64_col"] // 0.0 + bf_df["float_div_null"] = bf_df["float64_col"] // pd.NA bf_df["int_div_bool"] = bf_df["int64_col"] // bf_df["bool_col"] bf_df["bool_div_int"] = bf_df["bool_col"] // bf_df["int64_col"] + snapshot.assert_match(bf_df.sql, "out.sql") + def test_floordiv_timedelta(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["timestamp_col", "date_col"]] @@ -401,6 +410,7 @@ def test_mul_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_mul_int"] = bf_df["int64_col"] * bf_df["int64_col"] bf_df["int_mul_1"] = bf_df["int64_col"] * 1 + bf_df["int_mul_null"] = bf_df["int64_col"] * pd.NA bf_df["int_mul_bool"] = bf_df["int64_col"] * bf_df["bool_col"] bf_df["bool_mul_int"] = bf_df["bool_col"] * bf_df["int64_col"] @@ -431,17 +441,20 @@ def test_mod_numeric(scalar_types_df: 
bpd.DataFrame, snapshot): bf_df["float_mod_1"] = bf_df["float64_col"] % 1 bf_df["float_mod_0"] = bf_df["float64_col"] % 0 + bf_df["float_mod_null"] = bf_df["float64_col"] % pd.NA + snapshot.assert_match(bf_df.sql, "out.sql") def test_sub_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col"]] - bf_df["int_add_int"] = bf_df["int64_col"] - bf_df["int64_col"] - bf_df["int_add_1"] = bf_df["int64_col"] - 1 + bf_df["int_sub_int"] = bf_df["int64_col"] - bf_df["int64_col"] + bf_df["int_sub_1"] = bf_df["int64_col"] - 1 + bf_df["int_sub_null"] = bf_df["int64_col"] - pd.NA - bf_df["int_add_bool"] = bf_df["int64_col"] - bf_df["bool_col"] - bf_df["bool_add_int"] = bf_df["bool_col"] - bf_df["int64_col"] + bf_df["int_sub_bool"] = bf_df["int64_col"] - bf_df["bool_col"] + bf_df["bool_sub_int"] = bf_df["bool_col"] - bf_df["int64_col"] snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py index fff2cc06df4..bb0e413486a 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py @@ -219,9 +219,17 @@ def test_str_pad(scalar_types_df: bpd.DataFrame, snapshot): def test_str_slice(scalar_types_df: bpd.DataFrame, snapshot): col_name = "string_col" bf_df = scalar_types_df[[col_name]] - sql = utils._apply_ops_to_sql( - bf_df, [ops.StrSliceOp(1, 3).as_expr(col_name)], [col_name] - ) + ops_map = { + "1_3": ops.StrSliceOp(1, 3).as_expr(col_name), + "none_3": ops.StrSliceOp(None, 3).as_expr(col_name), + "1_none": ops.StrSliceOp(1, None).as_expr(col_name), + "m3_none": ops.StrSliceOp(-3, None).as_expr(col_name), + "none_m3": ops.StrSliceOp(None, -3).as_expr(col_name), + "m5_m3": ops.StrSliceOp(-5, -3).as_expr(col_name), + "1_m3": ops.StrSliceOp(1, -3).as_expr(col_name), + "m3_5": ops.StrSliceOp(-3, 5).as_expr(col_name), + } + sql = 
utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_bigframes_sql_scalar/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_bigframes_sql_scalar/out.sql new file mode 100644 index 00000000000..14853067c70 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_bigframes_sql_scalar/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + ROUND(`int64_col` + `int64_too`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table/out.sql b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table/out.sql new file mode 100644 index 00000000000..867282de0e7 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table/out.sql @@ -0,0 +1,7 @@ +CREATE EXTERNAL TABLE `my-project.my_dataset.my_table` ( + `col1` INT64, + `col2` STRING +) OPTIONS ( + format='CSV', + uris=['gs://bucket/path*'] +) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_all_options/out.sql b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_all_options/out.sql new file mode 100644 index 00000000000..a08ddf5ee5d --- /dev/null +++ b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_all_options/out.sql @@ -0,0 +1,10 @@ +CREATE OR REPLACE EXTERNAL TABLE `my-project.my_dataset.my_table` ( + `col1` INT64, + `col2` STRING +) WITH CONNECTION `my-connection` WITH PARTITION COLUMNS ( + `part1` DATE, + `part2` STRING +) OPTIONS ( + format='CSV', + uris=['gs://bucket/path*'] +) \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_if_not_exists/out.sql b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_if_not_exists/out.sql new file mode 100644 index 00000000000..e05a553317b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_if_not_exists/out.sql @@ -0,0 +1,6 @@ +CREATE EXTERNAL TABLE IF NOT EXISTS `my-project.my_dataset.my_table` ( + `col1` INT64 +) OPTIONS ( + format='CSV', + uris=['gs://bucket/path*'] +) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/sql/test_ddl.py b/tests/unit/core/compile/sqlglot/sql/test_ddl.py index 14d3708883d..48080cd6b9c 100644 --- a/tests/unit/core/compile/sqlglot/sql/test_ddl.py +++ b/tests/unit/core/compile/sqlglot/sql/test_ddl.py @@ -12,13 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import pytest +import bigframes.bigquery import bigframes.core.compile.sqlglot.sql as sql +import bigframes.session pytest.importorskip("pytest_snapshot") +@pytest.fixture +def mock_session(): + return mock.create_autospec(spec=bigframes.session.Session) + + def test_load_data_minimal(snapshot): expr = sql.load_data( "my-project.my_dataset.my_table", @@ -40,3 +49,39 @@ def test_load_data_all_options(snapshot): connection_name="my-connection", ) snapshot.assert_match(sql.to_sql(expr), "out.sql") + + +@mock.patch("bigframes.bigquery._operations.table._get_table_metadata") +def test_create_external_table(get_table_metadata_mock, mock_session, snapshot): + bigframes.bigquery.create_external_table( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + session=mock_session, + ) + mock_session.read_gbq_query.assert_called_once() + generated_sql = mock_session.read_gbq_query.call_args[0][0] + 
snapshot.assert_match(generated_sql, "out.sql") + get_table_metadata_mock.assert_called_once() + + +def test_create_external_table_all_options(snapshot): + expr = sql.create_external_table( + "my-project.my_dataset.my_table", + replace=True, + columns={"col1": "INT64", "col2": "STRING"}, + partition_columns={"part1": "DATE", "part2": "STRING"}, + connection_name="my-connection", + options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + snapshot.assert_match(sql.to_sql(expr), "out.sql") + + +def test_create_external_table_if_not_exists(snapshot): + expr = sql.create_external_table( + "my-project.my_dataset.my_table", + if_not_exists=True, + columns={"col1": "INT64"}, + options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + snapshot.assert_match(sql.to_sql(expr), "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py index 327b8e4206a..7b3ee5f9229 100644 --- a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py +++ b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py @@ -43,3 +43,83 @@ def to_pandas(series, *, ordered): session.read_pandas.assert_called_once() snapshot.assert_match(result, "out.sql") + + +def test_ai_forecast(snapshot, monkeypatch): + import bigframes.bigquery.ai + import bigframes.session + + session = mock.create_autospec(bigframes.session.Session) + bf_df = mock.create_autospec(bpd.DataFrame) + session.read_pandas.return_value = bf_df + + def mock_ai_forecast(df, **kwargs): + assert df is bf_df + result_df = mock.create_autospec(bpd.DataFrame) + result_df.to_pandas.return_value = kwargs + return result_df + + import bigframes.bigquery.ai + + monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_ai_forecast) + + df = pd.DataFrame({"date": ["2020-01-01"], "value": [1.0]}) + result = df.bigquery.ai.forecast( + timestamp_col="date", + data_col="value", + horizon=5, + session=session, + ) + + 
session.read_pandas.assert_called_once() + assert result == { + "timestamp_col": "date", + "data_col": "value", + "model": "TimesFM 2.0", + "id_cols": None, + "horizon": 5, + "confidence_level": 0.95, + "context_window": None, + "output_historical_time_series": False, + } + + +def test_bigframes_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot): + session = mock.create_autospec(bigframes.session.Session) + + result = scalar_types_df.bigquery.sql_scalar( + "ROUND({int64_col} + {int64_too})", + output_dtype=pd.Int64Dtype(), + session=session, + ) + + session.read_pandas.assert_not_called() + # Bigframes implementation returns a bigframes.series.Series + sql, _, _ = result.to_frame()._to_sql_query(include_index=True) + snapshot.assert_match(sql, "out.sql") + + +def test_bigframes_ai_forecast(snapshot, monkeypatch): + import bigframes.bigquery.ai + import bigframes.session + + session = mock.create_autospec(bigframes.session.Session) + bf_df = mock.create_autospec(bpd.DataFrame) + + def mock_ai_forecast(df, **kwargs): + assert df is bf_df + result_df = mock.create_autospec(bpd.DataFrame) + return result_df + + monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_ai_forecast) + + result = bf_df.bigquery.ai.forecast( + timestamp_col="date", + data_col="value", + horizon=5, + session=session, + ) + + session.read_pandas.assert_not_called() + # BigFrames accessor returns the bf_df directly without calling to_pandas + assert result is not None diff --git a/tests/unit/extensions/pandas/test_registration.py b/tests/unit/extensions/pandas/test_registration.py index 12580980916..7007d6f9f2f 100644 --- a/tests/unit/extensions/pandas/test_registration.py +++ b/tests/unit/extensions/pandas/test_registration.py @@ -22,6 +22,8 @@ def test_bigframes_import_registers_accessor(): df = pd.DataFrame({"a": [1]}) # If bigframes was imported, df.bigquery should exist assert hasattr(df, "bigquery") - from bigframes.extensions.pandas.dataframe_accessor import 
BigQueryDataFrameAccessor + from bigframes.extensions.pandas.dataframe_accessor import ( + PandasBigQueryDataFrameAccessor, + ) - assert isinstance(df.bigquery, BigQueryDataFrameAccessor) + assert isinstance(df.bigquery, PandasBigQueryDataFrameAccessor) diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index e9e0d0df677..bfb6192a2c4 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -12,36 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re - -import pandas import pytest -import bigframes.exceptions import bigframes.functions.function as bff from bigframes.testing import mocks -def test_series_input_types_to_str(): - """Check that is_row_processor=True uses str as the input type to serialize a row.""" - session = mocks.create_bigquery_session() - remote_function_decorator = bff.remote_function( - session=session, cloud_function_service_account="default" - ) - - with pytest.warns( - bigframes.exceptions.PreviewWarning, - match=re.escape("input_types=Series is in preview."), - ): - - @remote_function_decorator - def axis_1_function(myparam: pandas.Series) -> str: # type: ignore - return "Hello, " + myparam["str_col"] + "!" # type: ignore - - # Still works as a normal function. - assert axis_1_function(pandas.Series({"str_col": "World"})) == "Hello, World!" 
- - def test_missing_input_types(): session = mocks.create_bigquery_session() remote_function_decorator = bff.remote_function( @@ -78,36 +54,6 @@ def function_without_return_annotation(myparam: int): remote_function_decorator(function_without_return_annotation) -def test_deploy_remote_function(): - session = mocks.create_bigquery_session() - - def my_remote_func(x: int) -> int: - return x * 2 - - deployed = session.deploy_remote_function( - my_remote_func, cloud_function_service_account="test_sa@example.com" - ) - - # Test that the function would have been deployed somewhere. - assert deployed.bigframes_bigquery_function - - -def test_deploy_remote_function_with_name(): - session = mocks.create_bigquery_session() - - def my_remote_func(x: int) -> int: - return x * 2 - - deployed = session.deploy_remote_function( - my_remote_func, - name="my_custom_name", - cloud_function_service_account="test_sa@example.com", - ) - - # Test that the function would have been deployed somewhere. - assert "my_custom_name" in deployed.bigframes_bigquery_function - - def test_deploy_udf(): session = mocks.create_bigquery_session() diff --git a/tests/unit/functions/test_remote_function_utils.py b/tests/unit/functions/test_remote_function_utils.py index e200e7c12a1..5ca26fe96f6 100644 --- a/tests/unit/functions/test_remote_function_utils.py +++ b/tests/unit/functions/test_remote_function_utils.py @@ -41,82 +41,6 @@ def test_get_remote_function_locations( assert cf_region == expected_cf_region -@pytest.mark.parametrize( - "func_hash, session_id, uniq_suffix, expected_name", - [ - ( - "hash123", - None, - None, - "bigframes-hash123", - ), - ( - "hash456", - "session789", - None, - "bigframes-session789-hash456", - ), - ( - "hash123", - None, - "suffixABC", - "bigframes-hash123-suffixABC", - ), - ( - "hash456", - "session789", - "suffixDEF", - "bigframes-session789-hash456-suffixDEF", - ), - ], -) -def test_get_cloud_function_name(func_hash, session_id, uniq_suffix, expected_name): - 
"""Tests the construction of the cloud function name from its parts.""" - result = _utils.get_cloud_function_name(func_hash, session_id, uniq_suffix) - - assert result == expected_name - - -@pytest.mark.parametrize( - "function_hash, session_id, uniq_suffix, expected_name", - [ - ( - "hash123", - "session456", - None, - "bigframes_session456_hash123", - ), - ( - "hash789", - "sessionABC", - "suffixDEF", - "bigframes_sessionABC_hash789_suffixDEF", - ), - ], -) -def test_get_bigframes_function_name( - function_hash, session_id, uniq_suffix, expected_name -): - """Tests the construction of the BigQuery function name from its parts.""" - result = _utils.get_bigframes_function_name(function_hash, session_id, uniq_suffix) - - assert result == expected_name - - -def test_get_updated_package_requirements_no_extra_package(): - """Tests with no extra package.""" - result = _utils.get_updated_package_requirements(capture_references=False) - - assert result is None - - initial_packages = ["xgboost"] - result = _utils.get_updated_package_requirements( - initial_packages, capture_references=False - ) - - assert result == initial_packages - - @patch("bigframes.functions._utils.numpy.__version__", "1.24.4") @patch("bigframes.functions._utils.pyarrow.__version__", "14.0.1") @patch("bigframes.functions._utils.pandas.__version__", "2.0.3") @@ -162,7 +86,7 @@ def test_get_updated_package_requirements_capture_references_false(): # Case 1: Only capture_references=False. result_1 = _utils.get_updated_package_requirements(capture_references=False) - assert result_1 is None + assert len(result_1) == 0 # Case 2: capture_references=False but is_row_processor=True. 
expected_2 = ["numpy==1.24.4", "pandas==2.0.3", "pyarrow==14.0.1"] @@ -264,78 +188,6 @@ def test_package_existed_helper(): assert not _utils._package_existed([], "pandas") -def _function_add_one(x): - return x + 1 - - -def _function_add_two(x): - return x + 2 - - -@pytest.mark.parametrize( - "func1, func2, should_be_equal, description", - [ - ( - _function_add_one, - _function_add_one, - True, - "Identical functions should have the same hash.", - ), - ( - _function_add_one, - _function_add_two, - False, - "Different functions should have different hashes.", - ), - ], -) -def test_get_hash_without_package_requirements( - func1, func2, should_be_equal, description -): - """Tests function hashes without any requirements.""" - hash1 = _utils.get_hash(func1) - hash2 = _utils.get_hash(func2) - - if should_be_equal: - assert hash1 == hash2, f"FAILED: {description}" - else: - assert hash1 != hash2, f"FAILED: {description}" - - -@pytest.mark.parametrize( - "reqs1, reqs2, should_be_equal, description", - [ - ( - None, - ["pandas>=1.0"], - False, - "Hash with or without requirements should differ from hash.", - ), - ( - ["pandas", "numpy", "scikit-learn"], - ["numpy", "scikit-learn", "pandas"], - True, - "Same requirements should produce the same hash.", - ), - ( - ["pandas==1.0"], - ["pandas==2.0"], - False, - "Different requirement versions should produce different hashes.", - ), - ], -) -def test_get_hash_with_package_requirements(reqs1, reqs2, should_be_equal, description): - """Tests how package requirements affect the final hash.""" - hash1 = _utils.get_hash(_function_add_one, package_requirements=reqs1) - hash2 = _utils.get_hash(_function_add_one, package_requirements=reqs2) - - if should_be_equal: - assert hash1 == hash2, f"FAILED: {description}" - else: - assert hash1 != hash2, f"FAILED: {description}" - - # Helper functions for signature inspection tests def _func_one_arg_annotated(x: int) -> int: """A function with one annotated arg and an annotated return type.""" 
diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 0e600de9645..bb2b57d4090 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -71,3 +71,11 @@ def test_infer_literal_type_arrow_scalar(scalar, expected_dtype): ) def test_contains_db_dtypes_json_arrow_type(type_, expected): assert bigframes.dtypes.contains_db_dtypes_json_arrow_type(type_) == expected + + +def test_convert_to_schema_field_list_description(): + bf_dtype = bigframes.dtypes.OBJ_REF_DTYPE + list_bf_dtype = bigframes.dtypes.list_type(bf_dtype) + field = bigframes.dtypes.convert_to_schema_field("my_list", list_bf_dtype) + assert field.description == "bigframes_dtype: OBJ_REF_DTYPE" + assert field.mode == "REPEATED" diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index b342c7e4a99..1d3f63d216d 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -21,9 +21,6 @@ schema_from_bigquery_table, ) from bigframes_vendored.ibis.backends.bigquery.datatypes import BigQuerySchema -from bigframes_vendored.ibis.backends.bigquery.udf.core import ( - PythonToJavaScriptTranslator, -) from bigframes_vendored.ibis.backends.sql import SQLBackend from bigframes_vendored.ibis.backends.sql.compilers import BigQueryCompiler from bigframes_vendored.ibis.backends.sql.datatypes import BigQueryType @@ -731,15 +728,7 @@ def compile( ): """Compile an Ibis expression to a SQL string.""" query = self._to_sqlglot(expr, limit=limit, params=params, **kwargs) - udf_sources = [] - for udf_node in expr.op().find(ops.ScalarUDF): - compile_func = getattr( - self, f"_compile_{udf_node.__input_type__.name.lower()}_udf" - ) - if sql := compile_func(udf_node): - udf_sources.append(sql.sql(self.name, pretty=True)) - - sql = ";\n".join([*udf_sources, query.sql(dialect=self.name, pretty=True)]) + sql = 
query.sql(dialect=self.name, pretty=True) self._log(sql) return sql @@ -1186,68 +1175,6 @@ def _clean_up_cached_table(self, name): force=True, ) - def _get_udf_source(self, udf_node: ops.ScalarUDF): - name = type(udf_node).__name__ - type_mapper = self.compiler.udf_type_mapper - - body = PythonToJavaScriptTranslator(udf_node.__func__).compile() - config = udf_node.__config__ - libraries = config.get("libraries", []) - - signature = [ - sge.ColumnDef( - this=sg.to_identifier(name, quoted=self.compiler.quoted), - kind=type_mapper.from_ibis(param.annotation.pattern.dtype), - ) - for name, param in udf_node.__signature__.parameters.items() - ] - - lines = ['"""'] - - if config.get("strict", True): - lines.append('"use strict";') - - lines += [ - body, - "", - f"return {udf_node.__func_name__}({', '.join(udf_node.argnames)});", - '"""', - ] - - func = sge.Create( - kind="FUNCTION", - this=sge.UserDefinedFunction( - this=sg.to_identifier(name), expressions=signature, wrapped=True - ), - # not exactly what I had in mind, but it works - # - # quoting is too simplistic to handle multiline strings - expression=sge.Var(this="\n".join(lines)), - exists=False, - properties=sge.Properties( - expressions=[ - sge.TemporaryProperty(), - sge.ReturnsProperty(this=type_mapper.from_ibis(udf_node.dtype)), - sge.StabilityProperty( - this="IMMUTABLE" if config.get("determinism") else "VOLATILE" - ), - sge.LanguageProperty(this=sg.to_identifier("js")), - ] - + [ - sge.Property( - this=sg.to_identifier("library"), - value=self.compiler.f.array(*libraries), - ) - ] - * bool(libraries) - ), - ) - - return func - - def _compile_python_udf(self, udf_node: ops.ScalarUDF) -> None: - return self._get_udf_source(udf_node) - def _register_udfs(self, expr: ir.Expr) -> None: """No op because UDFs made with CREATE TEMPORARY FUNCTION must be followed by a query.""" diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/__init__.py 
b/third_party/bigframes_vendored/ibis/backends/bigquery/udf/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/core.py b/third_party/bigframes_vendored/ibis/backends/bigquery/udf/core.py deleted file mode 100644 index 6f59a2becd7..00000000000 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/core.py +++ /dev/null @@ -1,604 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/backends/bigquery/udf/core.py - -"""Translate a Python AST to JavaScript.""" - -from __future__ import annotations - -import ast -from collections import ChainMap -import contextlib -import functools -import inspect -import textwrap -from typing import TYPE_CHECKING - -from bigframes_vendored.ibis.backends.bigquery.udf.find import find_names -from bigframes_vendored.ibis.backends.bigquery.udf.rewrite import rewrite - -if TYPE_CHECKING: - from collections.abc import Callable - - -class SymbolTable(ChainMap): - """ChainMap subclass implementing scope for the translator. - - Notes - ----- - JavaScript requires declarations in strict mode, so to implement this we - shove a "let" at the beginning of every variable name if it doesn't already - exist in the current scope. - - """ - - def __getitem__(self, key): - if key not in self: - self[key] = key - return f"let {key}" - return key - - -def indent(lines, spaces=4): - """Indent `lines` by `spaces` spaces. 
- - Parameters - ---------- - lines : Union[str, List[str]] - A string or list of strings to indent - spaces : int - The number of spaces to indent `lines` - - Returns - ------- - indented_lines : str - - """ - if isinstance(lines, str): - text = [lines] - text = "\n".join(lines) - return textwrap.indent(text, " " * spaces) - - -def semicolon(f: Callable) -> Callable: - """Add a semicolon to the result of a `visit_*` call.""" - - @functools.wraps(f) - def wrapper(*args, **kwargs): - return f(*args, **kwargs) + ";" - - return wrapper - - -@rewrite.register(ast.Call(func=ast.Name(id="print"))) -def rewrite_print(node): - return ast.Call( - func=ast.Attribute( - value=ast.Name(id="console", ctx=ast.Load()), - attr="log", - ctx=ast.Load(), - ), - args=node.args, - keywords=node.keywords, - ) - - -@rewrite.register(ast.Call(func=ast.Name(id="len"))) -def rewrite_len(node): - assert len(node.args) == 1 - return ast.Attribute(value=node.args[0], attr="length", ctx=ast.Load()) - - -@rewrite.register(ast.Call(func=ast.Attribute(attr="append"))) -def rewrite_append(node): - return ast.Call( - func=ast.Attribute(value=node.func.value, attr="push", ctx=ast.Load()), - args=node.args, - keywords=node.keywords, - ) - - -@rewrite.register( - ast.Call(func=ast.Attribute(value=ast.Name(id="Array"), attr="from_")) -) -def rewrite_array_from(node): - return ast.Call( - func=ast.Attribute(value=node.func.value, attr="from"), - args=node.args, - keywords=node.keywords, - ) - - -class PythonToJavaScriptTranslator: - constructor_map = { - "list": "Array", - "Array": "Array", - "Date": "Date", - "dict": "Object", - "Map": "Map", - "WeakMap": "WeakMap", - "str": "String", - "String": "String", - "set": "Set", - "Set": "Set", - "WeakSet": "WeakSet", - } - - def __init__(self, function): - self.function = function - self.source = textwrap.dedent(inspect.getsource(function)) - self.ast = ast.parse(self.source) - self.scope = SymbolTable() - self.current_function = None - self.current_class = 
None - self.is_generator = False - self.is_nested_definition = False - - def compile(self): - return self.visit(self.ast) - - def visit(self, node): - node = rewrite(node) - typename = node.__class__.__name__ - method_name = f"visit_{typename}" - method = getattr(self, method_name, None) - if method is None: - raise NotImplementedError(f"{method_name!r} nodes not yet implemented") - assert callable(method) - - result = method(node) - return result - - def visit_Name(self, node): - if self.current_class is not None and node.id == "self": - return "this" - return node.id - - def visit_Yield(self, node): - self.is_generator = True - return f"yield {self.visit(node.value)}" - - def visit_YieldFrom(self, node): - self.is_generator = True - return f"yield* {self.visit(node.value)}" - - @semicolon - def visit_Assign(self, node): - try: - (target,) = node.targets - except ValueError: - raise NotImplementedError("Only single assignment supported for now") - - if not isinstance(target, (ast.Name, ast.Subscript, ast.Attribute)): - raise NotImplementedError( - "Only index, attribute, and variable name assignment " - f"supported, got {type(target).__name__}" - ) - - is_name = isinstance(target, ast.Name) - compiled_target = self.visit(target) - if not is_name or ( - self.current_class is not None and compiled_target.startswith("this.") - ): - self.scope[compiled_target] = compiled_target - return f"{self.scope[compiled_target]} = {self.visit(node.value)}" - - def translate_special_method(self, name): - return {"__init__": "constructor"}.get(name, name) - - def visit_FunctionDef(self, node): - self.current_function = node - - is_property_getter = any( - getattr(dec, "id", None) == "property" for dec in node.decorator_list - ) - - if self.current_class is None: # not a method - if is_property_getter: - raise TypeError("Functions cannot be properties, only methods can") - prefix = "function" - else: - if is_property_getter and self.is_generator: - raise TypeError("generator 
methods cannot be properties") - prefix = "get " * is_property_getter - - with self.local_scope(): - body = indent(map(self.visit, node.body)) - - if self.is_generator: - prefix += "* " - else: - prefix += " " * (self.current_class is None) - - lines = [ - prefix - + self.translate_special_method(node.name) - + f"({self.visit(node.args)}) {{", - body, - "}", - ] - - self.current_function = None - self.is_generator = False - return "\n".join(lines) - - @semicolon - def visit_Return(self, node): - return f"return {self.visit(node.value)}" - - def visit_Add(self, node): - return "+" - - def visit_Sub(self, node): - return "-" - - def visit_Mult(self, node): - return "*" - - def visit_Div(self, node): - return "/" - - def visit_FloorDiv(self, node): - raise AssertionError("should never reach FloorDiv") - - def visit_Pow(self, node): - raise AssertionError("should never reach Pow") - - def visit_UnaryOp(self, node): - return f"({self.visit(node.op)}{self.visit(node.operand)})" - - def visit_USub(self, node): - return "-" - - def visit_UAdd(self, node): - return "+" - - def visit_BinOp(self, node): - left, op, right = node.left, node.op, node.right - - if isinstance(op, ast.Pow): - return f"Math.pow({self.visit(left)}, {self.visit(right)})" - elif isinstance(op, ast.FloorDiv): - return f"Math.floor({self.visit(left)} / {self.visit(right)})" - return f"({self.visit(left)} {self.visit(op)} {self.visit(right)})" - - def visit_Constant(self, node): - value = node.value - if value is None: - return "null" - if isinstance(value, bool): - return "true" if value else "false" - if isinstance(value, (int, float, str)): - return repr(value) - raise NotImplementedError( - f"{value.__class__.__name__!r} constants not yet implemented" - ) - - def visit_NameConstant(self, node): - value = node.value - if value is True: - return "true" - elif value is False: - return "false" - assert ( - value is None - ), f"value is not True and is not False, must be None, got {value}" - return "null" 
- - def visit_Str(self, node): - return repr(node.s) - - def visit_Num(self, node): - return repr(node.n) - - def visit_List(self, node): - return "[{}]".format(", ".join(map(self.visit, node.elts))) - - def visit_Tuple(self, node): - # tuples becomes lists in javascript - return "[{}]".format(", ".join(map(self.visit, node.elts))) - - def visit_Dict(self, node): - return "{{{}}}".format( - ", ".join( - f"[{self.visit(key)}]: {self.visit(value)}" - for key, value in zip(node.keys, node.values) - ) - ) - - @semicolon - def visit_Expr(self, node): - return self.visit(node.value) - - def visit_Starred(self, node): - return f"...{self.visit(node.value)}" - - def visit_Call(self, node): - thing_to_call = self.visit(node.func) - constructors = self.__class__.constructor_map - args = ", ".join(map(self.visit, node.args)) - try: - thing_to_call = constructors[thing_to_call] - except KeyError: - format_string = "{}({})" - else: - format_string = "(new {}({}))" - return format_string.format(thing_to_call, args) - - def visit_Attribute(self, node): - return f"{self.visit(node.value)}.{node.attr}" - - def visit_For(self, node): - lines = [f"for (let {self.visit(node.target)} of {self.visit(node.iter)}) {{"] - with self.local_scope(): - lines.append(indent(map(self.visit, node.body))) - lines.append("}") - return "\n".join(lines) - - def visit_While(self, node): - lines = [f"while ({self.visit(node.test)}) {{"] - with self.local_scope(): - lines.append(indent(map(self.visit, node.body))) - lines.append("}") - return "\n".join(lines) - - @semicolon - def visit_Break(self, node): - return "break" - - @semicolon - def visit_Continue(self, node): - return "continue" - - def visit_Eq(self, node): - return "===" - - def visit_NotEq(self, node): - return "!==" - - def visit_Or(self, node): - return "||" - - def visit_And(self, node): - return "&&" - - def visit_BoolOp(self, node): - return "({})".format( - f" {self.visit(node.op)} ".join(map(self.visit, node.values)) - ) - - def 
visit_Lt(self, node): - return "<" - - def visit_LtE(self, node): - return "<=" - - def visit_Gt(self, node): - return ">" - - def visit_GtE(self, node): - return ">=" - - def visit_Compare(self, node): - rights = node.comparators - ops = node.ops - - left = node.left - comparisons = [] - for op, right in zip(ops, rights): - comparisons.append( - f"({self.visit(left)} {self.visit(op)} {self.visit(right)})" - ) - left = right - return " && ".join(comparisons) - - @semicolon - def visit_AugAssign(self, node): - target = self.visit(node.target) - op = self.visit(node.op) - value = self.visit(node.value) - return f"{target} {op}= {value}" - - def visit_Module(self, node): - return "\n\n".join(map(self.visit, node.body)) - - def visit_arg(self, node): - if self.current_class is not None and node.arg == "self": - return "" - return node.arg - - def visit_arguments(self, node): - args = list(filter(None, map(self.visit, node.args[:]))) - vararg = node.vararg - if vararg is not None: - args.append(f"...{vararg.arg}") - return ", ".join(args) - - def visit_Lambda(self, node): - args = node.args - generated_args = self.visit(args) - return f"(({generated_args}) => {self.visit(node.body)})" - - @contextlib.contextmanager - def local_scope(self): - """Assign symbols to local variables.""" - self.scope = self.scope.new_child() - try: - yield self.scope - finally: - self.scope = self.scope.parents - - def visit_If(self, node): - lines = [f"if ({self.visit(node.test)}) {{"] - - with self.local_scope(): - lines.append(indent(map(self.visit, node.body))) - lines.append("}") - - if node.orelse: - lines[-1] += " else {" - with self.local_scope(): - lines.append(indent(map(self.visit, node.orelse))) - lines.append("}") - return "\n".join(lines) - - def visit_IfExp(self, node): - test = self.visit(node.test) - body = self.visit(node.body) - orelse = self.visit(node.orelse) - return f"({test} ? 
{body} : {orelse})" - - def visit_Index(self, node): - return self.visit(node.value) - - def visit_Subscript(self, node): - return f"{self.visit(node.value)}[{self.visit(node.slice)}]" - - def visit_ClassDef(self, node): - self.current_class = node - bases = node.bases - - lines = [f"class {node.name}"] - if bases: - lines[-1] += " extends {}".format(", ".join(map(self.visit, bases))) - lines[-1] += " {" - lines.append(indent(map(self.visit, node.body))) - lines.append("}") - self.current_class = None - self.__class__.constructor_map[node.name] = node.name - return "\n".join(lines) - - def visit_Not(self, node): - return "!" - - def visit_ListComp(self, node): - """Generate a curried lambda function. - - [x + y for x, y in [[1, 4], [2, 5], [3, 6]]] - - becomes - - [[1, 4], [2, 5], [3, 6]]].map(([x, y]) => x + y) - """ - try: - (generator,) = node.generators - except ValueError: - raise NotImplementedError("Only single loop comprehensions are allowed") - - names = find_names(generator.target) - argslist = [ast.arg(arg=name.id, annotation=None) for name in names] - if len(names) <= 1: - signature = ast.arguments( - args=argslist, - vararg=None, - kwonlyargs=[], - kw_defaults=[], - kwarg=None, - defaults=[], - ) - else: - signature = ast.List(elts=argslist, ctx=ast.Load()) - - array = generator.iter - lam_sig = functools.partial(ast.Lambda, args=signature) - - filters = generator.ifs - if filters: - filt = ast.BoolOp(op=ast.And(), values=filters) - # array.filter - method = ast.Attribute(value=array, attr="filter", ctx=ast.Load()) - # array.filter(func) - array = ast.Call(func=method, args=[lam_sig(body=filt)], keywords=[]) - - method = ast.Attribute(value=array, attr="map", ctx=ast.Load()) - mapped = ast.Call(func=method, args=[lam_sig(body=node.elt)], keywords=[]) - result = self.visit(mapped) - return result - - def visit_Delete(self, node): - return "\n".join(f"delete {self.visit(target)};" for target in node.targets) - - -if __name__ == "__main__": - import 
bigframes_vendored.ibis - from bigframes_vendored.ibis import udf - - @udf.scalar.python(strict=False) - def my_func(a: float, b: float, n: float) -> list[float]: - class Rectangle: - def __init__(self, width, height): - self.width = width - self.height = height - - @property - def area(self): - return self.width * self.height - - @property - def perimeter(self): - return self.width * 2 + self.height * 2 - - def foobar(self, n): - yield from range(n) - - def sum(values): - result = 0 - for value in values: - result += value - console.log(result) # noqa: F821 - return values.reduce(lambda a, b: a + b, 0) - - def range(n): - i = 0 - while i < n: - yield i - i += 1 - - some_stuff = [x + y for x, y in [[1, 4], [2, 5], [3, 6]] if 2 < x < 3] - some_stuff1 = [range(x) for x in [1, 2, 3]] - some_stuff2 = [x + y for x, y in [(1, 4), (2, 5), (3, 6)]] - print(some_stuff) # noqa: T201 - print(some_stuff1) # noqa: T201 - print(some_stuff2) # noqa: T201 - - x = 1 - y = 2 - x = 3 - values = [] - for i in range(10): - values.append(i) - - i = 0 - foo = 2 - bar = lambda x: x # noqa: E731 - bazel = lambda x: y # noqa: E731 - while i < n: - foo = bar(bazel(10)) - i += 1 - console.log(i) # noqa: F821 - - foo = 2 - - if i == 10 and (y < 2 or i != 42): - y += 2 - else: - y -= 2 - - z = 42.0 - w = 3 - w = not False - yyz = None - print(yyz) # noqa: T201 - foobar = x < y < z < w # x < y and y < z and z < w - foobar = 1 - baz = foobar // 3 - console.log(baz) # noqa: F821 - - my_obj = {"a": 1, "b": 2} # noqa: F841 - - z = (x if y else b) + 2 + foobar - foo = Rectangle(1, 2) - nnn = len(values) - return [sum(values) - a + b * y**-x, z, foo.width, nnn] - - print( - bigframes_vendored.ibis.bigquery.compile(my_func(42.7, 13.2, 1)) - ) # noqa: T201 diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/find.py b/third_party/bigframes_vendored/ibis/backends/bigquery/udf/find.py deleted file mode 100644 index b1f353ae4f1..00000000000 --- 
a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/find.py +++ /dev/null @@ -1,64 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/backends/bigquery/udf/find.py - -from __future__ import annotations - -import ast - -import toolz - - -class NameFinder: - """Helper class to find the unique names in an AST.""" - - __slots__ = () - - def find(self, node): - typename = type(node).__name__ - method = getattr(self, f"find_{typename}", None) - if method is None: - fields = getattr(node, "_fields", None) - if fields is None: - return - for field in fields: - value = getattr(node, field) - yield from self.find(value) - else: - yield from method(node) - - def find_Name(self, node): - # TODO not sure if this is robust to scope changes - yield node - - def find_list(self, node): - return list(toolz.concat(map(self.find, node))) - - def find_Call(self, node): - if not isinstance(node.func, ast.Name): - fields = node._fields - else: - fields = [field for field in node._fields if field != "func"] - return toolz.concat(map(self.find, (getattr(node, field) for field in fields))) - - -def find_names(node: ast.AST) -> list[ast.Name]: - """Return the unique `ast.Name` instances in an AST. 
- - Examples - -------- - >>> import ast - >>> node = ast.parse("a + b") - >>> names = find_names(node) - >>> names - [<....Name object at 0x...>, <....Name object at 0x...>] - >>> names[0].id - 'a' - >>> names[1].id - 'b' - - """ - return list( - toolz.unique( - filter(None, NameFinder().find(node)), - key=lambda node: (node.id, type(node.ctx)), - ) - ) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/rewrite.py b/third_party/bigframes_vendored/ibis/backends/bigquery/udf/rewrite.py deleted file mode 100644 index 6d2b0df7cdf..00000000000 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/rewrite.py +++ /dev/null @@ -1,54 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/backends/bigquery/udf/rewrite.py - -from __future__ import annotations - -import ast -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Callable - - -def matches(value: ast.AST, pattern: ast.AST) -> bool: - """Check whether `value` matches `pattern`.""" - # types must match exactly - if type(value) is not type(pattern): - return False - - # primitive value, such as None, True, False etc - if not isinstance(value, ast.AST) and not isinstance(pattern, ast.AST): - return value == pattern - - fields = [ - (field, getattr(pattern, field)) - for field in pattern._fields - if hasattr(pattern, field) - ] - return all( - matches(getattr(value, field_name), field_value) - for field_name, field_value in fields - ) - - -class Rewriter: - """AST pattern matcher to enable rewrite rules.""" - - def __init__(self): - self.funcs: list[tuple[ast.AST, Callable[[ast.expr], ast.expr]]] = [] - - def register(self, pattern): - def wrapper(f): - self.funcs.append((pattern, f)) - return f - - return wrapper - - def __call__(self, node): - # TODO: more efficient way of doing this? 
- for pattern, func in self.funcs: - if matches(node, pattern): - return func(node) - return node - - -rewrite = Rewriter() diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index b95e4280538..341b25ca1c5 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -1394,9 +1394,17 @@ def _generate_groups(groups): return map(sge.convert, range(1, len(groups) + 1)) def visit_Aggregate(self, op, *, parent, groups, metrics): - sel = sg.select( - *self._cleanup_names(groups), *self._cleanup_names(metrics), copy=False - ).from_(parent, copy=False) + exprs = [] + if groups: + exprs.extend(self._cleanup_names(groups)) + if metrics: + exprs.extend(self._cleanup_names(metrics)) + + if not exprs: + # Empty aggregated projections are invalid in BigQuery + exprs = [sge.Literal.number(1)] + + sel = sg.select(*exprs, copy=False).from_(parent, copy=False) if groups: sel = sel.group_by(*self._generate_groups(groups.values()), copy=False) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 1fa5432a166..cd462f9e8f5 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -540,6 +540,15 @@ def visit_TimestampFromUNIX(self, op, *, arg, unit): def visit_Cast(self, op, *, arg, to): from_ = op.arg.dtype + if to.is_null(): + return sge.Null() + if arg is NULL or ( + isinstance(arg, sge.Cast) + and getattr(arg, "to", None) is not None + and str(arg.to).upper() == "NULL" + ): + if to.is_struct() or to.is_array(): + return sge.Cast(this=NULL, to=self.type_mapper.from_ibis(to)) if from_.is_timestamp() and to.is_integer(): return self.f.unix_micros(arg) 
elif from_.is_integer() and to.is_timestamp(): diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f04d9989dd4..09b2a4045ff 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2026,7 +2026,6 @@ def where(self, cond, other): **Examples:** - >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2097,14 +2096,14 @@ def where(self, cond, other): with corresponding value from other. If cond is callable, it is computed on the Series/DataFrame and returns boolean Series/DataFrame or array. The callable must not change input - Series/DataFrame (though pandas doesn’t check it). + Series/DataFrame. other (scalar, DataFrame, or callable): Entries where cond is False are replaced with corresponding value from other. If other is callable, it is computed on the DataFrame and returns scalar or DataFrame. The callable must not - change input DataFrame (though pandas doesn’t check it). If not - specified, entries will be filled with the corresponding NULL - value (np.nan for numpy dtypes, pd.NA for extension dtypes). + change input DataFrame. If not specified, entries will be filled + with the corresponding NULL value (np.nan for numpy dtypes, + pd.NA for extension dtypes). Returns: DataFrame: DataFrame after the replacement. 
diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 655f801b3df..c5f9f8330f6 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -1,5 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py +from __future__ import annotations + from datetime import date, datetime from typing import List, Mapping, Tuple, Union diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 3190c92b921..79faa53de2b 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -86,9 +86,11 @@ def read_gbq( ... WHERE year = 2016 ... GROUP BY pitcherFirstName, pitcherLastName ... ''', index_col="rowindex") - >>> df.head(2) + >>> print("START_OF_OUTPUT"); df.head(2) # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE + START_OF_OUTPUT + ... pitcherFirstName pitcherLastName averagePitchSpeed - rowindex + ... 1 Albertin Chapman 96.514113 2 Zachary Britton 94.591039 diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 4928dd5c209..8352be131df 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.38.0" +__version__ = "2.39.0" # {x-release-please-start-date} -__release_date__ = "2026-03-16" +__release_date__ = "2026-03-31" # {x-release-please-end}