From 2e5311e2242b039da4c8e37b7b48942fa8ed34c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 17 Mar 2026 00:49:49 +0000 Subject: [PATCH 01/25] docs: gemini retouch of the index page for seo (#2514) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/bigquery/__init__.py | 35 +++++++++++++++-- bigframes/bigquery/ai.py | 46 +++++++++++++++++++++-- bigframes/pandas/__init__.py | 59 ++++++++++++++++++++++++++++- docs/index.rst | 68 +++++++++++++++++----------------- 4 files changed, 168 insertions(+), 40 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 14e0f315dc..f083887045 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -12,9 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""This module integrates BigQuery built-in functions for use with DataFrame objects, -such as array functions: -https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ +""" +Access BigQuery-specific operations and namespaces within BigQuery DataFrames. + +This module provides specialized functions and sub-modules that expose BigQuery's +advanced capabilities to DataFrames and Series. It acts as a bridge between the +pandas-compatible API and the full power of BigQuery SQL. 
+ +Key sub-modules include: + +* :mod:`bigframes.bigquery.ai`: Generative and predictive AI functions (Gemini, BQML). +* :mod:`bigframes.bigquery.ml`: Direct access to BigQuery ML model operations. +* :mod:`bigframes.bigquery.obj`: Support for BigQuery object tables. + +This module also provides direct access to optimized BigQuery functions for: + +* **JSON Processing:** High-performance functions like ``json_extract``, ``json_value``, + and ``parse_json`` for handling semi-structured data. +* **Geospatial Analysis:** Comprehensive geographic functions such as ``st_area``, + ``st_distance``, and ``st_centroid`` (``ST_`` prefixed functions). +* **Array Operations:** Tools for working with BigQuery arrays, including ``array_agg`` + and ``array_length``. +* **Vector Search:** Integration with BigQuery's vector search and indexing + capabilities for high-dimensional data. +* **Custom SQL:** The ``sql_scalar`` function allows embedding raw SQL snippets for + advanced operations not yet directly mapped in the API. + +By using these functions, you can leverage BigQuery's high-performance engine for +domain-specific tasks while maintaining a Python-centric development experience. + +For the full list of BigQuery standard SQL functions, see: +https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-reference +""" import sys diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index bb24d5dc33..25a7df7781 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -12,9 +12,49 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""This module integrates BigQuery built-in AI functions for use with Series/DataFrame objects, -such as AI.GENERATE_BOOL: -https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool""" +""" +Integrate BigQuery built-in AI functions into your BigQuery DataFrames workflow. 
+ +The ``bigframes.bigquery.ai`` module provides a Pythonic interface to leverage BigQuery ML's +generative AI and predictive functions directly on BigQuery DataFrames and Series objects. +These functions enable you to perform advanced AI tasks at scale without moving data +out of BigQuery. + +Key capabilities include: + +* **Generative AI:** Use :func:`bigframes.bigquery.ai.generate` (Gemini) to + perform text analysis, translation, or + content generation. Specialized versions like + :func:`~bigframes.bigquery.ai.generate_bool`, + :func:`~bigframes.bigquery.ai.generate_int`, and + :func:`~bigframes.bigquery.ai.generate_double` are available for structured + outputs. +* **Embeddings:** Generate vector embeddings for text using + :func:`~bigframes.bigquery.ai.generate_embedding`, which are essential for + semantic search and retrieval-augmented generation (RAG) workflows. +* **Classification and Scoring:** Apply machine learning models to your data for + predictive tasks with :func:`~bigframes.bigquery.ai.classify` and + :func:`~bigframes.bigquery.ai.score`. +* **Forecasting:** Predict future values in time-series data using + :func:`~bigframes.bigquery.ai.forecast`. + +**Example usage:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + + >>> df = bpd.DataFrame({ + ... "text_input": [ + ... "Is this a positive review? The food was terrible.", + ... ], + ... 
}) # doctest: +SKIP + + >>> # Assuming a Gemini model has been created in BigQuery as 'my_gemini_model' + >>> result = bq.ai.generate_text("my_gemini_model", df["text_input"]) # doctest: +SKIP + +For more information on the underlying BigQuery ML syntax, see: +https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool +""" from bigframes.bigquery._operations.ai import ( classify, diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index fcb60bf778..4db900e776 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -12,7 +12,64 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""BigQuery DataFrames provides a DataFrame API backed by the BigQuery engine.""" +""" +The primary entry point for the BigQuery DataFrames (BigFrames) pandas-compatible API. + +**BigQuery DataFrames** provides a Pythonic DataFrame and machine learning (ML) API +powered by the BigQuery engine. The ``bigframes.pandas`` module implements a large +subset of the pandas API, allowing you to perform large-scale data analysis +using familiar pandas syntax while the computations are executed in the cloud. + +**Key Features:** + +* **Petabyte-Scale Scalability:** Handle datasets that exceed local memory by + offloading computation to the BigQuery distributed engine. +* **Pandas Compatibility:** Use common pandas methods like + :func:`~bigframes.pandas.DataFrame.groupby`, + :func:`~bigframes.pandas.DataFrame.merge`, + :func:`~bigframes.pandas.DataFrame.pivot_table`, and more on BigQuery-backed + :class:`~bigframes.pandas.DataFrame` objects. +* **Direct BigQuery Integration:** Read from and write to BigQuery tables and + queries with :func:`bigframes.pandas.read_gbq` and + :func:`bigframes.pandas.DataFrame.to_gbq`. 
+* **User-defined Functions (UDFs):** Effortlessly deploy Python functions + functions using the :func:`bigframes.pandas.remote_function` and + :func:`bigframes.pandas.udf` decorators. +* **Data Ingestion:** Support for various formats including CSV, Parquet, JSON, + and Arrow via :func:`bigframes.pandas.read_csv`, + :func:`bigframes.pandas.read_parquet`, etc., which are automatically uploaded + to BigQuery for processing. Convert any pandas DataFrame into a BigQuery + DataFrame using :func:`bigframes.pandas.read_pandas`. + +**Example usage:** + + >>> import bigframes.pandas as bpd + +Initialize session and set options. + + >>> bpd.options.bigquery.project = "your-project-id" # doctest: +SKIP + +Load data from a BigQuery public dataset. + + >>> df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") # doctest: +SKIP + +Perform familiar pandas operations that execute in the cloud. + + >>> top_names = ( + ... df.groupby("name") + ... .agg({"number": "sum"}) + ... .sort_values("number", ascending=False) + ... .head(10) + ... ) # doctest: +SKIP + +Bring the final, aggregated results back to local memory if needed. + + >>> local_df = top_names.to_pandas() # doctest: +SKIP + +BigQuery DataFrames is designed for data scientists and analysts who need the +power of BigQuery with the ease of use of pandas. It eliminates the "data +movement bottleneck" by keeping your data in BigQuery for processing. +""" from __future__ import annotations diff --git a/docs/index.rst b/docs/index.rst index 00c59a6745..19b05bc1b6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,46 +1,52 @@ .. BigQuery DataFrames documentation main file -Welcome to BigQuery DataFrames -============================== +Scalable Python Data Analysis with BigQuery DataFrames (BigFrames) +================================================================== -**BigQuery DataFrames** (``bigframes``) provides a Pythonic interface for data analysis that scales to petabytes. 
It gives you the best of both worlds: the familiar API of **pandas** and **scikit-learn**, powered by the distributed computing engine of **BigQuery**. +.. meta:: + :description: BigQuery DataFrames (BigFrames) provides a scalable, pandas-compatible Python API for data analysis and machine learning on petabyte-scale datasets using the BigQuery engine. -BigQuery DataFrames consists of three main components: +**BigQuery DataFrames** (``bigframes``) is an open-source Python library that brings the power of **distributed computing** to your data science workflow. By providing a familiar **pandas** and **scikit-learn** compatible API, BigFrames allows you to analyze and model massive datasets where they live—directly in **BigQuery**. -* **bigframes.pandas**: A pandas-compatible API for data exploration and transformation. -* **bigframes.ml**: A scikit-learn-like interface for BigQuery ML, including integration with Gemini. -* **bigframes.bigquery**: Specialized functions for managing BigQuery resources and deploying custom logic. +Why Choose BigQuery DataFrames? +------------------------------- -Why BigQuery DataFrames? ------------------------- +BigFrames eliminates the "data movement bottleneck." Instead of downloading large datasets to a local environment, BigFrames translates your Python code into optimized SQL, executing complex transformations across the BigQuery fleet. -BigFrames allows you to process data where it lives. Instead of downloading massive datasets to your local machine, BigFrames translates your Python code into SQL and executes it across the BigQuery fleet. +* **Petabyte-Scale Scalability:** Effortlessly process datasets that far exceed local memory limits. +* **Familiar Python Ecosystem:** Use the same ``read_gbq``, ``groupby``, ``merge``, and ``pivot_table`` functions you already know from pandas. 
+* **Integrated Machine Learning:** Access BigQuery ML's powerful algorithms via a scikit-learn-like interface (``bigframes.ml``), including seamless **Gemini AI** integration. +* **Enterprise-Grade Security:** Maintain data governance and security by keeping your data within the BigQuery perimeter. +* **Hybrid Flexibility:** Easily move between distributed BigQuery processing and local pandas analysis with ``to_pandas()``. -* **Scalability:** Work with datasets that exceed local memory limits without complex refactoring. -* **Collaboration & Extensibility:** Bridge the gap between Python and SQL. Deploy custom Python functions to BigQuery, making your logic accessible to SQL-based teammates and data analysts. -* **Production-Ready Pipelines:** Move seamlessly from interactive notebooks to production. BigFrames simplifies data engineering by integrating with tools like **dbt** and **Airflow**, offering a simpler operational model than Spark. -* **Security & Governance:** Keep your data within the BigQuery perimeter. Benefit from enterprise-grade security, auditing, and data governance throughout your entire Python workflow. -* **Familiarity:** Use ``read_gbq``, ``merge``, ``groupby``, and ``pivot_table`` just like you do in pandas. +Core Components of BigFrames +---------------------------- -Quickstart ----------- +BigQuery DataFrames is organized into specialized modules designed for the modern data stack: -Install the library via pip: +1. :mod:`bigframes.pandas`: A high-performance, pandas-compatible API for scalable data exploration, cleaning, and transformation. +2. :mod:`bigframes.bigquery`: Specialized utilities for direct BigQuery resource management, including integrations with Gemini and other AI models in the :mod:`bigframes.bigquery.ai` submodule. + + +Quickstart: Scalable Data Analysis in Seconds +--------------------------------------------- + +Install BigQuery DataFrames via pip: .. 
code-block:: bash pip install --upgrade bigframes -Load and aggregate a public dataset in just a few lines: +The following example demonstrates how to perform a distributed aggregation on a public dataset with millions of rows using just a few lines of Python: .. code-block:: python import bigframes.pandas as bpd - # Load data from BigQuery + # Initialize BigFrames and load a public dataset df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") - # Perform familiar pandas operations at scale + # Perform familiar pandas operations that execute in the cloud top_names = ( df.groupby("name") .agg({"number": "sum"}) @@ -48,32 +54,28 @@ Load and aggregate a public dataset in just a few lines: .head(10) ) + # Bring the final, aggregated results back to local memory if needed print(top_names.to_pandas()) -User Guide ----------- +Explore the Documentation +------------------------- .. toctree:: :maxdepth: 2 + :caption: User Documentation user_guide/index -API reference -------------- - .. toctree:: - :maxdepth: 3 + :maxdepth: 2 + :caption: API Reference reference/index supported_pandas_apis -Changelog ---------- - -For a list of all BigQuery DataFrames releases: - .. toctree:: - :maxdepth: 2 + :maxdepth: 1 + :caption: Community & Updates changelog From 66e21ecf46852fd57a47fa469c0e9e27489d328c Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 17 Mar 2026 09:54:03 -0700 Subject: [PATCH 02/25] test: Fix ingress settings for vpc tests to not use "all" (#2519) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- .../large/functions/test_remote_function.py | 41 +++++++------------ 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 114b600d9d..a631d97a2e 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -1230,7 +1230,7 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_custom_sa(scalars_dfs): +def test_remote_function_via_session_custom_sa(scalars_pandas_df_index): # TODO(shobs): Automate the following set-up during testing in the test project. # # For upfront convenience, the following set up has been statically created @@ -1249,14 +1249,13 @@ def test_remote_function_via_session_custom_sa(scalars_dfs): rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) try: - # TODO(shobs): Figure out why the default ingress setting - # (internal-only) does not work here + @rf_session.remote_function( input_types=[int], output_type=int, reuse=False, cloud_function_service_account=gcf_service_account, - cloud_function_ingress_settings="all", + cloud_function_ingress_settings="internal-and-gclb", ) def double_num(x): if x is None: @@ -1270,13 +1269,12 @@ def double_num(x): assert gcf.service_config.service_account_email == gcf_service_account # assert that the function works as expected on data - scalars_df, scalars_pandas_df = scalars_dfs - bf_int64_col = scalars_df["int64_col"] + bf_int64_col = rf_session.read_pandas(scalars_pandas_df_index.int64_col) bf_result_col = bf_int64_col.apply(double_num) bf_result = 
bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() - pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col = scalars_pandas_df_index.int64_col pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x + x) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) @@ -1303,7 +1301,7 @@ def double_num(x): ) @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_via_session_custom_build_sa( - scalars_dfs, set_build_service_account + set_build_service_account, scalars_pandas_df_index ): # TODO(shobs): Automate the following set-up during testing in the test project. # @@ -1321,15 +1319,14 @@ def test_remote_function_via_session_custom_build_sa( rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) try: - # TODO(shobs): Figure out why the default ingress setting - # (internal-only) does not work here + @rf_session.remote_function( input_types=[int], output_type=int, reuse=False, cloud_function_service_account="default", cloud_build_service_account=set_build_service_account, - cloud_function_ingress_settings="all", + cloud_function_ingress_settings="internal-and-gclb", ) def double_num(x): if x is None: @@ -1342,14 +1339,11 @@ def double_num(x): ) assert gcf.build_config.service_account == expected_build_service_account - # assert that the function works as expected on data - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] + bf_int64_col = rf_session.read_pandas(scalars_pandas_df_index.int64_col) bf_result_col = bf_int64_col.apply(double_num) bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() - pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col = scalars_pandas_df_index.int64_col pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x + x) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) @@ -1436,7 +1430,7 @@ def square_num(x): @pytest.mark.flaky(retries=2, delay=120) -def 
test_remote_function_via_session_vpc(scalars_dfs): +def test_remote_function_via_session_vpc(scalars_pandas_df_index): # TODO(shobs): Automate the following set-up during testing in the test project. # # For upfront convenience, the following set up has been statically created @@ -1466,8 +1460,6 @@ def double_num(x): return x return x + x - # TODO(shobs): See if the test vpc can be configured to make this flow - # work with the default ingress setting (internal-only) double_num_remote = rf_session.remote_function( input_types=[int], output_type=int, @@ -1475,7 +1467,7 @@ def double_num(x): cloud_function_service_account="default", cloud_function_vpc_connector=gcf_vpc_connector, cloud_function_vpc_connector_egress_settings="all", - cloud_function_ingress_settings="all", + cloud_function_ingress_settings="internal-and-gclb", )(double_num) gcf = rf_session.cloudfunctionsclient.get_function( @@ -1489,15 +1481,12 @@ def double_num(x): # cloud_function_vpc_connector_egress_settings="all" earlier. 
assert gcf.service_config.vpc_connector_egress_settings == 2 - # assert that the function works as expected on data - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] + bf_int64_col = rf_session.read_pandas(scalars_pandas_df_index.int64_col) bf_result_col = bf_int64_col.apply(double_num_remote) bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() - pd_int64_col = scalars_pandas_df["int64_col"] - pd_result_col = pd_int64_col.apply(double_num).astype("Int64") + pd_int64_col = scalars_pandas_df_index.int64_col + pd_result_col = pd_int64_col.apply(double_num) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_frame_equal(bf_result, pd_result, check_dtype=False) From edceb3511ac68f1137941c8f7bebca1aaa0c36ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 17 Mar 2026 17:27:09 +0000 Subject: [PATCH 03/25] chore: Remove unused Ibis Python to JS UDF compiler (#2521) The Python to JavaScript compiler previously vendored from ibis and located at `third_party/bigframes_vendored/ibis/backends/bigquery/udf` was unused in the production codebase and not supported by the vendored backend. This PR removes the `udf/` directory and any references to `PythonToJavaScriptTranslator` and `_get_udf_source` from `third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py`. 
--- *PR created automatically by Jules for task [17001767655361890847](https://jules.google.com/task/17001767655361890847) started by @tswast* Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: tswast <247555+tswast@users.noreply.github.com> --- .../ibis/backends/bigquery/__init__.py | 75 +-- .../ibis/backends/bigquery/udf/__init__.py | 0 .../ibis/backends/bigquery/udf/core.py | 604 ------------------ .../ibis/backends/bigquery/udf/find.py | 64 -- .../ibis/backends/bigquery/udf/rewrite.py | 54 -- 5 files changed, 1 insertion(+), 796 deletions(-) delete mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/udf/__init__.py delete mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/udf/core.py delete mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/udf/find.py delete mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/udf/rewrite.py diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index b342c7e4a9..1d3f63d216 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -21,9 +21,6 @@ schema_from_bigquery_table, ) from bigframes_vendored.ibis.backends.bigquery.datatypes import BigQuerySchema -from bigframes_vendored.ibis.backends.bigquery.udf.core import ( - PythonToJavaScriptTranslator, -) from bigframes_vendored.ibis.backends.sql import SQLBackend from bigframes_vendored.ibis.backends.sql.compilers import BigQueryCompiler from bigframes_vendored.ibis.backends.sql.datatypes import BigQueryType @@ -731,15 +728,7 @@ def compile( ): """Compile an Ibis expression to a SQL string.""" query = self._to_sqlglot(expr, limit=limit, params=params, **kwargs) - udf_sources = [] - for udf_node in expr.op().find(ops.ScalarUDF): - compile_func = getattr( - self, 
f"_compile_{udf_node.__input_type__.name.lower()}_udf" - ) - if sql := compile_func(udf_node): - udf_sources.append(sql.sql(self.name, pretty=True)) - - sql = ";\n".join([*udf_sources, query.sql(dialect=self.name, pretty=True)]) + sql = query.sql(dialect=self.name, pretty=True) self._log(sql) return sql @@ -1186,68 +1175,6 @@ def _clean_up_cached_table(self, name): force=True, ) - def _get_udf_source(self, udf_node: ops.ScalarUDF): - name = type(udf_node).__name__ - type_mapper = self.compiler.udf_type_mapper - - body = PythonToJavaScriptTranslator(udf_node.__func__).compile() - config = udf_node.__config__ - libraries = config.get("libraries", []) - - signature = [ - sge.ColumnDef( - this=sg.to_identifier(name, quoted=self.compiler.quoted), - kind=type_mapper.from_ibis(param.annotation.pattern.dtype), - ) - for name, param in udf_node.__signature__.parameters.items() - ] - - lines = ['"""'] - - if config.get("strict", True): - lines.append('"use strict";') - - lines += [ - body, - "", - f"return {udf_node.__func_name__}({', '.join(udf_node.argnames)});", - '"""', - ] - - func = sge.Create( - kind="FUNCTION", - this=sge.UserDefinedFunction( - this=sg.to_identifier(name), expressions=signature, wrapped=True - ), - # not exactly what I had in mind, but it works - # - # quoting is too simplistic to handle multiline strings - expression=sge.Var(this="\n".join(lines)), - exists=False, - properties=sge.Properties( - expressions=[ - sge.TemporaryProperty(), - sge.ReturnsProperty(this=type_mapper.from_ibis(udf_node.dtype)), - sge.StabilityProperty( - this="IMMUTABLE" if config.get("determinism") else "VOLATILE" - ), - sge.LanguageProperty(this=sg.to_identifier("js")), - ] - + [ - sge.Property( - this=sg.to_identifier("library"), - value=self.compiler.f.array(*libraries), - ) - ] - * bool(libraries) - ), - ) - - return func - - def _compile_python_udf(self, udf_node: ops.ScalarUDF) -> None: - return self._get_udf_source(udf_node) - def _register_udfs(self, expr: ir.Expr) -> 
None: """No op because UDFs made with CREATE TEMPORARY FUNCTION must be followed by a query.""" diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/udf/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/core.py b/third_party/bigframes_vendored/ibis/backends/bigquery/udf/core.py deleted file mode 100644 index 6f59a2becd..0000000000 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/core.py +++ /dev/null @@ -1,604 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/backends/bigquery/udf/core.py - -"""Translate a Python AST to JavaScript.""" - -from __future__ import annotations - -import ast -from collections import ChainMap -import contextlib -import functools -import inspect -import textwrap -from typing import TYPE_CHECKING - -from bigframes_vendored.ibis.backends.bigquery.udf.find import find_names -from bigframes_vendored.ibis.backends.bigquery.udf.rewrite import rewrite - -if TYPE_CHECKING: - from collections.abc import Callable - - -class SymbolTable(ChainMap): - """ChainMap subclass implementing scope for the translator. - - Notes - ----- - JavaScript requires declarations in strict mode, so to implement this we - shove a "let" at the beginning of every variable name if it doesn't already - exist in the current scope. - - """ - - def __getitem__(self, key): - if key not in self: - self[key] = key - return f"let {key}" - return key - - -def indent(lines, spaces=4): - """Indent `lines` by `spaces` spaces. 
- - Parameters - ---------- - lines : Union[str, List[str]] - A string or list of strings to indent - spaces : int - The number of spaces to indent `lines` - - Returns - ------- - indented_lines : str - - """ - if isinstance(lines, str): - text = [lines] - text = "\n".join(lines) - return textwrap.indent(text, " " * spaces) - - -def semicolon(f: Callable) -> Callable: - """Add a semicolon to the result of a `visit_*` call.""" - - @functools.wraps(f) - def wrapper(*args, **kwargs): - return f(*args, **kwargs) + ";" - - return wrapper - - -@rewrite.register(ast.Call(func=ast.Name(id="print"))) -def rewrite_print(node): - return ast.Call( - func=ast.Attribute( - value=ast.Name(id="console", ctx=ast.Load()), - attr="log", - ctx=ast.Load(), - ), - args=node.args, - keywords=node.keywords, - ) - - -@rewrite.register(ast.Call(func=ast.Name(id="len"))) -def rewrite_len(node): - assert len(node.args) == 1 - return ast.Attribute(value=node.args[0], attr="length", ctx=ast.Load()) - - -@rewrite.register(ast.Call(func=ast.Attribute(attr="append"))) -def rewrite_append(node): - return ast.Call( - func=ast.Attribute(value=node.func.value, attr="push", ctx=ast.Load()), - args=node.args, - keywords=node.keywords, - ) - - -@rewrite.register( - ast.Call(func=ast.Attribute(value=ast.Name(id="Array"), attr="from_")) -) -def rewrite_array_from(node): - return ast.Call( - func=ast.Attribute(value=node.func.value, attr="from"), - args=node.args, - keywords=node.keywords, - ) - - -class PythonToJavaScriptTranslator: - constructor_map = { - "list": "Array", - "Array": "Array", - "Date": "Date", - "dict": "Object", - "Map": "Map", - "WeakMap": "WeakMap", - "str": "String", - "String": "String", - "set": "Set", - "Set": "Set", - "WeakSet": "WeakSet", - } - - def __init__(self, function): - self.function = function - self.source = textwrap.dedent(inspect.getsource(function)) - self.ast = ast.parse(self.source) - self.scope = SymbolTable() - self.current_function = None - self.current_class = 
None - self.is_generator = False - self.is_nested_definition = False - - def compile(self): - return self.visit(self.ast) - - def visit(self, node): - node = rewrite(node) - typename = node.__class__.__name__ - method_name = f"visit_{typename}" - method = getattr(self, method_name, None) - if method is None: - raise NotImplementedError(f"{method_name!r} nodes not yet implemented") - assert callable(method) - - result = method(node) - return result - - def visit_Name(self, node): - if self.current_class is not None and node.id == "self": - return "this" - return node.id - - def visit_Yield(self, node): - self.is_generator = True - return f"yield {self.visit(node.value)}" - - def visit_YieldFrom(self, node): - self.is_generator = True - return f"yield* {self.visit(node.value)}" - - @semicolon - def visit_Assign(self, node): - try: - (target,) = node.targets - except ValueError: - raise NotImplementedError("Only single assignment supported for now") - - if not isinstance(target, (ast.Name, ast.Subscript, ast.Attribute)): - raise NotImplementedError( - "Only index, attribute, and variable name assignment " - f"supported, got {type(target).__name__}" - ) - - is_name = isinstance(target, ast.Name) - compiled_target = self.visit(target) - if not is_name or ( - self.current_class is not None and compiled_target.startswith("this.") - ): - self.scope[compiled_target] = compiled_target - return f"{self.scope[compiled_target]} = {self.visit(node.value)}" - - def translate_special_method(self, name): - return {"__init__": "constructor"}.get(name, name) - - def visit_FunctionDef(self, node): - self.current_function = node - - is_property_getter = any( - getattr(dec, "id", None) == "property" for dec in node.decorator_list - ) - - if self.current_class is None: # not a method - if is_property_getter: - raise TypeError("Functions cannot be properties, only methods can") - prefix = "function" - else: - if is_property_getter and self.is_generator: - raise TypeError("generator 
methods cannot be properties") - prefix = "get " * is_property_getter - - with self.local_scope(): - body = indent(map(self.visit, node.body)) - - if self.is_generator: - prefix += "* " - else: - prefix += " " * (self.current_class is None) - - lines = [ - prefix - + self.translate_special_method(node.name) - + f"({self.visit(node.args)}) {{", - body, - "}", - ] - - self.current_function = None - self.is_generator = False - return "\n".join(lines) - - @semicolon - def visit_Return(self, node): - return f"return {self.visit(node.value)}" - - def visit_Add(self, node): - return "+" - - def visit_Sub(self, node): - return "-" - - def visit_Mult(self, node): - return "*" - - def visit_Div(self, node): - return "/" - - def visit_FloorDiv(self, node): - raise AssertionError("should never reach FloorDiv") - - def visit_Pow(self, node): - raise AssertionError("should never reach Pow") - - def visit_UnaryOp(self, node): - return f"({self.visit(node.op)}{self.visit(node.operand)})" - - def visit_USub(self, node): - return "-" - - def visit_UAdd(self, node): - return "+" - - def visit_BinOp(self, node): - left, op, right = node.left, node.op, node.right - - if isinstance(op, ast.Pow): - return f"Math.pow({self.visit(left)}, {self.visit(right)})" - elif isinstance(op, ast.FloorDiv): - return f"Math.floor({self.visit(left)} / {self.visit(right)})" - return f"({self.visit(left)} {self.visit(op)} {self.visit(right)})" - - def visit_Constant(self, node): - value = node.value - if value is None: - return "null" - if isinstance(value, bool): - return "true" if value else "false" - if isinstance(value, (int, float, str)): - return repr(value) - raise NotImplementedError( - f"{value.__class__.__name__!r} constants not yet implemented" - ) - - def visit_NameConstant(self, node): - value = node.value - if value is True: - return "true" - elif value is False: - return "false" - assert ( - value is None - ), f"value is not True and is not False, must be None, got {value}" - return "null" 
- - def visit_Str(self, node): - return repr(node.s) - - def visit_Num(self, node): - return repr(node.n) - - def visit_List(self, node): - return "[{}]".format(", ".join(map(self.visit, node.elts))) - - def visit_Tuple(self, node): - # tuples becomes lists in javascript - return "[{}]".format(", ".join(map(self.visit, node.elts))) - - def visit_Dict(self, node): - return "{{{}}}".format( - ", ".join( - f"[{self.visit(key)}]: {self.visit(value)}" - for key, value in zip(node.keys, node.values) - ) - ) - - @semicolon - def visit_Expr(self, node): - return self.visit(node.value) - - def visit_Starred(self, node): - return f"...{self.visit(node.value)}" - - def visit_Call(self, node): - thing_to_call = self.visit(node.func) - constructors = self.__class__.constructor_map - args = ", ".join(map(self.visit, node.args)) - try: - thing_to_call = constructors[thing_to_call] - except KeyError: - format_string = "{}({})" - else: - format_string = "(new {}({}))" - return format_string.format(thing_to_call, args) - - def visit_Attribute(self, node): - return f"{self.visit(node.value)}.{node.attr}" - - def visit_For(self, node): - lines = [f"for (let {self.visit(node.target)} of {self.visit(node.iter)}) {{"] - with self.local_scope(): - lines.append(indent(map(self.visit, node.body))) - lines.append("}") - return "\n".join(lines) - - def visit_While(self, node): - lines = [f"while ({self.visit(node.test)}) {{"] - with self.local_scope(): - lines.append(indent(map(self.visit, node.body))) - lines.append("}") - return "\n".join(lines) - - @semicolon - def visit_Break(self, node): - return "break" - - @semicolon - def visit_Continue(self, node): - return "continue" - - def visit_Eq(self, node): - return "===" - - def visit_NotEq(self, node): - return "!==" - - def visit_Or(self, node): - return "||" - - def visit_And(self, node): - return "&&" - - def visit_BoolOp(self, node): - return "({})".format( - f" {self.visit(node.op)} ".join(map(self.visit, node.values)) - ) - - def 
visit_Lt(self, node): - return "<" - - def visit_LtE(self, node): - return "<=" - - def visit_Gt(self, node): - return ">" - - def visit_GtE(self, node): - return ">=" - - def visit_Compare(self, node): - rights = node.comparators - ops = node.ops - - left = node.left - comparisons = [] - for op, right in zip(ops, rights): - comparisons.append( - f"({self.visit(left)} {self.visit(op)} {self.visit(right)})" - ) - left = right - return " && ".join(comparisons) - - @semicolon - def visit_AugAssign(self, node): - target = self.visit(node.target) - op = self.visit(node.op) - value = self.visit(node.value) - return f"{target} {op}= {value}" - - def visit_Module(self, node): - return "\n\n".join(map(self.visit, node.body)) - - def visit_arg(self, node): - if self.current_class is not None and node.arg == "self": - return "" - return node.arg - - def visit_arguments(self, node): - args = list(filter(None, map(self.visit, node.args[:]))) - vararg = node.vararg - if vararg is not None: - args.append(f"...{vararg.arg}") - return ", ".join(args) - - def visit_Lambda(self, node): - args = node.args - generated_args = self.visit(args) - return f"(({generated_args}) => {self.visit(node.body)})" - - @contextlib.contextmanager - def local_scope(self): - """Assign symbols to local variables.""" - self.scope = self.scope.new_child() - try: - yield self.scope - finally: - self.scope = self.scope.parents - - def visit_If(self, node): - lines = [f"if ({self.visit(node.test)}) {{"] - - with self.local_scope(): - lines.append(indent(map(self.visit, node.body))) - lines.append("}") - - if node.orelse: - lines[-1] += " else {" - with self.local_scope(): - lines.append(indent(map(self.visit, node.orelse))) - lines.append("}") - return "\n".join(lines) - - def visit_IfExp(self, node): - test = self.visit(node.test) - body = self.visit(node.body) - orelse = self.visit(node.orelse) - return f"({test} ? 
{body} : {orelse})" - - def visit_Index(self, node): - return self.visit(node.value) - - def visit_Subscript(self, node): - return f"{self.visit(node.value)}[{self.visit(node.slice)}]" - - def visit_ClassDef(self, node): - self.current_class = node - bases = node.bases - - lines = [f"class {node.name}"] - if bases: - lines[-1] += " extends {}".format(", ".join(map(self.visit, bases))) - lines[-1] += " {" - lines.append(indent(map(self.visit, node.body))) - lines.append("}") - self.current_class = None - self.__class__.constructor_map[node.name] = node.name - return "\n".join(lines) - - def visit_Not(self, node): - return "!" - - def visit_ListComp(self, node): - """Generate a curried lambda function. - - [x + y for x, y in [[1, 4], [2, 5], [3, 6]]] - - becomes - - [[1, 4], [2, 5], [3, 6]]].map(([x, y]) => x + y) - """ - try: - (generator,) = node.generators - except ValueError: - raise NotImplementedError("Only single loop comprehensions are allowed") - - names = find_names(generator.target) - argslist = [ast.arg(arg=name.id, annotation=None) for name in names] - if len(names) <= 1: - signature = ast.arguments( - args=argslist, - vararg=None, - kwonlyargs=[], - kw_defaults=[], - kwarg=None, - defaults=[], - ) - else: - signature = ast.List(elts=argslist, ctx=ast.Load()) - - array = generator.iter - lam_sig = functools.partial(ast.Lambda, args=signature) - - filters = generator.ifs - if filters: - filt = ast.BoolOp(op=ast.And(), values=filters) - # array.filter - method = ast.Attribute(value=array, attr="filter", ctx=ast.Load()) - # array.filter(func) - array = ast.Call(func=method, args=[lam_sig(body=filt)], keywords=[]) - - method = ast.Attribute(value=array, attr="map", ctx=ast.Load()) - mapped = ast.Call(func=method, args=[lam_sig(body=node.elt)], keywords=[]) - result = self.visit(mapped) - return result - - def visit_Delete(self, node): - return "\n".join(f"delete {self.visit(target)};" for target in node.targets) - - -if __name__ == "__main__": - import 
bigframes_vendored.ibis - from bigframes_vendored.ibis import udf - - @udf.scalar.python(strict=False) - def my_func(a: float, b: float, n: float) -> list[float]: - class Rectangle: - def __init__(self, width, height): - self.width = width - self.height = height - - @property - def area(self): - return self.width * self.height - - @property - def perimeter(self): - return self.width * 2 + self.height * 2 - - def foobar(self, n): - yield from range(n) - - def sum(values): - result = 0 - for value in values: - result += value - console.log(result) # noqa: F821 - return values.reduce(lambda a, b: a + b, 0) - - def range(n): - i = 0 - while i < n: - yield i - i += 1 - - some_stuff = [x + y for x, y in [[1, 4], [2, 5], [3, 6]] if 2 < x < 3] - some_stuff1 = [range(x) for x in [1, 2, 3]] - some_stuff2 = [x + y for x, y in [(1, 4), (2, 5), (3, 6)]] - print(some_stuff) # noqa: T201 - print(some_stuff1) # noqa: T201 - print(some_stuff2) # noqa: T201 - - x = 1 - y = 2 - x = 3 - values = [] - for i in range(10): - values.append(i) - - i = 0 - foo = 2 - bar = lambda x: x # noqa: E731 - bazel = lambda x: y # noqa: E731 - while i < n: - foo = bar(bazel(10)) - i += 1 - console.log(i) # noqa: F821 - - foo = 2 - - if i == 10 and (y < 2 or i != 42): - y += 2 - else: - y -= 2 - - z = 42.0 - w = 3 - w = not False - yyz = None - print(yyz) # noqa: T201 - foobar = x < y < z < w # x < y and y < z and z < w - foobar = 1 - baz = foobar // 3 - console.log(baz) # noqa: F821 - - my_obj = {"a": 1, "b": 2} # noqa: F841 - - z = (x if y else b) + 2 + foobar - foo = Rectangle(1, 2) - nnn = len(values) - return [sum(values) - a + b * y**-x, z, foo.width, nnn] - - print( - bigframes_vendored.ibis.bigquery.compile(my_func(42.7, 13.2, 1)) - ) # noqa: T201 diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/find.py b/third_party/bigframes_vendored/ibis/backends/bigquery/udf/find.py deleted file mode 100644 index b1f353ae4f..0000000000 --- 
a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/find.py +++ /dev/null @@ -1,64 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/backends/bigquery/udf/find.py - -from __future__ import annotations - -import ast - -import toolz - - -class NameFinder: - """Helper class to find the unique names in an AST.""" - - __slots__ = () - - def find(self, node): - typename = type(node).__name__ - method = getattr(self, f"find_{typename}", None) - if method is None: - fields = getattr(node, "_fields", None) - if fields is None: - return - for field in fields: - value = getattr(node, field) - yield from self.find(value) - else: - yield from method(node) - - def find_Name(self, node): - # TODO not sure if this is robust to scope changes - yield node - - def find_list(self, node): - return list(toolz.concat(map(self.find, node))) - - def find_Call(self, node): - if not isinstance(node.func, ast.Name): - fields = node._fields - else: - fields = [field for field in node._fields if field != "func"] - return toolz.concat(map(self.find, (getattr(node, field) for field in fields))) - - -def find_names(node: ast.AST) -> list[ast.Name]: - """Return the unique `ast.Name` instances in an AST. 
- - Examples - -------- - >>> import ast - >>> node = ast.parse("a + b") - >>> names = find_names(node) - >>> names - [<....Name object at 0x...>, <....Name object at 0x...>] - >>> names[0].id - 'a' - >>> names[1].id - 'b' - - """ - return list( - toolz.unique( - filter(None, NameFinder().find(node)), - key=lambda node: (node.id, type(node.ctx)), - ) - ) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/rewrite.py b/third_party/bigframes_vendored/ibis/backends/bigquery/udf/rewrite.py deleted file mode 100644 index 6d2b0df7cd..0000000000 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/udf/rewrite.py +++ /dev/null @@ -1,54 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/backends/bigquery/udf/rewrite.py - -from __future__ import annotations - -import ast -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Callable - - -def matches(value: ast.AST, pattern: ast.AST) -> bool: - """Check whether `value` matches `pattern`.""" - # types must match exactly - if type(value) is not type(pattern): - return False - - # primitive value, such as None, True, False etc - if not isinstance(value, ast.AST) and not isinstance(pattern, ast.AST): - return value == pattern - - fields = [ - (field, getattr(pattern, field)) - for field in pattern._fields - if hasattr(pattern, field) - ] - return all( - matches(getattr(value, field_name), field_value) - for field_name, field_value in fields - ) - - -class Rewriter: - """AST pattern matcher to enable rewrite rules.""" - - def __init__(self): - self.funcs: list[tuple[ast.AST, Callable[[ast.expr], ast.expr]]] = [] - - def register(self, pattern): - def wrapper(f): - self.funcs.append((pattern, f)) - return f - - return wrapper - - def __call__(self, node): - # TODO: more efficient way of doing this? 
- for pattern, func in self.funcs: - if matches(node, pattern): - return func(node) - return node - - -rewrite = Rewriter() From 1126cec9cdfcc1ec1062c60e5affbe1b60223767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 17 Mar 2026 19:05:35 +0000 Subject: [PATCH 04/25] feat: add `df.bigquery.ai.forecast` method to pandas dataframe accessor (#2518) Adds the `.bigquery.ai.forecast()` method to pandas DataFrame objects, which proxies to `bigframes.bigquery.ai.forecast()`. Added unit tests and mocked session responses. --- *PR created automatically by Jules for task [14604090974587392182](https://jules.google.com/task/14604090974587392182) started by @tswast* --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: tswast <247555+tswast@users.noreply.github.com> --- bigframes/bigquery/_operations/ai.py | 11 +++ .../extensions/pandas/dataframe_accessor.py | 88 ++++++++++++++++++- .../sqlglot/test_dataframe_accessor.py | 39 ++++++++ 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index e578f4be4a..055a5cda79 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -880,6 +880,7 @@ def forecast( id_cols: Iterable[str] | None = None, horizon: int = 10, confidence_level: float = 0.95, + output_historical_time_series: bool = False, context_window: int | None = None, ) -> dataframe.DataFrame: """ @@ -914,6 +915,15 @@ def forecast( confidence_level (float, default 0.95): A FLOAT64 value that specifies the percentage of the future values that fall in the prediction interval. The default value is 0.95. The valid input range is [0, 1). + output_historical_time_series (bool, default False): + A BOOL value that determines whether the input data is returned + along with the forecasted data. Set this argument to TRUE to return + input data. 
The default value is FALSE. + + Returning the input data along with the forecasted data lets you + compare the historical value of the data column with the forecasted + value of the data column, or chart the change in the data column + values over time. context_window (int, optional): An int value that specifies the context window length used by BigQuery ML's built-in TimesFM model. The context window length determines how many of the most recent data points from the input time series are use by the model. @@ -945,6 +955,7 @@ def forecast( "timestamp_col": timestamp_col, "model": model, "horizon": horizon, + "output_historical_time_series": output_historical_time_series, "confidence_level": confidence_level, } if id_cols: diff --git a/bigframes/extensions/pandas/dataframe_accessor.py b/bigframes/extensions/pandas/dataframe_accessor.py index 2cb44fe3c5..aab9f35bd9 100644 --- a/bigframes/extensions/pandas/dataframe_accessor.py +++ b/bigframes/extensions/pandas/dataframe_accessor.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import cast +from typing import cast, Iterable, Optional import pandas import pandas.api.extensions @@ -21,6 +21,85 @@ import bigframes.pandas as bpd +class AIAccessor: + """ + Pandas DataFrame accessor for BigQuery AI functions. + """ + + def __init__(self, pandas_obj: pandas.DataFrame): + self._obj = pandas_obj + + def forecast( + self, + *, + data_col: str, + timestamp_col: str, + model: str = "TimesFM 2.0", + id_cols: Optional[Iterable[str]] = None, + horizon: int = 10, + confidence_level: float = 0.95, + context_window: Optional[int] = None, + output_historical_time_series: bool = False, + session=None, + ) -> pandas.DataFrame: + """ + Forecast time series at future horizon using BigQuery AI.FORECAST. 
+ + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-forecast + + Args: + data_col (str): + A str value that specifies the name of the data column. The data column contains the data to forecast. + The data column must use one of the following data types: INT64, NUMERIC and FLOAT64 + timestamp_col (str): + A str value that specified the name of the time points column. + The time points column provides the time points used to generate the forecast. + The time points column must use one of the following data types: TIMESTAMP, DATE and DATETIME + model (str, default "TimesFM 2.0"): + A str value that specifies the name of the model. "TimesFM 2.0" and "TimesFM 2.5" are supported. + id_cols (Iterable[str], optional): + An iterable of str value that specifies the names of one or more ID columns. Each ID identifies a unique time series to forecast. + Specify one or more values for this argument in order to forecast multiple time series using a single query. + The columns that you specify must use one of the following data types: STRING, INT64, ARRAY and ARRAY + horizon (int, default 10): + An int value that specifies the number of time points to forecast. The default value is 10. The valid input range is [1, 10,000]. + confidence_level (float, default 0.95): + A FLOAT64 value that specifies the percentage of the future values that fall in the prediction interval. + The default value is 0.95. The valid input range is [0, 1). + context_window (int, optional): + An int value that specifies the context window length used by BigQuery ML's built-in TimesFM model. + The context window length determines how many of the most recent data points from the input time series are use by the model. + If you don't specify a value, the AI.FORECAST function automatically chooses the smallest possible context window length to use + that is still large enough to cover the number of time series data points in your input data. 
+ output_historical_time_series (bool, default False): + A boolean value that determines whether to include the input time series history in the forecast. + session (bigframes.session.Session, optional): + The BigFrames session to use. If not provided, the default global session is used. + + Returns: + pandas.DataFrame: + The forecast DataFrame result. + """ + import bigframes.bigquery.ai + + if session is None: + session = bf_session.get_global_session() + + bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj)) + result = bigframes.bigquery.ai.forecast( + bf_df, + data_col=data_col, + timestamp_col=timestamp_col, + model=model, + id_cols=id_cols, + horizon=horizon, + confidence_level=confidence_level, + context_window=context_window, + output_historical_time_series=output_historical_time_series, + ) + return result.to_pandas(ordered=True) + + @pandas.api.extensions.register_dataframe_accessor("bigquery") class BigQueryDataFrameAccessor: """ @@ -32,6 +111,13 @@ class BigQueryDataFrameAccessor: def __init__(self, pandas_obj: pandas.DataFrame): self._obj = pandas_obj + @property + def ai(self) -> "AIAccessor": + """ + Accessor for BigQuery AI functions. + """ + return AIAccessor(self._obj) + def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None): """ Compute a new pandas Series by applying a SQL scalar function to the DataFrame. 
diff --git a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py index 327b8e4206..364f738353 100644 --- a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py +++ b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py @@ -43,3 +43,42 @@ def to_pandas(series, *, ordered): session.read_pandas.assert_called_once() snapshot.assert_match(result, "out.sql") + + +def test_ai_forecast(snapshot, monkeypatch): + import bigframes.bigquery.ai + import bigframes.session + + session = mock.create_autospec(bigframes.session.Session) + bf_df = mock.create_autospec(bpd.DataFrame) + session.read_pandas.return_value = bf_df + + def mock_ai_forecast(df, **kwargs): + assert df is bf_df + result_df = mock.create_autospec(bpd.DataFrame) + result_df.to_pandas.return_value = kwargs + return result_df + + import bigframes.bigquery.ai + + monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_ai_forecast) + + df = pd.DataFrame({"date": ["2020-01-01"], "value": [1.0]}) + result = df.bigquery.ai.forecast( + timestamp_col="date", + data_col="value", + horizon=5, + session=session, + ) + + session.read_pandas.assert_called_once() + assert result == { + "timestamp_col": "date", + "data_col": "value", + "model": "TimesFM 2.0", + "id_cols": None, + "horizon": 5, + "confidence_level": 0.95, + "context_window": None, + "output_historical_time_series": False, + } From 3ddd7ebafbb2708a5eed9d1aebc8533531b215f6 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 18 Mar 2026 08:42:12 -0700 Subject: [PATCH 05/25] test: Update ARIMA PLUS score tests to ignore future BQML column additions (#2522) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_arima_plus_score and test_arima_plus_score_series system tests failed on absolute .shape matches (AssertionError: DataFrame shape mismatch). 
BigQuery ML recently expanded the ML.EVALUATE result shape for ARIMA_PLUS by appending a mean_absolute_scaled_error column. We modified assert_frame_equal to compare result[expected.columns] instead of comparing whole frames directly. This change guarantees that standard columns like mean_absolute_error and mean_squared_error are still verified. Furthermore, the test is robust to column drifts as BigQuery appends backward-compatible features over time. Fixes #<493635556> 🦕 --- tests/system/large/ml/test_forecasting.py | 1 + tests/system/large/ml/test_llm.py | 2 ++ tests/system/small/ml/test_forecasting.py | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index 72a0ee469b..8500ad9d5f 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -88,6 +88,7 @@ def test_arima_plus_model_fit_score( result, columns=expected_columns, index=2 if id_col_name else 1, + col_exact=False, ) # save, load to ensure configuration was kept diff --git a/tests/system/large/ml/test_llm.py b/tests/system/large/ml/test_llm.py index 6e2695b1b5..6e51d14a3a 100644 --- a/tests/system/large/ml/test_llm.py +++ b/tests/system/large/ml/test_llm.py @@ -198,6 +198,7 @@ def test_llm_gemini_score(llm_fine_tune_df_default_index, model_name): "evaluation_status", ], index=1, + col_exact=False, ) @@ -226,6 +227,7 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name) "label", "evaluation_status", ], + col_exact=False, ) diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index 134f82e96e..23487983ee 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -493,7 +493,7 @@ def test_arima_plus_score( dtype="Float64", ) pd.testing.assert_frame_equal( - result, + result[expected.columns], expected, rtol=0.1, check_index_type=False, 
@@ -594,7 +594,7 @@ def test_arima_plus_score_series( dtype="Float64", ) pd.testing.assert_frame_equal( - result, + result[expected.columns], expected, rtol=0.1, check_index_type=False, From 494a0a113b1ba6dcdc9f9b85a4f750d093f5652f Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 18 Mar 2026 13:16:32 -0700 Subject: [PATCH 06/25] feat: support full round-trip persistence for multimodal reference cols (#2511) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Saves multimodal metadata descriptions on .to_gbq(). Fixes #<452681068> 🦕 --- bigframes/dtypes.py | 22 +++++++++++++++++++++- bigframes/session/bq_caching_executor.py | 9 +++++---- tests/system/small/test_dataframe_io.py | 22 ++++++++++++++++++++++ tests/unit/test_dtypes.py | 8 ++++++++ 4 files changed, 56 insertions(+), 5 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 304428ef2f..6601fe5ae5 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -772,6 +772,13 @@ def convert_schema_field( ) -> typing.Tuple[str, Dtype]: is_repeated = field.mode == "REPEATED" if field.field_type == "RECORD": + if field.description == OBJ_REF_DESCRIPTION_TAG: + bf_dtype = OBJ_REF_DTYPE # type: ignore + if is_repeated: + pa_type = pa.list_(bigframes_dtype_to_arrow_dtype(bf_dtype)) + bf_dtype = pd.ArrowDtype(pa_type) + return field.name, bf_dtype + mapped_fields = map(convert_schema_field, field.fields) fields = [] for name, dtype in mapped_fields: @@ -815,7 +822,11 @@ def convert_to_schema_field( ) inner_field = convert_to_schema_field(name, inner_type, overrides) return google.cloud.bigquery.SchemaField( - name, inner_field.field_type, mode="REPEATED", fields=inner_field.fields + name, + inner_field.field_type, + mode="REPEATED", + fields=inner_field.fields, + description=inner_field.description, ) if pa.types.is_struct(bigframes_dtype.pyarrow_dtype): inner_fields: list[google.cloud.bigquery.SchemaField] = [] @@ -827,6 +838,14 @@ def 
convert_to_schema_field( convert_to_schema_field(field.name, inner_bf_type, overrides) ) + if bigframes_dtype == OBJ_REF_DTYPE: + return google.cloud.bigquery.SchemaField( + name, + "RECORD", + fields=inner_fields, + description=OBJ_REF_DESCRIPTION_TAG, + ) + return google.cloud.bigquery.SchemaField( name, "RECORD", fields=inner_fields ) @@ -971,6 +990,7 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: TIMEDELTA_DESCRIPTION_TAG = "#microseconds" +OBJ_REF_DESCRIPTION_TAG = "bigframes_dtype: OBJ_REF_DTYPE" def contains_db_dtypes_json_arrow_type(type_): diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 1e240a841c..fbcdfd33f5 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -334,13 +334,14 @@ def _export_gbq( session=array_value.session, ) - has_timedelta_col = any( - t == bigframes.dtypes.TIMEDELTA_DTYPE for t in array_value.schema.dtypes + has_special_dtype_col = any( + t in (bigframes.dtypes.TIMEDELTA_DTYPE, bigframes.dtypes.OBJ_REF_DTYPE) + for t in array_value.schema.dtypes ) - if spec.if_exists != "append" and has_timedelta_col: + if spec.if_exists != "append" and has_special_dtype_col: # Only update schema if this is not modifying an existing table, and the - # new table contains timedelta columns. + # new table contains special columns (like timedelta or obj_ref). 
table = self.bqclient.get_table(spec.table) table.schema = array_value.schema.to_bigquery() self.bqclient.update_table(table, ["schema"]) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index fece679d06..3da3544cbb 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -1002,6 +1002,28 @@ def test_to_gbq_timedelta_tag_ignored_when_appending(bigquery_client, dataset_id assert table.schema[0].description is None +def test_to_gbq_obj_ref(session, dataset_id: str, bigquery_client): + destination_table = f"{dataset_id}.test_to_gbq_obj_ref" + sql = """ + SELECT + 'gs://cloud-samples-data/vision/ocr/sign.jpg' AS uri_col + """ + df = session.read_gbq(sql) + df["obj_ref_col"] = df["uri_col"].str.to_blob() + df = df.drop(columns=["uri_col"]) + + df.to_gbq(destination_table) + + table = bigquery_client.get_table(destination_table) + obj_ref_field = next(f for f in table.schema if f.name == "obj_ref_col") + assert obj_ref_field.field_type == "RECORD" + assert obj_ref_field.description == "bigframes_dtype: OBJ_REF_DTYPE" + + reloaded_df = session.read_gbq(destination_table) + assert reloaded_df["obj_ref_col"].dtype == dtypes.OBJ_REF_DTYPE + assert len(reloaded_df) == 1 + + @pytest.mark.parametrize( ("index"), [True, False], diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 0e600de964..bb2b57d409 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -71,3 +71,11 @@ def test_infer_literal_type_arrow_scalar(scalar, expected_dtype): ) def test_contains_db_dtypes_json_arrow_type(type_, expected): assert bigframes.dtypes.contains_db_dtypes_json_arrow_type(type_) == expected + + +def test_convert_to_schema_field_list_description(): + bf_dtype = bigframes.dtypes.OBJ_REF_DTYPE + list_bf_dtype = bigframes.dtypes.list_type(bf_dtype) + field = bigframes.dtypes.convert_to_schema_field("my_list", list_bf_dtype) + assert field.description == 
"bigframes_dtype: OBJ_REF_DTYPE" + assert field.mode == "REPEATED" From 0ebc73395e578b6da964270e9a61cbd133e0a8d7 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 19 Mar 2026 14:04:53 -0700 Subject: [PATCH 07/25] refactor: use sqlglot to build create_external_table ddl (#2523) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes internal issue 418025765 🦕 --- bigframes/bigquery/_operations/table.py | 20 +- .../core/compile/sqlglot/sql/__init__.py | 3 +- bigframes/core/compile/sqlglot/sql/ddl.py | 234 +++++++++++------- bigframes/core/sql/table.py | 68 ----- tests/unit/bigquery/test_table.py | 95 ------- .../test_create_external_table/out.sql | 7 + .../out.sql | 10 + .../out.sql | 6 + .../unit/core/compile/sqlglot/sql/test_ddl.py | 45 ++++ 9 files changed, 226 insertions(+), 262 deletions(-) delete mode 100644 bigframes/core/sql/table.py delete mode 100644 tests/unit/bigquery/test_table.py create mode 100644 tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table/out.sql create mode 100644 tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_all_options/out.sql create mode 100644 tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_if_not_exists/out.sql diff --git a/bigframes/bigquery/_operations/table.py b/bigframes/bigquery/_operations/table.py index c90f88dcd6..cad025412d 100644 --- a/bigframes/bigquery/_operations/table.py +++ b/bigframes/bigquery/_operations/table.py @@ -19,8 +19,8 @@ import google.cloud.bigquery import pandas as pd +import bigframes.core.compile.sqlglot.sql as sg_sql import bigframes.core.logging.log_adapter as log_adapter -import bigframes.core.sql.table import bigframes.session @@ -80,14 +80,16 @@ def create_external_table( """ import bigframes.pandas as bpd - sql = bigframes.core.sql.table.create_external_table_ddl( - table_name=table_name, - replace=replace, - if_not_exists=if_not_exists, - columns=columns, - 
partition_columns=partition_columns, - connection_name=connection_name, - options=options, + sql = sg_sql.to_sql( + sg_sql.create_external_table( + table_name=table_name, + replace=replace, + if_not_exists=if_not_exists, + columns=columns, + partition_columns=partition_columns, + connection_name=connection_name, + options=options, + ) ) if session is None: diff --git a/bigframes/core/compile/sqlglot/sql/__init__.py b/bigframes/core/compile/sqlglot/sql/__init__.py index 17c78ba379..047fb73d30 100644 --- a/bigframes/core/compile/sqlglot/sql/__init__.py +++ b/bigframes/core/compile/sqlglot/sql/__init__.py @@ -22,7 +22,7 @@ table, to_sql, ) -from bigframes.core.compile.sqlglot.sql.ddl import load_data +from bigframes.core.compile.sqlglot.sql.ddl import create_external_table, load_data from bigframes.core.compile.sqlglot.sql.dml import insert, replace __all__ = [ @@ -35,6 +35,7 @@ "table", "to_sql", # From ddl.py + "create_external_table", "load_data", # From dml.py "insert", diff --git a/bigframes/core/compile/sqlglot/sql/ddl.py b/bigframes/core/compile/sqlglot/sql/ddl.py index 911c63781b..5134368b89 100644 --- a/bigframes/core/compile/sqlglot/sql/ddl.py +++ b/bigframes/core/compile/sqlglot/sql/ddl.py @@ -22,51 +22,6 @@ from bigframes.core.compile.sqlglot.sql import base -def _loaddata_sql(self: sg.Generator, expression: sge.LoadData) -> str: - out = ["LOAD DATA"] - if expression.args.get("overwrite"): - out.append("OVERWRITE") - - out.append(f"INTO {self.sql(expression, 'this').strip()}") - - # We ignore inpath as it's just a dummy to satisfy sqlglot requirements - # but BigQuery uses FROM FILES instead. 
- - columns = self.sql(expression, "columns").strip() - if columns: - out.append(columns) - - partition_by = self.sql(expression, "partition_by").strip() - if partition_by: - out.append(partition_by) - - cluster_by = self.sql(expression, "cluster_by").strip() - if cluster_by: - out.append(cluster_by) - - options = self.sql(expression, "options").strip() - if options: - out.append(options) - - from_files = self.sql(expression, "from_files").strip() - if from_files: - out.append(f"FROM FILES {from_files}") - - with_partition_columns = self.sql(expression, "with_partition_columns").strip() - if with_partition_columns: - out.append(f"WITH PARTITION COLUMNS {with_partition_columns}") - - connection = self.sql(expression, "connection").strip() - if connection: - out.append(f"WITH CONNECTION {connection}") - - return " ".join(out) - - -# Register the transform for BigQuery generator -sg.dialects.bigquery.BigQuery.Generator.TRANSFORMS[sge.LoadData] = _loaddata_sql - - def load_data( table_name: str, *, @@ -84,21 +39,6 @@ def load_data( # Quoting is handled by the dialect. 
table_expr = sge.Table(this=base.identifier(table_name)) - sge_columns = ( - sge.Schema( - this=None, - expressions=[ - sge.ColumnDef( - this=base.identifier(name), - kind=sge.DataType.build(typ, dialect="bigquery"), - ) - for name, typ in columns.items() - ], - ) - if columns - else None - ) - sge_partition_by = ( sge.PartitionedByProperty( this=base.identifier(partition_by[0]) @@ -115,17 +55,6 @@ def load_data( else None ) - sge_table_options = ( - sge.Properties( - expressions=[ - sge.Property(this=base.identifier(k), value=base.literal(v)) - for k, v in table_options.items() - ] - ) - if table_options - else None - ) - sge_from_files = sge.Tuple( expressions=[ sge.Property(this=base.identifier(k), value=base.literal(v)) @@ -133,32 +62,159 @@ def load_data( ] ) - sge_with_partition_columns = ( - sge.Schema( - this=None, - expressions=[ - sge.ColumnDef( - this=base.identifier(name), - kind=sge.DataType.build(typ, dialect="bigquery"), - ) - for name, typ in with_partition_columns.items() - ], - ) - if with_partition_columns - else None - ) - sge_connection = base.identifier(connection_name) if connection_name else None return sge.LoadData( this=table_expr, overwrite=(write_disposition == "OVERWRITE"), inpath=sge.convert("fake"), # satisfy sqlglot's required inpath arg - columns=sge_columns, + columns=_get_sge_schema(columns), partition_by=sge_partition_by, cluster_by=sge_cluster_by, - options=sge_table_options, + options=_get_sge_properties(table_options), from_files=sge_from_files, - with_partition_columns=sge_with_partition_columns, + with_partition_columns=_get_sge_schema(with_partition_columns), + connection=sge_connection, + ) + + +def create_external_table( + table_name: str, + *, + replace: bool = False, + if_not_exists: bool = False, + columns: Optional[Mapping[str, str]] = None, + partition_columns: Optional[Mapping[str, str]] = None, + connection_name: Optional[str] = None, + options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None, +) 
-> sge.Create: + """Generates the CREATE EXTERNAL TABLE DDL statement.""" + sge_connection = base.identifier(connection_name) if connection_name else None + + table_expr = sge.Table(this=base.identifier(table_name)) + + # sqlglot.expressions.Create usually takes 'this' (Table or Schema) + sge_schema = _get_sge_schema(columns) + this: sge.Table | sge.Schema + if sge_schema: + sge_schema.set("this", table_expr) + this = sge_schema + else: + this = table_expr + + return sge.Create( + this=this, + kind="EXTERNAL TABLE", + replace=replace, + exists_ok=if_not_exists, + properties=_get_sge_properties(options), connection=sge_connection, + partition_columns=_get_sge_schema(partition_columns), ) + + +def _get_sge_schema( + columns: Optional[Mapping[str, str]] = None +) -> Optional[sge.Schema]: + if not columns: + return None + + return sge.Schema( + this=None, + expressions=[ + sge.ColumnDef( + this=base.identifier(name), + kind=sge.DataType.build(typ, dialect=base.DIALECT), + ) + for name, typ in columns.items() + ], + ) + + +def _get_sge_properties( + options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None +) -> Optional[sge.Properties]: + if not options: + return None + + return sge.Properties( + expressions=[ + sge.Property(this=base.identifier(k), value=base.literal(v)) + for k, v in options.items() + ] + ) + + +def _loaddata_sql(self: sg.Generator, expression: sge.LoadData) -> str: + out = ["LOAD DATA"] + if expression.args.get("overwrite"): + out.append("OVERWRITE") + + out.append(f"INTO {self.sql(expression, 'this').strip()}") + + # We ignore inpath as it's just a dummy to satisfy sqlglot requirements + # but BigQuery uses FROM FILES instead. 
+ + columns = self.sql(expression, "columns").strip() + if columns: + out.append(columns) + + partition_by = self.sql(expression, "partition_by").strip() + if partition_by: + out.append(partition_by) + + cluster_by = self.sql(expression, "cluster_by").strip() + if cluster_by: + out.append(cluster_by) + + options = self.sql(expression, "options").strip() + if options: + out.append(options) + + from_files = self.sql(expression, "from_files").strip() + if from_files: + out.append(f"FROM FILES {from_files}") + + with_partition_columns = self.sql(expression, "with_partition_columns").strip() + if with_partition_columns: + out.append(f"WITH PARTITION COLUMNS {with_partition_columns}") + + connection = self.sql(expression, "connection").strip() + if connection: + out.append(f"WITH CONNECTION {connection}") + + return " ".join(out) + + +def _create_sql(self: sg.Generator, expression: sge.Create) -> str: + kind = expression.args.get("kind") + if kind != "EXTERNAL TABLE": + return self.create_sql(expression) + + out = ["CREATE"] + if expression.args.get("replace"): + out.append("OR REPLACE") + out.append("EXTERNAL TABLE") + if expression.args.get("exists_ok"): + out.append("IF NOT EXISTS") + + out.append(self.sql(expression, "this")) + + connection = self.sql(expression, "connection").strip() + if connection: + out.append(f"WITH CONNECTION {connection}") + + partition_columns = self.sql(expression, "partition_columns").strip() + if partition_columns: + out.append(f"WITH PARTITION COLUMNS {partition_columns}") + + properties = self.sql(expression, "properties").strip() + if properties: + out.append(properties) + + return " ".join(out) + + +# Register the transform for BigQuery generator +base.DIALECT.Generator.TRANSFORMS[sge.LoadData] = _loaddata_sql +base.DIALECT.Generator.TRANSFORMS[sge.Create] = _create_sql diff --git a/bigframes/core/sql/table.py b/bigframes/core/sql/table.py deleted file mode 100644 index 24a97ed159..0000000000 --- a/bigframes/core/sql/table.py +++ 
/dev/null @@ -1,68 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from typing import Mapping, Optional, Union - - -def create_external_table_ddl( - table_name: str, - *, - replace: bool = False, - if_not_exists: bool = False, - columns: Optional[Mapping[str, str]] = None, - partition_columns: Optional[Mapping[str, str]] = None, - connection_name: Optional[str] = None, - options: Mapping[str, Union[str, int, float, bool, list]], -) -> str: - """Generates the CREATE EXTERNAL TABLE DDL statement.""" - statement = ["CREATE"] - if replace: - statement.append("OR REPLACE") - statement.append("EXTERNAL TABLE") - if if_not_exists: - statement.append("IF NOT EXISTS") - statement.append(table_name) - - if columns: - column_defs = ", ".join([f"{name} {typ}" for name, typ in columns.items()]) - statement.append(f"({column_defs})") - - if connection_name: - statement.append(f"WITH CONNECTION `{connection_name}`") - - if partition_columns: - part_defs = ", ".join( - [f"{name} {typ}" for name, typ in partition_columns.items()] - ) - statement.append(f"WITH PARTITION COLUMNS ({part_defs})") - - if options: - opts = [] - for key, value in options.items(): - if isinstance(value, str): - value_sql = repr(value) - opts.append(f"{key} = {value_sql}") - elif isinstance(value, bool): - opts.append(f"{key} = {str(value).upper()}") - elif isinstance(value, list): - list_str = ", ".join([repr(v) for v in value]) - 
opts.append(f"{key} = [{list_str}]") - else: - opts.append(f"{key} = {value}") - options_str = ", ".join(opts) - statement.append(f"OPTIONS ({options_str})") - - return " ".join(statement) diff --git a/tests/unit/bigquery/test_table.py b/tests/unit/bigquery/test_table.py deleted file mode 100644 index badce5e5e2..0000000000 --- a/tests/unit/bigquery/test_table.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License""); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest import mock - -import pytest - -import bigframes.bigquery -import bigframes.core.sql.table -import bigframes.session - - -@pytest.fixture -def mock_session(): - return mock.create_autospec(spec=bigframes.session.Session) - - -def test_create_external_table_ddl(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -def test_create_external_table_ddl_replace(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - replace=True, - columns={"col1": "INT64", "col2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE OR REPLACE EXTERNAL TABLE my-project.my_dataset.my_table 
(col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -def test_create_external_table_ddl_if_not_exists(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - if_not_exists=True, - columns={"col1": "INT64", "col2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE EXTERNAL TABLE IF NOT EXISTS my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -def test_create_external_table_ddl_partition_columns(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - partition_columns={"part1": "DATE", "part2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) WITH PARTITION COLUMNS (part1 DATE, part2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -def test_create_external_table_ddl_connection(): - sql = bigframes.core.sql.table.create_external_table_ddl( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - connection_name="my-connection", - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - ) - expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) WITH CONNECTION `my-connection` OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert sql == expected - - -@mock.patch("bigframes.bigquery._operations.table._get_table_metadata") -def test_create_external_table(get_table_metadata_mock, mock_session): - bigframes.bigquery.create_external_table( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - session=mock_session, - ) - 
mock_session.read_gbq_query.assert_called_once() - generated_sql = mock_session.read_gbq_query.call_args[0][0] - expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" - assert generated_sql == expected - get_table_metadata_mock.assert_called_once() diff --git a/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table/out.sql b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table/out.sql new file mode 100644 index 0000000000..867282de0e --- /dev/null +++ b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table/out.sql @@ -0,0 +1,7 @@ +CREATE EXTERNAL TABLE `my-project.my_dataset.my_table` ( + `col1` INT64, + `col2` STRING +) OPTIONS ( + format='CSV', + uris=['gs://bucket/path*'] +) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_all_options/out.sql b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_all_options/out.sql new file mode 100644 index 0000000000..a08ddf5ee5 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_all_options/out.sql @@ -0,0 +1,10 @@ +CREATE OR REPLACE EXTERNAL TABLE `my-project.my_dataset.my_table` ( + `col1` INT64, + `col2` STRING +) WITH CONNECTION `my-connection` WITH PARTITION COLUMNS ( + `part1` DATE, + `part2` STRING +) OPTIONS ( + format='CSV', + uris=['gs://bucket/path*'] +) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_if_not_exists/out.sql b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_if_not_exists/out.sql new file mode 100644 index 0000000000..e05a553317 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/sql/snapshots/test_ddl/test_create_external_table_if_not_exists/out.sql @@ -0,0 +1,6 @@ 
+CREATE EXTERNAL TABLE IF NOT EXISTS `my-project.my_dataset.my_table` ( + `col1` INT64 +) OPTIONS ( + format='CSV', + uris=['gs://bucket/path*'] +) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/sql/test_ddl.py b/tests/unit/core/compile/sqlglot/sql/test_ddl.py index 14d3708883..48080cd6b9 100644 --- a/tests/unit/core/compile/sqlglot/sql/test_ddl.py +++ b/tests/unit/core/compile/sqlglot/sql/test_ddl.py @@ -12,13 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import pytest +import bigframes.bigquery import bigframes.core.compile.sqlglot.sql as sql +import bigframes.session pytest.importorskip("pytest_snapshot") +@pytest.fixture +def mock_session(): + return mock.create_autospec(spec=bigframes.session.Session) + + def test_load_data_minimal(snapshot): expr = sql.load_data( "my-project.my_dataset.my_table", @@ -40,3 +49,39 @@ def test_load_data_all_options(snapshot): connection_name="my-connection", ) snapshot.assert_match(sql.to_sql(expr), "out.sql") + + +@mock.patch("bigframes.bigquery._operations.table._get_table_metadata") +def test_create_external_table(get_table_metadata_mock, mock_session, snapshot): + bigframes.bigquery.create_external_table( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + session=mock_session, + ) + mock_session.read_gbq_query.assert_called_once() + generated_sql = mock_session.read_gbq_query.call_args[0][0] + snapshot.assert_match(generated_sql, "out.sql") + get_table_metadata_mock.assert_called_once() + + +def test_create_external_table_all_options(snapshot): + expr = sql.create_external_table( + "my-project.my_dataset.my_table", + replace=True, + columns={"col1": "INT64", "col2": "STRING"}, + partition_columns={"part1": "DATE", "part2": "STRING"}, + connection_name="my-connection", + options={"format": "CSV", "uris": 
["gs://bucket/path*"]}, + ) + snapshot.assert_match(sql.to_sql(expr), "out.sql") + + +def test_create_external_table_if_not_exists(snapshot): + expr = sql.create_external_table( + "my-project.my_dataset.my_table", + if_not_exists=True, + columns={"col1": "INT64"}, + options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + snapshot.assert_match(sql.to_sql(expr), "out.sql") From 460dbdc9abf1027ec71a709f64fb20146c930b23 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 19 Mar 2026 15:31:02 -0700 Subject: [PATCH 08/25] refactor: Improve cache encapsulation (#2525) --- bigframes/core/tree_properties.py | 10 ++- bigframes/session/__init__.py | 7 +- bigframes/session/bq_caching_executor.py | 96 ++++++------------------ bigframes/session/execution_cache.py | 88 ++++++++++++++++++++++ 4 files changed, 124 insertions(+), 77 deletions(-) create mode 100644 bigframes/session/execution_cache.py diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index baf4b12566..5f713450f7 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -15,10 +15,13 @@ import functools import itertools -from typing import Callable, Dict, Optional, Sequence +from typing import Callable, Dict, Optional, Sequence, TYPE_CHECKING import bigframes.core.nodes as nodes +if TYPE_CHECKING: + import bigframes.session.execution_cache as execution_cache + def is_trivially_executable(node: nodes.BigFrameNode) -> bool: if local_only(node): @@ -65,7 +68,7 @@ def select_cache_target( root: nodes.BigFrameNode, min_complexity: float, max_complexity: float, - cache: dict[nodes.BigFrameNode, nodes.BigFrameNode], + cache: execution_cache.ExecutionCache, heuristic: Callable[[int, int], float], ) -> Optional[nodes.BigFrameNode]: """Take tree, and return candidate nodes with (# of occurences, post-caching planning complexity). 
@@ -75,7 +78,7 @@ def select_cache_target( @functools.cache def _with_caching(subtree: nodes.BigFrameNode) -> nodes.BigFrameNode: - return nodes.top_down(subtree, lambda x: cache.get(x, x)) + return cache.subsitute_cached_subplans(subtree) def _combine_counts( left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int] @@ -106,6 +109,7 @@ def _node_counts_inner( if len(node_counts) == 0: raise ValueError("node counts should be non-zero") + # for each considered node, calculate heuristic value, and return node with max value return max( node_counts.keys(), key=lambda node: heuristic( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7ea6e99954..0a2f2db189 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -265,15 +265,20 @@ def __init__( metrics=self._metrics, publisher=self._publisher, ) + + labels = {} + if not self._strictly_ordered: + labels["bigframes-mode"] = "unordered" + self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor( bqclient=self._clients_provider.bqclient, bqstoragereadclient=self._clients_provider.bqstoragereadclient, loader=self._loader, storage_manager=self._temp_storage_manager, - strictly_ordered=self._strictly_ordered, metrics=self._metrics, enable_polars_execution=context.enable_polars_execution, publisher=self._publisher, + labels=labels, ) def __del__(self): diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index fbcdfd33f5..cf275154ce 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -17,7 +17,6 @@ import math import threading from typing import Literal, Mapping, Optional, Sequence, Tuple -import weakref import google.api_core.exceptions from google.cloud import bigquery @@ -47,6 +46,7 @@ semi_executor, ) import bigframes.session._io.bigquery as bq_io +import bigframes.session.execution_cache as execution_cache import 
bigframes.session.execution_spec as ex_spec import bigframes.session.metrics import bigframes.session.planner @@ -59,58 +59,6 @@ _MAX_CLUSTER_COLUMNS = 4 MAX_SMALL_RESULT_BYTES = 10 * 1024 * 1024 * 1024 # 10G -SourceIdMapping = Mapping[str, str] - - -class ExecutionCache: - def __init__(self): - # current assumption is only 1 cache of a given node - # in future, might have multiple caches, with different layout, localities - self._cached_executions: weakref.WeakKeyDictionary[ - nodes.BigFrameNode, nodes.CachedTableNode - ] = weakref.WeakKeyDictionary() - self._uploaded_local_data: weakref.WeakKeyDictionary[ - local_data.ManagedArrowTable, - tuple[bq_data.BigqueryDataSource, SourceIdMapping], - ] = weakref.WeakKeyDictionary() - - @property - def mapping(self) -> Mapping[nodes.BigFrameNode, nodes.BigFrameNode]: - return self._cached_executions - - def cache_results_table( - self, - original_root: nodes.BigFrameNode, - data: bq_data.BigqueryDataSource, - ): - # Assumption: GBQ cached table uses field name as bq column name - scan_list = nodes.ScanList( - tuple( - nodes.ScanItem(field.id, field.id.sql) for field in original_root.fields - ) - ) - cached_replacement = nodes.CachedTableNode( - source=data, - scan_list=scan_list, - table_session=original_root.session, - original_node=original_root, - ) - assert original_root.schema == cached_replacement.schema - self._cached_executions[original_root] = cached_replacement - - def cache_remote_replacement( - self, - local_data: local_data.ManagedArrowTable, - bq_data: bq_data.BigqueryDataSource, - ): - # bq table has one extra column for offsets, those are implicit for local data - assert len(local_data.schema.items) + 1 == len(bq_data.table.physical_schema) - mapping = { - local_data.schema.items[i].column: bq_data.table.physical_schema[i].name - for i in range(len(local_data.schema)) - } - self._uploaded_local_data[local_data] = (bq_data, mapping) - class BigQueryCachingExecutor(executor.Executor): """Computes BigFrames 
values using BigQuery Engine. @@ -128,20 +76,20 @@ def __init__( bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, loader: loader.GbqDataLoader, *, - strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, enable_polars_execution: bool = False, publisher: bigframes.core.events.Publisher, + labels: Mapping[str, str] = {}, ): self.bqclient = bqclient self.storage_manager = storage_manager - self.strictly_ordered: bool = strictly_ordered - self.cache: ExecutionCache = ExecutionCache() + self.cache: execution_cache.ExecutionCache = execution_cache.ExecutionCache() self.metrics = metrics self.loader = loader self.bqstoragereadclient = bqstoragereadclient self._enable_polars_execution = enable_polars_execution self._publisher = publisher + self._labels = labels # TODO(tswast): Send events from semi-executors, too. self._semi_executors: Sequence[semi_executor.SemiExecutor] = ( @@ -410,8 +358,8 @@ def _run_execute_query( bigframes.options.compute.maximum_bytes_billed ) - if not self.strictly_ordered: - job_config.labels["bigframes-mode"] = "unordered" + if self._labels: + job_config.labels.update(self._labels) try: # Trick the type checker into thinking we got a literal. @@ -450,9 +398,6 @@ def _run_execute_query( else: raise - def replace_cached_subtrees(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: - return nodes.top_down(node, lambda x: self.cache.mapping.get(x, x)) - def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): """ Can the block be evaluated very cheaply? 
@@ -482,7 +427,7 @@ def prepare_plan( ): self._simplify_with_caching(plan) - plan = self.replace_cached_subtrees(plan) + plan = self.cache.subsitute_cached_subplans(plan) plan = rewrite.column_pruning(plan) plan = plan.top_down(rewrite.fold_row_counts) @@ -527,7 +472,7 @@ def _cache_with_session_awareness( self._cache_with_cluster_cols( bigframes.core.ArrayValue(target), cluster_cols_sql_names ) - elif self.strictly_ordered: + elif not target.order_ambiguous: self._cache_with_offsets(bigframes.core.ArrayValue(target)) else: self._cache_with_cluster_cols(bigframes.core.ArrayValue(target), []) @@ -552,7 +497,7 @@ def _cache_most_complex_subtree(self, node: nodes.BigFrameNode) -> bool: node, min_complexity=(QUERY_COMPLEXITY_LIMIT / 500), max_complexity=QUERY_COMPLEXITY_LIMIT, - cache=dict(self.cache.mapping), + cache=self.cache, # Heuristic: subtree_compleixty * (copies of subtree)^2 heuristic=lambda complexity, count: math.log(complexity) + 2 * math.log(count), @@ -581,32 +526,37 @@ def _substitute_large_local_sources(self, original_root: nodes.BigFrameNode): def map_local_scans(node: nodes.BigFrameNode): if not isinstance(node, nodes.ReadLocalNode): return node - if node.local_data_source not in self.cache._uploaded_local_data: - return node - bq_source, source_mapping = self.cache._uploaded_local_data[ + uploaded_local_data = self.cache.get_uploaded_local_data( node.local_data_source - ] - scan_list = node.scan_list.remap_source_ids(source_mapping) + ) + if uploaded_local_data is None: + return node + + scan_list = node.scan_list.remap_source_ids( + uploaded_local_data.source_mapping + ) # offsets_col isn't part of ReadTableNode, so emulate by adding to end of scan_list if node.offsets_col is not None: # Offsets are always implicitly the final column of uploaded data # See: Loader.load_data scan_list = scan_list.append( - bq_source.table.physical_schema[-1].name, + uploaded_local_data.bq_source.table.physical_schema[-1].name, bigframes.dtypes.INT_DTYPE, 
node.offsets_col, ) - return nodes.ReadTableNode(bq_source, scan_list, node.session) + return nodes.ReadTableNode( + uploaded_local_data.bq_source, scan_list, node.session + ) return original_root.bottom_up(map_local_scans) def _upload_local_data(self, local_table: local_data.ManagedArrowTable): - if local_table in self.cache._uploaded_local_data: + if self.cache.get_uploaded_local_data(local_table) is not None: return # Lock prevents concurrent repeated work, but slows things down. # Might be better as a queue and a worker thread with self._upload_lock: - if local_table not in self.cache._uploaded_local_data: + if self.cache.get_uploaded_local_data(local_table) is None: uploaded = self.loader.load_data_or_write_data( local_table, bigframes.core.guid.generate_guid() ) diff --git a/bigframes/session/execution_cache.py b/bigframes/session/execution_cache.py new file mode 100644 index 0000000000..782a1c5c4e --- /dev/null +++ b/bigframes/session/execution_cache.py @@ -0,0 +1,88 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import dataclasses +from typing import Mapping, Optional +import weakref + +from bigframes.core import bq_data, local_data, nodes + +SourceIdMapping = Mapping[str, str] + + +@dataclasses.dataclass(frozen=True) +class UploadedLocalData: + bq_source: bq_data.BigqueryDataSource + source_mapping: SourceIdMapping + + +class ExecutionCache: + def __init__(self): + # effectively two separate caches that don't interact + self._cached_executions: weakref.WeakKeyDictionary[ + nodes.BigFrameNode, bq_data.BigqueryDataSource + ] = weakref.WeakKeyDictionary() + # This upload cache is entirely independent of the plan cache. + self._uploaded_local_data: weakref.WeakKeyDictionary[ + local_data.ManagedArrowTable, + UploadedLocalData, + ] = weakref.WeakKeyDictionary() + + def subsitute_cached_subplans(self, root: nodes.BigFrameNode) -> nodes.BigFrameNode: + def replace_if_cached(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + if node not in self._cached_executions: + return node + # Assumption: GBQ cached table uses field name as bq column name + scan_list = nodes.ScanList( + tuple(nodes.ScanItem(field.id, field.id.sql) for field in node.fields) + ) + bq_data = self._cached_executions[node] + cached_replacement = nodes.CachedTableNode( + source=bq_data, + scan_list=scan_list, + table_session=node.session, + original_node=node, + ) + assert node.schema == cached_replacement.schema + return cached_replacement + + return nodes.top_down(root, replace_if_cached) + + def cache_results_table( + self, + original_root: nodes.BigFrameNode, + data: bq_data.BigqueryDataSource, + ): + self._cached_executions[original_root] = data + + ## Local data upload caching + def cache_remote_replacement( + self, + local_data: local_data.ManagedArrowTable, + bq_data: bq_data.BigqueryDataSource, + ): + # bq table has one extra column for offsets, those are implicit for local data + assert len(local_data.schema.items) + 1 == len(bq_data.table.physical_schema) + 
mapping = { + local_data.schema.items[i].column: bq_data.table.physical_schema[i].name + for i in range(len(local_data.schema)) + } + self._uploaded_local_data[local_data] = UploadedLocalData(bq_data, mapping) + + def get_uploaded_local_data( + self, local_data: local_data.ManagedArrowTable + ) -> Optional[UploadedLocalData]: + return self._uploaded_local_data.get(local_data) From 6b0509be42b457f8a78199645fd00eefb1ed5951 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 20 Mar 2026 13:31:07 -0700 Subject: [PATCH 09/25] test: Remove ingress setting all test (#2528) --- .../large/functions/test_remote_function.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index a631d97a2e..1ab4987302 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2682,25 +2682,6 @@ def square(x: int) -> int: ) -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_ingress_settings_w_all(session): - ingress_settings_args = {"cloud_function_ingress_settings": "all"} - - with pytest.raises( - google.api_core.exceptions.FailedPrecondition, - match="400.*allowedIngress violated", - ): - - def square(x: int) -> int: - return x * x - - session.remote_function( - reuse=False, - cloud_function_service_account="default", - **ingress_settings_args, - )(square) - - @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_ingress_settings_unsupported(session): with pytest.raises( From b5a765239a1962e182d35c8b8e78fe0d4be1eac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Sat, 21 Mar 2026 17:06:59 +0000 Subject: [PATCH 10/25] chore: include pandas accessors in api logging (#2524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #2518 🦕 --- bigframes/extensions/pandas/__init__.py | 12 
++++++++++++ bigframes/extensions/pandas/dataframe_accessor.py | 11 +++++++---- docs/reference/index.rst | 2 +- tests/unit/extensions/pandas/test_registration.py | 6 ++++-- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/bigframes/extensions/pandas/__init__.py b/bigframes/extensions/pandas/__init__.py index 58d482ea38..d47acd3b05 100644 --- a/bigframes/extensions/pandas/__init__.py +++ b/bigframes/extensions/pandas/__init__.py @@ -11,3 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +""" +BigQuery DataFrames automatically registers a pandas extenstion when imported. +This allows you to use the power of the BigQuery engine with pandas objects +directly. +""" + +from bigframes.extensions.pandas.dataframe_accessor import ( + PandasBigQueryDataFrameAccessor, +) + +__all__ = ["PandasBigQueryDataFrameAccessor"] diff --git a/bigframes/extensions/pandas/dataframe_accessor.py b/bigframes/extensions/pandas/dataframe_accessor.py index aab9f35bd9..ad75386f1c 100644 --- a/bigframes/extensions/pandas/dataframe_accessor.py +++ b/bigframes/extensions/pandas/dataframe_accessor.py @@ -18,10 +18,12 @@ import pandas.api.extensions import bigframes.core.global_session as bf_session +from bigframes.core.logging import log_adapter import bigframes.pandas as bpd -class AIAccessor: +@log_adapter.class_logger +class PandasAIAccessor: """ Pandas DataFrame accessor for BigQuery AI functions. """ @@ -101,7 +103,8 @@ def forecast( @pandas.api.extensions.register_dataframe_accessor("bigquery") -class BigQueryDataFrameAccessor: +@log_adapter.class_logger +class PandasBigQueryDataFrameAccessor: """ Pandas DataFrame accessor for BigQuery DataFrames functionality. 
@@ -112,11 +115,11 @@ def __init__(self, pandas_obj: pandas.DataFrame): self._obj = pandas_obj @property - def ai(self) -> "AIAccessor": + def ai(self) -> "PandasAIAccessor": """ Accessor for BigQuery AI functions. """ - return AIAccessor(self._obj) + return PandasAIAccessor(self._obj) def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None): """ diff --git a/docs/reference/index.rst b/docs/reference/index.rst index cb295a4309..0de668c4fa 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -27,7 +27,7 @@ BigQuery DataFrames provides extensions to pandas DataFrame objects. .. autosummary:: :toctree: api - bigframes.extensions.pandas.dataframe_accessor.BigQueryDataFrameAccessor + bigframes.extensions.pandas ML APIs ~~~~~~~ diff --git a/tests/unit/extensions/pandas/test_registration.py b/tests/unit/extensions/pandas/test_registration.py index 1258098091..7007d6f9f2 100644 --- a/tests/unit/extensions/pandas/test_registration.py +++ b/tests/unit/extensions/pandas/test_registration.py @@ -22,6 +22,8 @@ def test_bigframes_import_registers_accessor(): df = pd.DataFrame({"a": [1]}) # If bigframes was imported, df.bigquery should exist assert hasattr(df, "bigquery") - from bigframes.extensions.pandas.dataframe_accessor import BigQueryDataFrameAccessor + from bigframes.extensions.pandas.dataframe_accessor import ( + PandasBigQueryDataFrameAccessor, + ) - assert isinstance(df.bigquery, BigQueryDataFrameAccessor) + assert isinstance(df.bigquery, PandasBigQueryDataFrameAccessor) From e8c46032154e186042314d97aa813301413d8a13 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 23 Mar 2026 12:09:06 -0700 Subject: [PATCH 11/25] fix: support melting empty DataFrames without crashing (#2509) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allows alignment melts over zero-row offset layouts Fixes #<452681068> 🦕 --- bigframes/core/blocks.py | 69 +++++++++++++++++++-------- 
tests/system/small/test_dataframe.py | 13 +++++ tests/system/small/test_multiindex.py | 31 ++++++++++++ 3 files changed, 93 insertions(+), 20 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 239eedf6d3..a15c83e82e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1822,9 +1822,9 @@ def melt( Arguments correspond to pandas.melt arguments. """ # TODO: Implement col_level and ignore_index - value_labels: pd.Index = pd.Index( - [self.col_id_to_label[col_id] for col_id in value_vars] - ) + value_labels: pd.Index = self.column_labels[ + [self.value_columns.index(col_id) for col_id in value_vars] + ] id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] unpivot_expr, (var_col_ids, unpivot_out, passthrough_cols) = unpivot( @@ -3417,6 +3417,7 @@ def unpivot( joined_array, (labels_mapping, column_mapping) = labels_array.relational_join( array_value, type="cross" ) + new_passthrough_cols = [column_mapping[col] for col in passthrough_columns] # Last column is offsets index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]] @@ -3426,20 +3427,24 @@ def unpivot( unpivot_exprs: List[ex.Expression] = [] # Supports producing multiple stacked ouput columns for stacking only part of hierarchical index for input_ids in unpivot_columns: - # row explode offset used to choose the input column - # we use offset instead of label as labels are not necessarily unique - cases = itertools.chain( - *( - ( - ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), - ex.deref(column_mapping[id_or_null]) - if (id_or_null is not None) - else ex.const(None), + col_expr: ex.Expression + if not input_ids: + col_expr = ex.const(None, dtype=bigframes.dtypes.INT_DTYPE) + else: + # row explode offset used to choose the input column + # we use offset instead of label as labels are not necessarily unique + cases = itertools.chain( + *( + ( + ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), + 
ex.deref(column_mapping[id_or_null]) + if (id_or_null is not None) + else ex.const(None), + ) + for i, id_or_null in enumerate(input_ids) ) - for i, id_or_null in enumerate(input_ids) ) - ) - col_expr = ops.case_when_op.as_expr(*cases) + col_expr = ops.case_when_op.as_expr(*cases) unpivot_exprs.append(col_expr) joined_array, unpivot_col_ids = joined_array.compute_values(unpivot_exprs) @@ -3457,19 +3462,43 @@ def _pd_index_to_array_value( Create an ArrayValue from a list of label tuples. The last column will be row offsets. """ + id_gen = bigframes.core.identifiers.standard_id_strings() + col_ids = [next(id_gen) for _ in range(index.nlevels)] + offset_id = next(id_gen) + rows = [] labels_as_tuples = utils.index_as_tuples(index) for row_offset in range(len(index)): - id_gen = bigframes.core.identifiers.standard_id_strings() row_label = labels_as_tuples[row_offset] row_label = (row_label,) if not isinstance(row_label, tuple) else row_label row = {} - for label_part, id in zip(row_label, id_gen): - row[id] = label_part if pd.notnull(label_part) else None - row[next(id_gen)] = row_offset + for label_part, col_id in zip(row_label, col_ids): + row[col_id] = label_part if pd.notnull(label_part) else None + row[offset_id] = row_offset rows.append(row) - return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session) + if not rows: + dtypes_list = getattr(index, "dtypes", None) + if dtypes_list is None: + dtypes_list = ( + [index.dtype] if hasattr(index, "dtype") else [pd.Float64Dtype()] + ) + + fields = [] + for col_id, dtype in zip(col_ids, dtypes_list): + try: + pa_type = bigframes.dtypes.bigframes_dtype_to_arrow_dtype(dtype) + except Exception: + pa_type = pa.string() + fields.append(pa.field(col_id, pa_type)) + fields.append(pa.field(offset_id, pa.int64())) + schema = pa.schema(fields) + pt = pa.Table.from_pylist([], schema=schema) + else: + pt = pa.Table.from_pylist(rows) + pt = pt.rename_columns([*col_ids, offset_id]) + + return 
core.ArrayValue.from_pyarrow(pt, session=session) def _resolve_index_col( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9683a8bc52..bc6095d434 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5902,6 +5902,19 @@ def test_to_gbq_table_labels(scalars_df_index): assert table.labels["test"] == "labels" +def test_to_gbq_obj_ref_persists(session): + # Test that saving and loading an Object Reference retains its dtype + bdf = session.from_glob_path( + "gs://cloud-samples-data/vision/ocr/*.jpg", name="uris" + ).head(1) + + destination_table = "bigframes-dev.bigframes_tests_sys.test_obj_ref_persistence" + bdf.to_gbq(destination_table, if_exists="replace") + + loaded_df = session.read_gbq(destination_table) + assert loaded_df["uris"].dtype == dtypes.OBJ_REF_DTYPE + + @pytest.mark.parametrize( ("col_names", "ignore_index"), [ diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 522e8db9e4..18368fc512 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1490,3 +1490,34 @@ def test_multiindex_eq_const(scalars_df_index, scalars_pandas_df_index): bigframes.testing.utils.assert_index_equal( pandas.Index(pd_result, dtype="boolean"), bf_result.to_pandas() ) + + +def test_count_empty_multiindex_columns(session): + df = pandas.DataFrame( + [], index=[1, 2], columns=pandas.MultiIndex.from_tuples([], names=["a", "b"]) + ) + bdf = session.read_pandas(df) + + # count() operation unpivots columns, triggering the empty MultiIndex bug internally + count_df = bdf.count() + + # The local fix ensures that empty unpivoted columns generate properly typed NULLs + # rather than failing syntax validation downstream in BigQuery. + # We compile to `.sql` to verify it succeeds locally without evaluating on BigQuery natively. 
+ _ = count_df.to_frame().sql + + # Assert structural layout is correct + assert count_df.index.nlevels == 2 + assert list(count_df.index.names) == ["a", "b"] + + +def test_dataframe_melt_multiindex(session): + # Tests that `melt` operations via count do not cause MultiIndex drops in Arrow + df = pandas.DataFrame({"A": [1], "B": ["string"], "C": [3]}) + df.columns = pandas.MultiIndex.from_tuples( + [("Group1", "A"), ("Group2", "B"), ("Group1", "C")] + ) + bdf = session.read_pandas(df) + + count_df = bdf.count().to_pandas() + assert count_df.shape[0] == 3 From 915cce52b6f55ce043589104cd9b02c27a05581c Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 23 Mar 2026 14:33:59 -0700 Subject: [PATCH 12/25] refactor: increase test coverages on sqlglot compiler (#2527) --- .../ibis_compiler/scalar_op_registry.py | 12 ++--- .../compile/sqlglot/aggregate_compiler.py | 2 - .../compile/sqlglot/expressions/array_ops.py | 25 --------- .../sqlglot/expressions/datetime_ops.py | 6 +-- .../aggregations/test_op_registration.py | 20 +++++++ .../test_array_ops/test_array_index/out.sql | 5 +- .../test_array_reduce_op/out.sql | 7 ++- .../test_array_ops/test_array_slice/out.sql | 17 ++++++ .../test_array_slice_with_only_start/out.sql | 9 ---- .../out.sql | 9 ---- .../out.sql | 3 ++ .../test_obj_make_ref_json/out.sql | 3 ++ .../test_bool_ops/test_and_op/out.sql | 3 +- .../test_bool_ops/test_or_op/out.sql | 3 +- .../test_bool_ops/test_xor_op/out.sql | 8 ++- .../test_eq_numeric/out.sql | 1 + .../test_ge_numeric/out.sql | 1 + .../test_gt_numeric/out.sql | 1 + .../test_comparison_ops/test_is_in/out.sql | 7 ++- .../test_le_numeric/out.sql | 1 + .../test_lt_numeric/out.sql | 1 + .../test_ne_numeric/out.sql | 3 ++ .../test_datetime_to_integer_label/out.sql | 52 ++++++++++++++++++- .../test_to_datetime/out.sql | 3 +- .../test_to_timestamp/out.sql | 3 +- .../test_json_value_array/out.sql | 3 ++ .../test_numeric_ops/test_add_numeric/out.sql | 1 + .../test_numeric_ops/test_div_numeric/out.sql | 1 + 
.../test_floordiv_numeric/out.sql | 47 +++++++++++++++++ .../test_numeric_ops/test_mul_numeric/out.sql | 1 + .../test_numeric_ops/test_sub_numeric/out.sql | 9 ++-- .../test_string_ops/test_str_slice/out.sql | 17 +++++- .../sqlglot/expressions/test_array_ops.py | 51 ++++++++++-------- .../sqlglot/expressions/test_blob_ops.py | 24 +++++++++ .../sqlglot/expressions/test_bool_ops.py | 3 ++ .../expressions/test_comparison_ops.py | 13 +++++ .../sqlglot/expressions/test_datetime_ops.py | 19 +++++++ .../sqlglot/expressions/test_json_ops.py | 10 ++++ .../sqlglot/expressions/test_numeric_ops.py | 15 ++++-- .../sqlglot/expressions/test_string_ops.py | 14 +++-- 40 files changed, 335 insertions(+), 98 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice/out.sql delete mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_only_start/out.sql delete mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_start_and_stop/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url_with_duration/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref_json/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_value_array/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index dd27587433..5bb278e882 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -663,7 +663,7 @@ def datetime_to_integer_label_non_fixed_frequency( .else_((x_int - first - 1) // us + 1) # type: ignore 
.end() ) - elif rule_code == "ME": # Monthly + elif rule_code in ("M", "ME"): # Monthly x_int = x.year() * 12 + x.month() - 1 # type: ignore first = y.year() * 12 + y.month() - 1 # type: ignore x_int_label = ( @@ -672,7 +672,7 @@ def datetime_to_integer_label_non_fixed_frequency( .else_((x_int - first - 1) // n + 1) # type: ignore .end() ) - elif rule_code == "QE-DEC": # Quarterly + elif rule_code in ("Q-DEC", "QE-DEC"): # Quarterly x_int = x.year() * 4 + x.quarter() - 1 # type: ignore first = y.year() * 4 + y.quarter() - 1 # type: ignore x_int_label = ( @@ -681,7 +681,7 @@ def datetime_to_integer_label_non_fixed_frequency( .else_((x_int - first - 1) // n + 1) # type: ignore .end() ) - elif rule_code == "YE-DEC": # Yearly + elif rule_code in ("A-DEC", "Y-DEC", "YE-DEC"): # Yearly x_int = x.year() # type: ignore first = y.year() # type: ignore x_int_label = ( @@ -749,7 +749,7 @@ def integer_label_to_datetime_op_non_fixed_frequency( .cast(ibis_dtypes.Timestamp(timezone="UTC")) .cast(y.type()) ) - elif rule_code == "ME": # Monthly + elif rule_code in ("M", "ME"): # Monthly one = ibis_types.literal(1) twelve = ibis_types.literal(12) first = y.year() * twelve + y.month() - one # type: ignore @@ -769,7 +769,7 @@ def integer_label_to_datetime_op_non_fixed_frequency( 0, ) x_label = next_month_date - ibis_api.interval(days=1) - elif rule_code == "QE-DEC": # Quarterly + elif rule_code in ("Q-DEC", "QE-DEC"): # Quarterly one = ibis_types.literal(1) three = ibis_types.literal(3) four = ibis_types.literal(4) @@ -792,7 +792,7 @@ def integer_label_to_datetime_op_non_fixed_frequency( ) x_label = next_month_date - ibis_api.interval(days=1) - elif rule_code == "YE-DEC": # Yearly + elif rule_code in ("A-DEC", "Y-DEC", "YE-DEC"): # Yearly one = ibis_types.literal(1) first = y.year() # type: ignore x = x * n + first # type: ignore diff --git a/bigframes/core/compile/sqlglot/aggregate_compiler.py b/bigframes/core/compile/sqlglot/aggregate_compiler.py index f86e2af0de..9f72e1c794 100644 
--- a/bigframes/core/compile/sqlglot/aggregate_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregate_compiler.py @@ -70,7 +70,5 @@ def compile_analytic( aggregate.arg.output_type, ) return unary_compiler.compile(aggregate.op, column, window) - elif isinstance(aggregate, agg_expressions.BinaryAggregation): - raise NotImplementedError("binary analytic operations not yet supported") else: raise ValueError(f"Unexpected analytic operation: {aggregate}") diff --git a/bigframes/core/compile/sqlglot/expressions/array_ops.py b/bigframes/core/compile/sqlglot/expressions/array_ops.py index eb7582cb16..b2c8c1c568 100644 --- a/bigframes/core/compile/sqlglot/expressions/array_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/array_ops.py @@ -105,31 +105,6 @@ def _coerce_bool_to_int(typed_expr: TypedExpr) -> sge.Expression: return typed_expr.expr -def _string_slice(expr: TypedExpr, op: ops.ArraySliceOp) -> sge.Expression: - # local name for each element in the array - el = sg.to_identifier("el") - # local name for the index in the array - slice_idx = sg.to_identifier("slice_idx") - - conditions: typing.List[sge.Predicate] = [slice_idx >= op.start] - if op.stop is not None: - conditions.append(slice_idx < op.stop) - - selected_elements = ( - sge.select(el) - .from_( - sge.Unnest( - expressions=[expr.expr], - alias=sge.TableAlias(columns=[el]), - offset=slice_idx, - ) - ) - .where(*conditions) - ) - - return sge.array(selected_elements) - - def _array_slice(expr: TypedExpr, op: ops.ArraySliceOp) -> sge.Expression: # local name for each element in the array el = sg.to_identifier("el") diff --git a/bigframes/core/compile/sqlglot/expressions/datetime_ops.py b/bigframes/core/compile/sqlglot/expressions/datetime_ops.py index 21f8b39e7d..4e0a75e699 100644 --- a/bigframes/core/compile/sqlglot/expressions/datetime_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/datetime_ops.py @@ -125,7 +125,7 @@ def _datetime_to_integer_label_non_fixed_frequency( 
expression=sge.convert(1), ), ) - elif rule_code == "ME": # Monthly + elif rule_code in ("M", "ME"): # Monthly x_int = sge.Paren( # type: ignore this=sge.Add( this=sge.Mul( @@ -182,7 +182,7 @@ def _datetime_to_integer_label_non_fixed_frequency( expression=sge.convert(1), ), ) - elif rule_code == "QE-DEC": # Quarterly + elif rule_code in ("Q-DEC", "QE-DEC"): # Quarterly x_int = sge.Paren( # type: ignore this=sge.Add( this=sge.Mul( @@ -239,7 +239,7 @@ def _datetime_to_integer_label_non_fixed_frequency( expression=sge.convert(1), ), ) - elif rule_code == "YE-DEC": # Yearly + elif rule_code in ("A-DEC", "Y-DEC", "YE-DEC"): # Yearly x_int = sge.Extract(this=sge.Identifier(this="YEAR"), expression=x.expr) first = sge.Extract(this=sge.Identifier(this="YEAR"), expression=y.expr) return sge.Case( diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py b/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py index c6c1c21151..7d4f53254d 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py @@ -42,3 +42,23 @@ def test_func(input: sge.Expression) -> sge.Expression: ValueError, match=r".*first parameter must be a window operator.*" ): test_func(sge.to_identifier("A")) + + +def test_register_already_registered_raise_error(): + reg = op_registration.OpRegistration() + + @reg.register(agg_ops.SizeOp) + def test_func1(op, input): + return input + + with pytest.raises(ValueError, match=r".*is already registered.*"): + + @reg.register(agg_ops.SizeOp) + def test_func2(op, input): + return input + + +def test_getitem_not_registered_raise_error(): + reg = op_registration.OpRegistration() + with pytest.raises(ValueError, match=r".*is not registered.*"): + _ = reg[agg_ops.SizeOp()] diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_index/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_index/out.sql index 4200470b65..a1f089424a 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_index/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_index/out.sql @@ -1,3 +1,4 @@ SELECT - `string_list_col`[SAFE_OFFSET(1)] AS `string_list_col` -FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` AS `bft_0` \ No newline at end of file + IF(SUBSTRING(`string_col`, 2, 1) <> '', SUBSTRING(`string_col`, 2, 1), NULL) AS `string_index`, + [`int64_col`, `int64_too`][SAFE_OFFSET(1)] AS `array_index` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql index 26fc32f68d..1053ec1c2c 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql @@ -18,5 +18,10 @@ SELECT SELECT COALESCE(LOGICAL_OR(bf_arr_reduce_uid), FALSE) FROM UNNEST(`bool_list_col`) AS bf_arr_reduce_uid - ) AS `any_bool` + ) AS `any_bool`, + ( + SELECT + ARRAY_AGG(bf_arr_reduce_uid IGNORE NULLS) + FROM UNNEST(`string_list_col`) AS bf_arr_reduce_uid + ) AS `array_agg_str` FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice/out.sql new file mode 100644 index 0000000000..ffec3b8e93 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice/out.sql @@ -0,0 +1,17 @@ +SELECT + 
SUBSTRING(`string_col`, 2, 4) AS `string_slice`, + ARRAY( + SELECT + el + FROM UNNEST([`int64_col`, `int64_too`]) AS el WITH OFFSET AS slice_idx + WHERE + slice_idx >= 1 + ) AS `slice_only_start`, + ARRAY( + SELECT + el + FROM UNNEST([`int64_col`, `int64_too`]) AS el WITH OFFSET AS slice_idx + WHERE + slice_idx >= 1 AND slice_idx < 5 + ) AS `slice_start_stop` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_only_start/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_only_start/out.sql deleted file mode 100644 index c37e27b2cf..0000000000 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_only_start/out.sql +++ /dev/null @@ -1,9 +0,0 @@ -SELECT - ARRAY( - SELECT - el - FROM UNNEST(`string_list_col`) AS el WITH OFFSET AS slice_idx - WHERE - slice_idx >= 1 - ) AS `string_list_col` -FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_start_and_stop/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_start_and_stop/out.sql deleted file mode 100644 index 70417daf5c..0000000000 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_slice_with_start_and_stop/out.sql +++ /dev/null @@ -1,9 +0,0 @@ -SELECT - ARRAY( - SELECT - el - FROM UNNEST(`string_list_col`) AS el WITH OFFSET AS slice_idx - WHERE - slice_idx >= 1 AND slice_idx < 5 - ) AS `string_list_col` -FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url_with_duration/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url_with_duration/out.sql new file mode 100644 index 0000000000..2e8b60230f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url_with_duration/out.sql @@ -0,0 +1,3 @@ +SELECT + OBJ.GET_ACCESS_URL(`string_col`, 'READ', INTERVAL 3600 MICROSECOND) AS `string_col` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref_json/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref_json/out.sql new file mode 100644 index 0000000000..dc84b3bec1 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_make_ref_json/out.sql @@ -0,0 +1,3 @@ +SELECT + OBJ.MAKE_REF(`string_col`) AS `string_col` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_and_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_and_op/out.sql index 7afe926ab4..d6f6587ead 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_and_op/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_and_op/out.sql @@ -4,5 +4,6 @@ SELECT `int64_col`, `int64_col` & `int64_col` AS `int_and_int`, `bool_col` AND `bool_col` AS `bool_and_bool`, - IF(`bool_col` = FALSE, `bool_col`, NULL) AS `bool_and_null` + IF(`bool_col` = FALSE, `bool_col`, NULL) AS `bool_and_null`, + IF(`bool_col` = FALSE, `bool_col`, NULL) AS `null_and_bool` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_or_op/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_or_op/out.sql index 89a80b05a8..dad4cee9d0 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_or_op/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_or_op/out.sql @@ -4,5 +4,6 @@ SELECT `int64_col`, `int64_col` | `int64_col` AS `int_and_int`, `bool_col` OR `bool_col` AS `bool_and_bool`, - IF(`bool_col` = TRUE, `bool_col`, NULL) AS `bool_and_null` + IF(`bool_col` = TRUE, `bool_col`, NULL) AS `bool_and_null`, + IF(`bool_col` = TRUE, `bool_col`, NULL) AS `null_and_bool` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_xor_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_xor_op/out.sql index 74a8e81081..4be3b9f94a 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_xor_op/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_bool_ops/test_xor_op/out.sql @@ -13,5 +13,11 @@ SELECT ) OR ( NOT `bool_col` AND CAST(NULL AS BOOLEAN) - ) AS `bool_and_null` + ) AS `bool_and_null`, + ( + `bool_col` AND NOT CAST(NULL AS BOOLEAN) + ) + OR ( + NOT `bool_col` AND CAST(NULL AS BOOLEAN) + ) AS `null_and_bool` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql index 37554c77e0..7827731881 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_eq_numeric/out.sql @@ -5,6 +5,7 @@ SELECT `int64_col` = `int64_col` AS `int_eq_int`, `int64_col` = 1 AS 
`int_eq_1`, `int64_col` IS NULL AS `int_eq_null`, + `int64_col` IS NULL AS `null_eq_int`, `int64_col` = CAST(`bool_col` AS INT64) AS `int_eq_bool`, CAST(`bool_col` AS INT64) = `int64_col` AS `bool_eq_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql index f66e8435eb..5903cf0369 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ge_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` >= `int64_col` AS `int_ge_int`, `int64_col` >= 1 AS `int_ge_1`, + NULL AS `null_ge_int`, `int64_col` >= CAST(`bool_col` AS INT64) AS `int_ge_bool`, CAST(`bool_col` AS INT64) >= `int64_col` AS `bool_ge_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql index d97f9d1d42..42bf029240 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_gt_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` > `int64_col` AS `int_gt_int`, `int64_col` > 1 AS `int_gt_1`, + NULL AS `null_gt_int`, `int64_col` > CAST(`bool_col` AS INT64) AS `int_gt_bool`, CAST(`bool_col` AS INT64) > `int64_col` AS `bool_gt_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql index d1af7c57ae..b6d860d472 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql @@ -10,5 +10,10 @@ SELECT COALESCE(`int64_col` IN (123456), FALSE) AS `ints_wo_match_nulls`, ( `float64_col` IS NULL - ) OR `float64_col` IN (1, 2, 3) AS `float_in_ints` + ) OR `float64_col` IN (1, 2, 3) AS `float_in_ints`, + ( + `int64_col` IS NULL + ) OR `int64_col` IN (2) AS `mixed_with_null`, + COALESCE(CAST(`bool_col` AS INT64) IN (1, 2.5), FALSE) AS `bool_in_mixed`, + `int64_col` IS NULL AS `only_null_match` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql index e4e542d1c5..c6c8651010 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_le_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` <= `int64_col` AS `int_le_int`, `int64_col` <= 1 AS `int_le_1`, + NULL AS `null_le_int`, `int64_col` <= CAST(`bool_col` AS INT64) AS `int_le_bool`, CAST(`bool_col` AS INT64) <= `int64_col` AS `bool_le_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql index d616aecc8c..ec5c317a8e 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql +++ 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_lt_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` < `int64_col` AS `int_lt_int`, `int64_col` < 1 AS `int_lt_1`, + NULL AS `null_lt_int`, `int64_col` < CAST(`bool_col` AS INT64) AS `int_lt_bool`, CAST(`bool_col` AS INT64) < `int64_col` AS `bool_lt_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql index abef6f93d6..448a614629 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_ne_numeric/out.sql @@ -7,6 +7,9 @@ SELECT ( `int64_col` ) IS NOT NULL AS `int_ne_null`, + ( + `int64_col` + ) IS NOT NULL AS `null_ne_int`, `int64_col` <> CAST(`bool_col` AS INT64) AS `int_ne_bool`, CAST(`bool_col` AS INT64) <> `int64_col` AS `bool_ne_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_datetime_to_integer_label/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_datetime_to_integer_label/out.sql index 8654f94270..4b0696386c 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_datetime_to_integer_label/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_datetime_to_integer_label/out.sql @@ -5,6 +5,13 @@ SELECT 86400000000 ) ) AS INT64) AS `fixed_freq`, + CAST(FLOOR(IEEE_DIVIDE(UNIX_MICROS(CAST(`datetime_col` AS TIMESTAMP)) - 0, 86400000000)) AS INT64) AS `origin_epoch`, + CAST(FLOOR( + IEEE_DIVIDE( + UNIX_MICROS(CAST(`datetime_col` AS TIMESTAMP)) - 
UNIX_MICROS(CAST(CAST(`timestamp_col` AS DATE) AS TIMESTAMP)), + 86400000000 + ) + ) AS INT64) AS `origin_start_day`, CASE WHEN UNIX_MICROS( CAST(TIMESTAMP_TRUNC(`datetime_col`, WEEK(MONDAY)) + INTERVAL 6 DAY AS TIMESTAMP) @@ -22,5 +29,48 @@ SELECT 604800000000 ) ) AS INT64) + 1 - END AS `non_fixed_freq_weekly` + END AS `non_fixed_freq_weekly`, + CASE + WHEN ( + EXTRACT(YEAR FROM `datetime_col`) * 12 + EXTRACT(MONTH FROM `datetime_col`) - 1 + ) = ( + EXTRACT(YEAR FROM `timestamp_col`) * 12 + EXTRACT(MONTH FROM `timestamp_col`) - 1 + ) + THEN 0 + ELSE CAST(FLOOR( + IEEE_DIVIDE( + ( + EXTRACT(YEAR FROM `datetime_col`) * 12 + EXTRACT(MONTH FROM `datetime_col`) - 1 + ) - ( + EXTRACT(YEAR FROM `timestamp_col`) * 12 + EXTRACT(MONTH FROM `timestamp_col`) - 1 + ) - 1, + 1 + ) + ) AS INT64) + 1 + END AS `non_fixed_freq_monthly`, + CASE + WHEN ( + EXTRACT(YEAR FROM `datetime_col`) * 4 + EXTRACT(QUARTER FROM `datetime_col`) - 1 + ) = ( + EXTRACT(YEAR FROM `timestamp_col`) * 4 + EXTRACT(QUARTER FROM `timestamp_col`) - 1 + ) + THEN 0 + ELSE CAST(FLOOR( + IEEE_DIVIDE( + ( + EXTRACT(YEAR FROM `datetime_col`) * 4 + EXTRACT(QUARTER FROM `datetime_col`) - 1 + ) - ( + EXTRACT(YEAR FROM `timestamp_col`) * 4 + EXTRACT(QUARTER FROM `timestamp_col`) - 1 + ) - 1, + 1 + ) + ) AS INT64) + 1 + END AS `non_fixed_freq_quarterly`, + CASE + WHEN EXTRACT(YEAR FROM `datetime_col`) = EXTRACT(YEAR FROM `timestamp_col`) + THEN 0 + ELSE CAST(FLOOR( + IEEE_DIVIDE(EXTRACT(YEAR FROM `datetime_col`) - EXTRACT(YEAR FROM `timestamp_col`) - 1, 1) + ) AS INT64) + 1 + END AS `non_fixed_freq_yearly` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql index 3d0b8213b6..5d98e445cc 100644 --- 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql @@ -2,5 +2,6 @@ SELECT CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS DATETIME) AS `int64_col`, SAFE_CAST(`string_col` AS DATETIME), CAST(TIMESTAMP_MICROS(CAST(TRUNC(`float64_col` * 0.001) AS INT64)) AS DATETIME) AS `float64_col`, - SAFE_CAST(`timestamp_col` AS DATETIME) + SAFE_CAST(`timestamp_col` AS DATETIME), + CAST(PARSE_TIMESTAMP('%Y-%m-%d', `string_col`, 'UTC') AS DATETIME) AS `string_col_fmt` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql index 1e8910fad7..e0fb530cc6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql @@ -5,5 +5,6 @@ SELECT CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 1000) AS INT64)) AS TIMESTAMP) AS `int64_col_ms`, CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col`) AS INT64)) AS TIMESTAMP) AS `int64_col_us`, CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS TIMESTAMP) AS `int64_col_ns`, - TIMESTAMP(`datetime_col`) AS `datetime_col` + TIMESTAMP(`datetime_col`) AS `datetime_col`, + PARSE_TIMESTAMP('%Y-%m-%d', `string_col`, 'UTC') AS `string_col_fmt` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_value_array/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_value_array/out.sql new file mode 100644 index 0000000000..8250c02934 --- 
/dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_json_value_array/out.sql @@ -0,0 +1,3 @@ +SELECT + JSON_VALUE_ARRAY(`json_col`, '$') AS `json_col` +FROM `bigframes-dev`.`sqlglot_test`.`json_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql index 111684acd0..3aa06fe16e 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` + `int64_col` AS `int_add_int`, `int64_col` + 1 AS `int_add_1`, + NULL AS `int_add_null`, `int64_col` + CAST(`bool_col` AS INT64) AS `int_add_bool`, CAST(`bool_col` AS INT64) + `int64_col` AS `bool_add_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql index 1b8166684c..3f5ff73326 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql @@ -6,6 +6,7 @@ SELECT IEEE_DIVIDE(`int64_col`, `int64_col`) AS `int_div_int`, IEEE_DIVIDE(`int64_col`, 1) AS `int_div_1`, IEEE_DIVIDE(`int64_col`, 0.0) AS `int_div_0`, + IEEE_DIVIDE(`int64_col`, NULL) AS `int_div_null`, IEEE_DIVIDE(`int64_col`, `float64_col`) AS `int_div_float`, IEEE_DIVIDE(`float64_col`, `int64_col`) AS `float_div_int`, IEEE_DIVIDE(`float64_col`, 0.0) AS `float_div_0`, diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql new file mode 100644 index 0000000000..c7fa74e48f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql @@ -0,0 +1,47 @@ +SELECT + `rowindex`, + `int64_col`, + `bool_col`, + `float64_col`, + CASE + WHEN `int64_col` = CAST(0 AS INT64) + THEN CAST(0 AS INT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, `int64_col`)) AS INT64) + END AS `int_div_int`, + CASE + WHEN 1 = CAST(0 AS INT64) + THEN CAST(0 AS INT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, 1)) AS INT64) + END AS `int_div_1`, + CASE + WHEN 0.0 = CAST(0 AS INT64) + THEN CAST('Infinity' AS FLOAT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, 0.0)) AS INT64) + END AS `int_div_0`, + NULL AS `int_div_null`, + CASE + WHEN `float64_col` = CAST(0 AS INT64) + THEN CAST('Infinity' AS FLOAT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, `float64_col`)) AS INT64) + END AS `int_div_float`, + CASE + WHEN `int64_col` = CAST(0 AS INT64) + THEN CAST('Infinity' AS FLOAT64) * `float64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`float64_col`, `int64_col`)) AS INT64) + END AS `float_div_int`, + CASE + WHEN 0.0 = CAST(0 AS INT64) + THEN CAST('Infinity' AS FLOAT64) * `float64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`float64_col`, 0.0)) AS INT64) + END AS `float_div_0`, + CASE + WHEN CAST(`bool_col` AS INT64) = CAST(0 AS INT64) + THEN CAST(0 AS INT64) * `int64_col` + ELSE CAST(FLOOR(IEEE_DIVIDE(`int64_col`, CAST(`bool_col` AS INT64))) AS INT64) + END AS `int_div_bool`, + CASE + WHEN `int64_col` = CAST(0 AS INT64) + THEN CAST(0 AS INT64) * CAST(`bool_col` AS INT64) + ELSE CAST(FLOOR(IEEE_DIVIDE(CAST(`bool_col` AS INT64), `int64_col`)) AS INT64) + END AS `bool_div_int` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS 
`bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql index 57aff08158..ebe8d571d6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_numeric/out.sql @@ -4,6 +4,7 @@ SELECT `bool_col`, `int64_col` * `int64_col` AS `int_mul_int`, `int64_col` * 1 AS `int_mul_1`, + NULL AS `int_mul_null`, `int64_col` * CAST(`bool_col` AS INT64) AS `int_mul_bool`, CAST(`bool_col` AS INT64) * `int64_col` AS `bool_mul_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql index e1ca93d136..c1d0350a66 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_numeric/out.sql @@ -2,8 +2,9 @@ SELECT `rowindex`, `int64_col`, `bool_col`, - `int64_col` - `int64_col` AS `int_add_int`, - `int64_col` - 1 AS `int_add_1`, - `int64_col` - CAST(`bool_col` AS INT64) AS `int_add_bool`, - CAST(`bool_col` AS INT64) - `int64_col` AS `bool_add_int` + `int64_col` - `int64_col` AS `int_sub_int`, + `int64_col` - 1 AS `int_sub_1`, + NULL AS `int_sub_null`, + `int64_col` - CAST(`bool_col` AS INT64) AS `int_sub_bool`, + CAST(`bool_col` AS INT64) - `int64_col` AS `bool_sub_int` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql index df4dc689f7..b10f4b29e6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql @@ -1,3 +1,18 @@ SELECT - SUBSTRING(`string_col`, 2, 2) AS `string_col` + SUBSTRING(`string_col`, 2, 2) AS `1_3`, + SUBSTRING(`string_col`, 1, 3) AS `none_3`, + SUBSTRING(`string_col`, 2) AS `1_none`, + SUBSTRING(`string_col`, -3) AS `m3_none`, + SUBSTRING(`string_col`, 1, GREATEST(0, LENGTH(`string_col`) + -3)) AS `none_m3`, + SUBSTRING( + `string_col`, + GREATEST(1, LENGTH(`string_col`) + -4), + GREATEST(0, LENGTH(`string_col`) + -3) - GREATEST(0, LENGTH(`string_col`) + -5) + ) AS `m5_m3`, + SUBSTRING(`string_col`, 2, GREATEST(0, LENGTH(`string_col`) + -4)) AS `1_m3`, + SUBSTRING( + `string_col`, + GREATEST(1, LENGTH(`string_col`) + -2), + 5 - GREATEST(0, LENGTH(`string_col`) + -3) + ) AS `m3_5` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py index 67c8bb0e5c..4075e1c278 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py @@ -1,13 +1,13 @@ # Copyright 2025 Google LLC # -# Licensed under the Apache License, Version 2.0 (the "License"); +# Licensed under the Apache License, Version 2.0 (the \"License\"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, +# distributed under the License is distributed on an \"AS IS\" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. @@ -16,7 +16,6 @@ from bigframes import operations as ops from bigframes.core import expression -from bigframes.operations._op_converters import convert_index, convert_slice import bigframes.operations.aggregations as agg_ops import bigframes.pandas as bpd from bigframes.testing import utils @@ -34,13 +33,18 @@ def test_array_to_string(repeated_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") -def test_array_index(repeated_types_df: bpd.DataFrame, snapshot): - col_name = "string_list_col" - bf_df = repeated_types_df[[col_name]] +def test_array_index(scalar_types_df: bpd.DataFrame, snapshot): + ops_map = { + "string_index": ops.ArrayIndexOp(index=1).as_expr("string_col"), + "array_index": expression.OpExpression( + ops.ArrayIndexOp(index=1), + (ops.ToArrayOp().as_expr("int64_col", "int64_too"),), + ), + } + sql = utils._apply_ops_to_sql( - bf_df, [convert_index(1).as_expr(col_name)], [col_name] + scalar_types_df, list(ops_map.values()), list(ops_map.keys()) ) - snapshot.assert_match(sql, "out.sql") @@ -50,6 +54,9 @@ def test_array_reduce_op(repeated_types_df: bpd.DataFrame, snapshot): "std_float": ops.ArrayReduceOp(agg_ops.StdOp()).as_expr("float_list_col"), "count_str": ops.ArrayReduceOp(agg_ops.CountOp()).as_expr("string_list_col"), "any_bool": ops.ArrayReduceOp(agg_ops.AnyOp()).as_expr("bool_list_col"), + "array_agg_str": ops.ArrayReduceOp(agg_ops.ArrayAggOp()).as_expr( + "string_list_col" + ), } sql = utils._apply_ops_to_sql( @@ -58,23 +65,23 @@ def 
test_array_reduce_op(repeated_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") -def test_array_slice_with_only_start(repeated_types_df: bpd.DataFrame, snapshot): - col_name = "string_list_col" - bf_df = repeated_types_df[[col_name]] - sql = utils._apply_ops_to_sql( - bf_df, [convert_slice(slice(1, None)).as_expr(col_name)], [col_name] - ) - - snapshot.assert_match(sql, "out.sql") - +def test_array_slice(scalar_types_df: bpd.DataFrame, snapshot): + array_expr = ops.ToArrayOp().as_expr("int64_col", "int64_too") + ops_map = { + "string_slice": ops.ArraySliceOp(start=1, stop=5).as_expr("string_col"), + "slice_only_start": expression.OpExpression( + ops.ArraySliceOp(start=1, stop=None), + (array_expr,), + ), + "slice_start_stop": expression.OpExpression( + ops.ArraySliceOp(start=1, stop=5), + (array_expr,), + ), + } -def test_array_slice_with_start_and_stop(repeated_types_df: bpd.DataFrame, snapshot): - col_name = "string_list_col" - bf_df = repeated_types_df[[col_name]] sql = utils._apply_ops_to_sql( - bf_df, [convert_slice(slice(1, 5)).as_expr(col_name)], [col_name] + scalar_types_df, list(ops_map.values()), list(ops_map.keys()) ) - snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py index 80aa22aaac..ac032f46e6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_blob_ops.py @@ -14,7 +14,9 @@ import pytest +from bigframes import operations as ops import bigframes.pandas as bpd +from bigframes.testing import utils pytest.importorskip("pytest_snapshot") @@ -31,6 +33,28 @@ def test_obj_get_access_url(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_obj_get_access_url_with_duration(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "string_col" + bf_df = scalar_types_df[[col_name]] + sql = 
utils._apply_ops_to_sql( + bf_df, + [ops.ObjGetAccessUrl(mode="READ", duration=3600).as_expr(col_name)], + [col_name], + ) + snapshot.assert_match(sql, "out.sql") + + def test_obj_make_ref(scalar_types_df: bpd.DataFrame, snapshot): blob_df = scalar_types_df["string_col"].str.to_blob() snapshot.assert_match(blob_df.to_frame().sql, "out.sql") + + +def test_obj_make_ref_json(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "string_col" + bf_df = scalar_types_df[[col_name]] + sql = utils._apply_ops_to_sql( + bf_df, + [ops.obj_make_ref_json_op.as_expr(col_name)], + [col_name], + ) + snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_bool_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_bool_ops.py index 601fd86e4e..bd51ea905a 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_bool_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_bool_ops.py @@ -26,6 +26,7 @@ def test_and_op(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_and_int"] = bf_df["int64_col"] & bf_df["int64_col"] bf_df["bool_and_bool"] = bf_df["bool_col"] & bf_df["bool_col"] bf_df["bool_and_null"] = bf_df["bool_col"] & pd.NA # type: ignore + bf_df["null_and_bool"] = pd.NA & bf_df["bool_col"] # type: ignore snapshot.assert_match(bf_df.sql, "out.sql") @@ -35,6 +36,7 @@ def test_or_op(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_and_int"] = bf_df["int64_col"] | bf_df["int64_col"] bf_df["bool_and_bool"] = bf_df["bool_col"] | bf_df["bool_col"] bf_df["bool_and_null"] = bf_df["bool_col"] | pd.NA # type: ignore + bf_df["null_and_bool"] = pd.NA | bf_df["bool_col"] # type: ignore snapshot.assert_match(bf_df.sql, "out.sql") @@ -44,4 +46,5 @@ def test_xor_op(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_and_int"] = bf_df["int64_col"] ^ bf_df["int64_col"] bf_df["bool_and_bool"] = bf_df["bool_col"] ^ bf_df["bool_col"] bf_df["bool_and_null"] = bf_df["bool_col"] ^ pd.NA # type: ignore + bf_df["null_and_bool"] = pd.NA 
^ bf_df["bool_col"] # type: ignore snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py index 3c13bc798b..05fa1b5434 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py @@ -44,6 +44,13 @@ def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): values=(None, 123456), match_nulls=False ).as_expr(int_col), "float_in_ints": ops.IsInOp(values=(1, 2, 3, None)).as_expr(float_col), + "mixed_with_null": ops.IsInOp( + values=("1.0", 2, None), match_nulls=True + ).as_expr(int_col), + "bool_in_mixed": ops.IsInOp(values=(1, 2.5)).as_expr(bool_col), + "only_null_match": ops.IsInOp(values=(None,), match_nulls=True).as_expr( + int_col + ), } sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) @@ -62,6 +69,7 @@ def test_eq_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_eq_int"] = bf_df["int64_col"] == bf_df["int64_col"] bf_df["int_eq_1"] = bf_df["int64_col"] == 1 bf_df["int_eq_null"] = bf_df["int64_col"] == pd.NA + bf_df["null_eq_int"] = pd.NA == bf_df["int64_col"] bf_df["int_eq_bool"] = bf_df["int64_col"] == bf_df["bool_col"] bf_df["bool_eq_int"] = bf_df["bool_col"] == bf_df["int64_col"] @@ -74,6 +82,7 @@ def test_gt_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_gt_int"] = bf_df["int64_col"] > bf_df["int64_col"] bf_df["int_gt_1"] = bf_df["int64_col"] > 1 + bf_df["null_gt_int"] = pd.NA > bf_df["int64_col"] bf_df["int_gt_bool"] = bf_df["int64_col"] > bf_df["bool_col"] bf_df["bool_gt_int"] = bf_df["bool_col"] > bf_df["int64_col"] @@ -86,6 +95,7 @@ def test_ge_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_ge_int"] = bf_df["int64_col"] >= bf_df["int64_col"] bf_df["int_ge_1"] = bf_df["int64_col"] >= 1 + bf_df["null_ge_int"] = pd.NA >= bf_df["int64_col"] 
bf_df["int_ge_bool"] = bf_df["int64_col"] >= bf_df["bool_col"] bf_df["bool_ge_int"] = bf_df["bool_col"] >= bf_df["int64_col"] @@ -98,6 +108,7 @@ def test_lt_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_lt_int"] = bf_df["int64_col"] < bf_df["int64_col"] bf_df["int_lt_1"] = bf_df["int64_col"] < 1 + bf_df["null_lt_int"] = pd.NA < bf_df["int64_col"] bf_df["int_lt_bool"] = bf_df["int64_col"] < bf_df["bool_col"] bf_df["bool_lt_int"] = bf_df["bool_col"] < bf_df["int64_col"] @@ -110,6 +121,7 @@ def test_le_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_le_int"] = bf_df["int64_col"] <= bf_df["int64_col"] bf_df["int_le_1"] = bf_df["int64_col"] <= 1 + bf_df["null_le_int"] = pd.NA <= bf_df["int64_col"] bf_df["int_le_bool"] = bf_df["int64_col"] <= bf_df["bool_col"] bf_df["bool_le_int"] = bf_df["bool_col"] <= bf_df["int64_col"] @@ -137,6 +149,7 @@ def test_ne_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_ne_int"] = bf_df["int64_col"] != bf_df["int64_col"] bf_df["int_ne_1"] = bf_df["int64_col"] != 1 bf_df["int_ne_null"] = bf_df["int64_col"] != pd.NA + bf_df["null_ne_int"] = pd.NA != bf_df["int64_col"] bf_df["int_ne_bool"] = bf_df["int64_col"] != bf_df["bool_col"] bf_df["bool_ne_int"] = bf_df["bool_col"] != bf_df["int64_col"] diff --git a/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py index 76966d3c9b..1d6ea99d34 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py @@ -64,9 +64,24 @@ def test_datetime_to_integer_label(scalar_types_df: bpd.DataFrame, snapshot): "fixed_freq": ops.DatetimeToIntegerLabelOp( freq=pd.tseries.offsets.Day(), origin="start", closed="left" # type: ignore ).as_expr("datetime_col", "timestamp_col"), + "origin_epoch": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.Day(), origin="epoch", closed="left" # type: ignore + 
).as_expr("datetime_col", "timestamp_col"), + "origin_start_day": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.Day(), origin="start_day", closed="left" # type: ignore + ).as_expr("datetime_col", "timestamp_col"), "non_fixed_freq_weekly": ops.DatetimeToIntegerLabelOp( freq=pd.tseries.offsets.Week(weekday=6), origin="start", closed="left" # type: ignore ).as_expr("datetime_col", "timestamp_col"), + "non_fixed_freq_monthly": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.MonthEnd(), origin="start", closed="left" # type: ignore + ).as_expr("datetime_col", "timestamp_col"), + "non_fixed_freq_quarterly": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.QuarterEnd(startingMonth=12), origin="start", closed="left" # type: ignore + ).as_expr("datetime_col", "timestamp_col"), + "non_fixed_freq_yearly": ops.DatetimeToIntegerLabelOp( + freq=pd.tseries.offsets.YearEnd(), origin="start", closed="left" # type: ignore + ).as_expr("datetime_col", "timestamp_col"), } sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) @@ -183,6 +198,9 @@ def test_to_datetime(scalar_types_df: bpd.DataFrame, snapshot): col_names = ["int64_col", "string_col", "float64_col", "timestamp_col"] bf_df = scalar_types_df[col_names] ops_map = {col_name: ops.ToDatetimeOp().as_expr(col_name) for col_name in col_names} + ops_map["string_col_fmt"] = ops.ToDatetimeOp(format="%Y-%m-%d").as_expr( + "string_col" + ) sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) snapshot.assert_match(sql, "out.sql") @@ -198,6 +216,7 @@ def test_to_timestamp(scalar_types_df: bpd.DataFrame, snapshot): "int64_col_us": ops.ToTimestampOp(unit="us").as_expr("int64_col"), "int64_col_ns": ops.ToTimestampOp(unit="ns").as_expr("int64_col"), "datetime_col": ops.ToTimestampOp().as_expr("datetime_col"), + "string_col_fmt": ops.ToTimestampOp(format="%Y-%m-%d").as_expr("string_col"), } sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), 
list(ops_map.keys())) diff --git a/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py index 1c5894fc96..fa6d6d546f 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_json_ops.py @@ -95,6 +95,16 @@ def test_json_value(json_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_json_value_array(json_types_df: bpd.DataFrame, snapshot): + col_name = "json_col" + bf_df = json_types_df[[col_name]] + sql = utils._apply_ops_to_sql( + bf_df, [ops.JSONValueArray(json_path="$").as_expr(col_name)], [col_name] + ) + + snapshot.assert_match(sql, "out.sql") + + def test_parse_json(scalar_types_df: bpd.DataFrame, snapshot): col_name = "string_col" bf_df = scalar_types_df[[col_name]] diff --git a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py index f0237159bc..17c2ff98bc 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py @@ -282,6 +282,7 @@ def test_add_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_add_int"] = bf_df["int64_col"] + bf_df["int64_col"] bf_df["int_add_1"] = bf_df["int64_col"] + 1 + bf_df["int_add_null"] = bf_df["int64_col"] + pd.NA bf_df["int_add_bool"] = bf_df["int64_col"] + bf_df["bool_col"] bf_df["bool_add_int"] = bf_df["bool_col"] + bf_df["int64_col"] @@ -323,6 +324,7 @@ def test_div_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_div_int"] = bf_df["int64_col"] / bf_df["int64_col"] bf_df["int_div_1"] = bf_df["int64_col"] / 1 bf_df["int_div_0"] = bf_df["int64_col"] / 0.0 + bf_df["int_div_null"] = bf_df["int64_col"] / pd.NA bf_df["int_div_float"] = bf_df["int64_col"] / bf_df["float64_col"] bf_df["float_div_int"] = bf_df["float64_col"] / bf_df["int64_col"] @@ -363,6 +365,7 @@ def 
test_floordiv_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_div_int"] = bf_df["int64_col"] // bf_df["int64_col"] bf_df["int_div_1"] = bf_df["int64_col"] // 1 bf_df["int_div_0"] = bf_df["int64_col"] // 0.0 + bf_df["int_div_null"] = bf_df["int64_col"] // pd.NA bf_df["int_div_float"] = bf_df["int64_col"] // bf_df["float64_col"] bf_df["float_div_int"] = bf_df["float64_col"] // bf_df["int64_col"] @@ -371,6 +374,8 @@ def test_floordiv_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_div_bool"] = bf_df["int64_col"] // bf_df["bool_col"] bf_df["bool_div_int"] = bf_df["bool_col"] // bf_df["int64_col"] + snapshot.assert_match(bf_df.sql, "out.sql") + def test_floordiv_timedelta(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["timestamp_col", "date_col"]] @@ -401,6 +406,7 @@ def test_mul_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_mul_int"] = bf_df["int64_col"] * bf_df["int64_col"] bf_df["int_mul_1"] = bf_df["int64_col"] * 1 + bf_df["int_mul_null"] = bf_df["int64_col"] * pd.NA bf_df["int_mul_bool"] = bf_df["int64_col"] * bf_df["bool_col"] bf_df["bool_mul_int"] = bf_df["bool_col"] * bf_df["int64_col"] @@ -437,11 +443,12 @@ def test_mod_numeric(scalar_types_df: bpd.DataFrame, snapshot): def test_sub_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col"]] - bf_df["int_add_int"] = bf_df["int64_col"] - bf_df["int64_col"] - bf_df["int_add_1"] = bf_df["int64_col"] - 1 + bf_df["int_sub_int"] = bf_df["int64_col"] - bf_df["int64_col"] + bf_df["int_sub_1"] = bf_df["int64_col"] - 1 + bf_df["int_sub_null"] = bf_df["int64_col"] - pd.NA - bf_df["int_add_bool"] = bf_df["int64_col"] - bf_df["bool_col"] - bf_df["bool_add_int"] = bf_df["bool_col"] - bf_df["int64_col"] + bf_df["int_sub_bool"] = bf_df["int64_col"] - bf_df["bool_col"] + bf_df["bool_sub_int"] = bf_df["bool_col"] - bf_df["int64_col"] snapshot.assert_match(bf_df.sql, "out.sql") diff --git 
a/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py index fff2cc06df..bb0e413486 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py @@ -219,9 +219,17 @@ def test_str_pad(scalar_types_df: bpd.DataFrame, snapshot): def test_str_slice(scalar_types_df: bpd.DataFrame, snapshot): col_name = "string_col" bf_df = scalar_types_df[[col_name]] - sql = utils._apply_ops_to_sql( - bf_df, [ops.StrSliceOp(1, 3).as_expr(col_name)], [col_name] - ) + ops_map = { + "1_3": ops.StrSliceOp(1, 3).as_expr(col_name), + "none_3": ops.StrSliceOp(None, 3).as_expr(col_name), + "1_none": ops.StrSliceOp(1, None).as_expr(col_name), + "m3_none": ops.StrSliceOp(-3, None).as_expr(col_name), + "none_m3": ops.StrSliceOp(None, -3).as_expr(col_name), + "m5_m3": ops.StrSliceOp(-5, -3).as_expr(col_name), + "1_m3": ops.StrSliceOp(1, -3).as_expr(col_name), + "m3_5": ops.StrSliceOp(-3, 5).as_expr(col_name), + } + sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) snapshot.assert_match(sql, "out.sql") From 96597f0b067310f32b18265fddccba9ed0260fcd Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 24 Mar 2026 10:29:54 -0700 Subject: [PATCH 13/25] perf: Make executor data uploads async internally (#2529) --- bigframes/session/bq_caching_executor.py | 31 ++++++++++++++---------- bigframes/session/loader.py | 13 +++++++++- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index cf275154ce..c5d6fe3e5f 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -14,6 +14,7 @@ from __future__ import annotations +import concurrent.futures import math import threading from typing import Literal, Mapping, Optional, Sequence, Tuple @@ -514,13 +515,29 @@ def 
_substitute_large_local_sources(self, original_root: nodes.BigFrameNode): Replace large local sources with the uploaded version of those datasources. """ # Step 1: Upload all previously un-uploaded data + needs_upload = [] for leaf in original_root.unique_nodes(): if isinstance(leaf, nodes.ReadLocalNode): if ( leaf.local_data_source.metadata.total_bytes > bigframes.constants.MAX_INLINE_BYTES ): - self._upload_local_data(leaf.local_data_source) + needs_upload.append(leaf.local_data_source) + + futures: dict[concurrent.futures.Future, local_data.ManagedArrowTable] = dict() + for local_source in needs_upload: + future = self.loader.read_data_async( + local_source, bigframes.core.guid.generate_guid() + ) + futures[future] = local_source + try: + for future in concurrent.futures.as_completed(futures.keys()): + self.cache.cache_remote_replacement(futures[future], future.result()) + except Exception as e: + # cancel all futures + for future in futures: + future.cancel() + raise e # Step 2: Replace local scans with remote scans def map_local_scans(node: nodes.BigFrameNode): @@ -550,18 +567,6 @@ def map_local_scans(node: nodes.BigFrameNode): return original_root.bottom_up(map_local_scans) - def _upload_local_data(self, local_table: local_data.ManagedArrowTable): - if self.cache.get_uploaded_local_data(local_table) is not None: - return - # Lock prevents concurrent repeated work, but slows things down. 
- # Might be better as a queue and a worker thread - with self._upload_lock: - if self.cache.get_uploaded_local_data(local_table) is None: - uploaded = self.loader.load_data_or_write_data( - local_table, bigframes.core.guid.generate_guid() - ) - self.cache.cache_remote_replacement(local_table, uploaded) - def _execute_plan_gbq( self, plan: nodes.BigFrameNode, diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 0944c0dab6..7b5d1bcaf1 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -300,6 +300,17 @@ def __init__( self._session = session self._clock = session_time.BigQuerySyncedClock(bqclient) self._clock.sync() + self._threadpool = concurrent.futures.ThreadPoolExecutor( + max_workers=1, thread_name_prefix="bigframes-loader" + ) + + def read_data_async( + self, local_data: local_data.ManagedArrowTable, offsets_col: str + ) -> concurrent.futures.Future[bq_data.BigqueryDataSource]: + future = self._threadpool.submit( + self._load_data_or_write_data, local_data, offsets_col + ) + return future def read_pandas( self, @@ -350,7 +361,7 @@ def read_managed_data( session=self._session, ) - def load_data_or_write_data( + def _load_data_or_write_data( self, data: local_data.ManagedArrowTable, offsets_col: str, From b9524284ad3b457b15598f546bac04c76b3e27b8 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 25 Mar 2026 16:36:38 -0700 Subject: [PATCH 14/25] fix: Respect remote function config changes even if logic unchanged (#2512) --- .../compile/ibis_compiler/ibis_compiler.py | 1 + .../ibis_compiler/scalar_op_registry.py | 18 +- bigframes/core/compile/sqlglot/compiler.py | 1 + .../compile/sqlglot/expressions/array_ops.py | 22 + .../core/compile/sqlglot/sql/__init__.py | 2 - bigframes/core/compile/sqlglot/sql/base.py | 23 - bigframes/core/rewrite/__init__.py | 2 + bigframes/core/rewrite/udfs.py | 87 +++ bigframes/core/sql/__init__.py | 29 + bigframes/dataframe.py | 5 +- bigframes/functions/_function_client.py | 401 
+++++++------- bigframes/functions/_function_session.py | 94 +--- bigframes/functions/_utils.py | 90 ++-- bigframes/functions/function.py | 89 +--- bigframes/functions/function_template.py | 32 +- bigframes/functions/function_typing.py | 2 +- bigframes/functions/udf_def.py | 501 +++++++++++++++--- bigframes/operations/__init__.py | 2 + bigframes/operations/array_ops.py | 14 + bigframes/operations/remote_function_ops.py | 6 +- bigframes/series.py | 2 - bigframes/testing/utils.py | 2 +- setup.py | 1 + .../large/functions/test_remote_function.py | 72 ++- tests/system/large/ml/test_ensemble.py | 2 +- tests/system/large/ml/test_llm.py | 2 +- .../small/functions/test_remote_function.py | 42 +- .../sqlglot/expressions/test_generic_ops.py | 54 +- tests/unit/functions/test_remote_function.py | 54 -- .../functions/test_remote_function_utils.py | 78 +-- 30 files changed, 978 insertions(+), 752 deletions(-) create mode 100644 bigframes/core/rewrite/udfs.py diff --git a/bigframes/core/compile/ibis_compiler/ibis_compiler.py b/bigframes/core/compile/ibis_compiler/ibis_compiler.py index 8d40a9eb74..3802a57e02 100644 --- a/bigframes/core/compile/ibis_compiler/ibis_compiler.py +++ b/bigframes/core/compile/ibis_compiler/ibis_compiler.py @@ -88,6 +88,7 @@ def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrites.rewrite_slice) node = nodes.bottom_up(node, rewrites.rewrite_timedelta_expressions) node = nodes.bottom_up(node, rewrites.rewrite_range_rolling) + node = nodes.bottom_up(node, rewrites.lower_udfs) return node diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 5bb278e882..6a697a8657 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1037,7 +1037,8 @@ def timedelta_floor_op_impl(x: ibis_types.NumericValue): @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, 
pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + assert not udf_sig.is_virtual # should have been devirtualized in lowering pass + ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) @ibis_udf.scalar.builtin( name=str(op.function_def.routine_ref), signature=ibis_py_sig @@ -1056,7 +1057,8 @@ def binary_remote_function_op_impl( x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp ): udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + assert not udf_sig.is_virtual # should have been devirtualized in lowering pass + ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) @ibis_udf.scalar.builtin( name=str(op.function_def.routine_ref), signature=ibis_py_sig @@ -1073,8 +1075,9 @@ def nary_remote_function_op_impl( *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp ): udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) - arg_names = tuple(arg.name for arg in udf_sig.input_types) + assert not udf_sig.is_virtual # should have been devirtualized in lowering pass + ibis_py_sig = (tuple(arg.py_type for arg in udf_sig.inputs), udf_sig.output.py_type) + arg_names = tuple(arg.name for arg in udf_sig.inputs) @ibis_udf.scalar.builtin( name=str(op.function_def.routine_ref), @@ -1153,6 +1156,13 @@ def array_reduce_op_impl(x: ibis_types.Value, op: ops.ArrayReduceOp): ) +@scalar_op_compiler.register_unary_op(ops.ArrayMapOp, pass_op=True) +def array_map_op_impl(x: ibis_types.Value, op: ops.ArrayMapOp): + return typing.cast(ibis_types.ArrayValue, x).map( + lambda arr_vals: scalar_op_compiler.compile_row_op(op.map_op, (arr_vals,)) + ) + + # JSON Ops @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) def json_set_op_impl(x: ibis_types.Value, y: 
ibis_types.Value, op: ops.JSONSet): diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index a86a192a9e..ce9ed6ce37 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -369,4 +369,5 @@ def compile_aggregate( def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) node = nodes.bottom_up(node, rewrite.rewrite_range_rolling) + node = nodes.bottom_up(node, rewrite.lower_udfs) return node diff --git a/bigframes/core/compile/sqlglot/expressions/array_ops.py b/bigframes/core/compile/sqlglot/expressions/array_ops.py index b2c8c1c568..0ae5f3e846 100644 --- a/bigframes/core/compile/sqlglot/expressions/array_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/array_ops.py @@ -73,6 +73,28 @@ def _(expr: TypedExpr, op: ops.ArrayReduceOp) -> sge.Expression: ) +@register_unary_op(ops.ArrayMapOp, pass_op=True) +def _(expr: TypedExpr, op: ops.ArrayMapOp) -> sge.Expression: + sub_expr = sg.to_identifier("bf_arr_map_uid") + sub_type = dtypes.get_array_inner_type(expr.dtype) + + # TODO: Expression should be provided instead of invoking compiler manually + map_expr = expression_compiler.expression_compiler.compile_row_op( + op.map_op, (TypedExpr(sub_expr, sub_type),) + ) + + return sge.array( + sge.select(map_expr) + .from_( + sge.Unnest( + expressions=[expr.expr], + alias=sge.TableAlias(columns=[sub_expr]), + ) + ) + .subquery() + ) + + @register_unary_op(ops.ArraySliceOp, pass_op=True) def _(expr: TypedExpr, op: ops.ArraySliceOp) -> sge.Expression: if expr.dtype == dtypes.STRING_DTYPE: diff --git a/bigframes/core/compile/sqlglot/sql/__init__.py b/bigframes/core/compile/sqlglot/sql/__init__.py index 047fb73d30..751c3cfc3a 100644 --- a/bigframes/core/compile/sqlglot/sql/__init__.py +++ b/bigframes/core/compile/sqlglot/sql/__init__.py @@ -15,7 +15,6 @@ from bigframes.core.compile.sqlglot.sql.base import ( cast, - escape_chars, 
identifier, is_null_literal, literal, @@ -28,7 +27,6 @@ __all__ = [ # From base.py "cast", - "escape_chars", "identifier", "is_null_literal", "literal", diff --git a/bigframes/core/compile/sqlglot/sql/base.py b/bigframes/core/compile/sqlglot/sql/base.py index 6e888fdf5e..d287b2cac9 100644 --- a/bigframes/core/compile/sqlglot/sql/base.py +++ b/bigframes/core/compile/sqlglot/sql/base.py @@ -136,29 +136,6 @@ def table(table: bigquery.TableReference) -> sge.Table: ) -def escape_chars(value: str): - """Escapes all special characters""" - # TODO: Reuse literal's escaping logic instead of re-implementing it here. - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals - trans_table = str.maketrans( - { - "\a": r"\a", - "\b": r"\b", - "\f": r"\f", - "\n": r"\n", - "\r": r"\r", - "\t": r"\t", - "\v": r"\v", - "\\": r"\\", - "?": r"\?", - '"': r"\"", - "'": r"\'", - "`": r"\`", - } - ) - return value.translate(trans_table) - - def is_null_literal(expr: sge.Expression) -> bool: """Checks if the given expression is a NULL literal.""" if isinstance(expr, sge.Null): diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index 5279418f5f..6b00e9b2f1 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -27,6 +27,7 @@ from bigframes.core.rewrite.select_pullup import defer_selection from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions +from bigframes.core.rewrite.udfs import lower_udfs from bigframes.core.rewrite.windows import ( pull_out_window_order, rewrite_range_rolling, @@ -53,4 +54,5 @@ "pull_out_window_order", "defer_selection", "simplify_complex_windows", + "lower_udfs", ] diff --git a/bigframes/core/rewrite/udfs.py b/bigframes/core/rewrite/udfs.py new file mode 100644 index 0000000000..f9aa330247 --- /dev/null +++ b/bigframes/core/rewrite/udfs.py @@ -0,0 
+1,87 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import dataclasses + +from bigframes.core import bigframe_node, expression +from bigframes.core.rewrite import op_lowering +import bigframes.functions.udf_def as udf_def +import bigframes.operations as ops + + +@dataclasses.dataclass +class LowerRemoteFunctionRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return ops.RemoteFunctionOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, ops.RemoteFunctionOp) + func_def = expr.op.function_def + devirtualized_expr = ops.RemoteFunctionOp( + func_def.with_devirtualize(), + apply_on_null=expr.op.apply_on_null, + ).as_expr(*expr.children) + if isinstance(func_def.signature.output, udf_def.VirtualListTypeV1): + return func_def.signature.output.out_expr(devirtualized_expr) + else: + return devirtualized_expr + + +@dataclasses.dataclass +class LowerBinaryRemoteFunctionRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return ops.BinaryRemoteFunctionOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, ops.BinaryRemoteFunctionOp) + func_def = expr.op.function_def + devirtualized_expr = ops.BinaryRemoteFunctionOp( + func_def.with_devirtualize(), + ).as_expr(*expr.children) + if isinstance(func_def.signature.output, 
udf_def.VirtualListTypeV1): + return func_def.signature.output.out_expr(devirtualized_expr) + else: + return devirtualized_expr + + +@dataclasses.dataclass +class LowerNaryRemoteFunctionRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return ops.NaryRemoteFunctionOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, ops.NaryRemoteFunctionOp) + func_def = expr.op.function_def + devirtualized_expr = ops.NaryRemoteFunctionOp( + func_def.with_devirtualize(), + ).as_expr(*expr.children) + if isinstance(func_def.signature.output, udf_def.VirtualListTypeV1): + return func_def.signature.output.out_expr(devirtualized_expr) + else: + return devirtualized_expr + + +UDF_LOWERING_RULES = ( + LowerRemoteFunctionRule(), + LowerBinaryRemoteFunctionRule(), + LowerNaryRemoteFunctionRule(), +) + + +def lower_udfs(root: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: + return op_lowering.lower_ops(root, rules=UDF_LOWERING_RULES) diff --git a/bigframes/core/sql/__init__.py b/bigframes/core/sql/__init__.py index 8c9a093802..69a74b15ce 100644 --- a/bigframes/core/sql/__init__.py +++ b/bigframes/core/sql/__init__.py @@ -48,6 +48,35 @@ to_wkt = dumps +def identifier(name: str) -> str: + if len(name) > 256: + raise ValueError("Identifier must be less than 256 characters") + return f"`{escape_chars(name)}`" + + +def escape_chars(value: str): + """Escapes all special characters""" + # TODO: Reuse literal's escaping logic instead of re-implementing it here. 
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + trans_table = str.maketrans( + { + "\a": r"\a", + "\b": r"\b", + "\f": r"\f", + "\n": r"\n", + "\r": r"\r", + "\t": r"\t", + "\v": r"\v", + "\\": r"\\", + "?": r"\?", + '"': r"\"", + "'": r"\'", + "`": r"\`", + } + ) + return value.translate(trans_table) + + def multi_literal(*values: Any): literal_strings = [sql.to_sql(sql.literal(i)) for i in values] return "(" + ", ".join(literal_strings) + ")" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 25cedda8f4..08c2c85e64 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4748,7 +4748,9 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # compatible with the data types of the input params. # 3. The order of the columns in the dataframe must correspond # to the order of the input params in the function. - udf_input_dtypes = func.udf_def.signature.bf_input_types + udf_input_dtypes = tuple( + arg.bf_type for arg in func.udf_def.signature.inputs + ) if not args and len(udf_input_dtypes) != len(self.columns): raise ValueError( f"Parameter count mismatch: BigFrames BigQuery function" @@ -4793,7 +4795,6 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) result_series.name = None - result_series = func._post_process_series(result_series) return result_series # At this point column-wise or element-wise bigquery function operation will diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index be9ff0956e..4b368f48cc 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -19,12 +19,13 @@ import logging import os import random +import re import shutil import string import tempfile import textwrap import types -from typing import Any, cast, Optional, Sequence, Tuple, TYPE_CHECKING +from typing import Any, cast, Optional, Sequence, TYPE_CHECKING import warnings 
import requests @@ -32,6 +33,7 @@ import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting import bigframes.functions.function_template as bff_template +import bigframes.functions.udf_def as udf_def if TYPE_CHECKING: from bigframes.session import Session @@ -40,7 +42,12 @@ import google.api_core.retry from google.cloud import bigquery, functions_v2 -from . import _utils +from bigframes.functions import _utils +from bigframes.functions._utils import ( + _BIGFRAMES_FUNCTION_PREFIX, + _BQ_FUNCTION_NAME_SEPERATOR, + _GCF_FUNCTION_NAME_SEPERATOR, +) logger = logging.getLogger(__name__) @@ -162,13 +169,8 @@ def _format_function_options(self, function_options: dict) -> str: def create_bq_remote_function( self, - input_args: Sequence[str], - input_types: Sequence[str], - output_type: str, - endpoint: str, - bq_function_name: str, - max_batching_rows: int, - metadata: str, + name: str, + udf_def: udf_def.RemoteFunctionConfig, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" @@ -176,30 +178,30 @@ def create_bq_remote_function( # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 - bq_function_args = [] - bq_function_return_type = output_type - - # We are expecting the input type annotations to be 1:1 with the input args - for name, type_ in zip(input_args, input_types): - bq_function_args.append(f"{name} {type_}") remote_function_options = { - "endpoint": endpoint, - "max_batching_rows": max_batching_rows, + "endpoint": udf_def.endpoint, + "max_batching_rows": udf_def.max_batching_rows, } - if metadata: + if udf_def.bq_metadata: # We are using the description field to store this structured # bigframes specific metadata for the lack of a better option - remote_function_options["description"] = metadata + remote_function_options["description"] = udf_def.bq_metadata 
remote_function_options_str = self._format_function_options( remote_function_options ) + import bigframes.core.sql + import bigframes.core.utils + + # removes anything that isn't letter, number or underscore + _validate_routine_name(name) + bq_function_name_escaped = bigframes.core.sql.identifier(name) create_function_ddl = f""" - CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) - RETURNS {bq_function_return_type} + CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name_escaped}({udf_def.signature.to_sql_input_signature()}) + RETURNS {udf_def.signature.with_devirtualize().output.sql_type} REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` OPTIONS ({remote_function_options_str})""" @@ -227,6 +229,7 @@ def provision_bq_managed_function( # TODO(b/406283812): Expose the capability to pass down # capture_references=True in the public udf API. + # TODO(b/495508827): Include all config in the value hash. if ( capture_references and (python_version := _utils.get_python_version()) @@ -260,7 +263,10 @@ def provision_bq_managed_function( # Augment user package requirements with any internal package # requirements. packages = _utils.get_updated_package_requirements( - packages, is_row_processor, capture_references, ignore_package_version=True + packages or [], + is_row_processor, + capture_references, + ignore_package_version=True, ) if packages: managed_function_options["packages"] = packages @@ -268,14 +274,15 @@ def provision_bq_managed_function( managed_function_options ) - session_id = None if name else self._session.session_id bq_function_name = name if not bq_function_name: # Compute a unique hash representing the user code. 
function_hash = _utils.get_hash(func, packages) - bq_function_name = _utils.get_bigframes_function_name( + bq_function_name = _utils.get_managed_function_name( function_hash, - session_id, + # session-scope in absensce of name from user + # name indicates permanent allocation + None if name else self._session.session_id, ) persistent_func_id = ( @@ -337,7 +344,7 @@ def get_remote_function_fully_qualilfied_name(self, name): "Get the fully qualilfied name for a BQ remote function." return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" - def get_cloud_function_endpoint(self, name): + def get_cloud_function_endpoint(self, name) -> str | None: """Get the http endpoint of a cloud function if it exists.""" fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) try: @@ -351,29 +358,24 @@ def get_cloud_function_endpoint(self, name): def generate_cloud_function_code( self, - def_, + code_def: udf_def.CodeDef, directory, *, - input_types: Tuple[str], - output_type: str, - package_requirements=None, - is_row_processor=False, + udf_signature: udf_def.UdfSignature, ): """Generate the cloud function code for a given user defined function.""" # requirements.txt - if package_requirements: + if code_def.package_requirements: requirements_txt = os.path.join(directory, "requirements.txt") with open(requirements_txt, "w") as f: - f.write("\n".join(package_requirements)) + f.write("\n".join(code_def.package_requirements)) # main.py entry_point = bff_template.generate_cloud_function_main_code( - def_, + code_def, directory, - input_types=input_types, - output_type=output_type, - is_row_processor=is_row_processor, + udf_signature=udf_signature, ) return entry_point @@ -393,35 +395,19 @@ def _get_cloud_function_endpoint_with_retry(self, name): def create_cloud_function( self, - def_, - *, - random_name, - input_types: Tuple[str], - output_type: str, - package_requirements=None, - timeout_seconds=600, - max_instance_count=None, - is_row_processor=False, - 
vpc_connector=None, - vpc_connector_egress_settings="private-ranges-only", - memory_mib=None, - cpus=None, - ingress_settings="internal-only", - workers=None, - threads=None, - concurrency=None, - ): + name: str, + func_def: udf_def.CloudRunFunctionConfig, + ) -> str: """Create a cloud function from the given user defined function.""" + config = func_def + # Build and deploy folder structure containing cloud function with tempfile.TemporaryDirectory() as directory: entry_point = self.generate_cloud_function_code( - def_, + config.code, directory, - package_requirements=package_requirements, - input_types=input_types, - output_type=output_type, - is_row_processor=is_row_processor, + udf_signature=config.signature, ) archive_path = shutil.make_archive(directory, "zip", directory) @@ -461,9 +447,9 @@ def create_cloud_function( create_function_request.parent = ( self.get_cloud_function_fully_qualified_parent() ) - create_function_request.function_id = random_name + create_function_request.function_id = name function = functions_v2.Function() - function.name = self.get_cloud_function_fully_qualified_name(random_name) + function.name = self.get_cloud_function_fully_qualified_name(name) function.build_config = functions_v2.BuildConfig() function.build_config.runtime = python_version function.build_config.entry_point = entry_point @@ -490,33 +476,34 @@ def create_cloud_function( ) function.service_config = functions_v2.ServiceConfig() - if memory_mib is not None: - function.service_config.available_memory = f"{memory_mib}Mi" - if cpus is not None: - function.service_config.available_cpu = str(cpus) - if timeout_seconds is not None: - if timeout_seconds > 1200: + if config.memory_mib is not None: + function.service_config.available_memory = f"{config.memory_mib}Mi" + if config.cpus is not None: + function.service_config.available_cpu = str(config.cpus) + if config.timeout_seconds is not None: + if config.timeout_seconds > 1200: raise 
bf_formatting.create_exception_with_feedback_link( ValueError, "BigQuery remote function can wait only up to 20 minutes" ", see for more details " "https://cloud.google.com/bigquery/quotas#remote_function_limits.", ) - function.service_config.timeout_seconds = timeout_seconds - if max_instance_count is not None: - function.service_config.max_instance_count = max_instance_count - if vpc_connector is not None: - function.service_config.vpc_connector = vpc_connector - if vpc_connector_egress_settings is None: + function.service_config.timeout_seconds = config.timeout_seconds + if config.max_instance_count is not None: + function.service_config.max_instance_count = config.max_instance_count + if config.vpc_connector is not None: + function.service_config.vpc_connector = config.vpc_connector + vpc_connector_egress_settings = config.vpc_connector_egress_settings + if config.vpc_connector_egress_settings is None: msg = bfe.format_message( "The 'vpc_connector_egress_settings' was not specified. 
Defaulting to 'private-ranges-only'.", ) warnings.warn(msg, category=UserWarning) vpc_connector_egress_settings = "private-ranges-only" - if vpc_connector_egress_settings not in _VPC_EGRESS_SETTINGS_MAP: + if config.vpc_connector_egress_settings not in _VPC_EGRESS_SETTINGS_MAP: raise bf_formatting.create_exception_with_feedback_link( ValueError, - f"'{vpc_connector_egress_settings}' is not one of the supported vpc egress settings values: {list(_VPC_EGRESS_SETTINGS_MAP)}", + f"'{config.vpc_connector_egress_settings}' is not one of the supported vpc egress settings values: {list(_VPC_EGRESS_SETTINGS_MAP)}", ) function.service_config.vpc_connector_egress_settings = cast( functions_v2.ServiceConfig.VpcConnectorEgressSettings, @@ -525,28 +512,30 @@ def create_cloud_function( function.service_config.service_account_email = ( self._cloud_function_service_account ) - if concurrency: - function.service_config.max_instance_request_concurrency = concurrency + if config.concurrency: + function.service_config.max_instance_request_concurrency = ( + config.concurrency + ) # Functions framework use environment variables to pass config to gunicorn # See https://github.com/GoogleCloudPlatform/functions-framework-python/issues/241 # Code: https://github.com/GoogleCloudPlatform/functions-framework-python/blob/v3.10.1/src/functions_framework/_http/gunicorn.py#L37-L43 env_vars = {} - if workers: - env_vars["WORKERS"] = str(workers) - if threads: - env_vars["THREADS"] = str(threads) + if config.workers: + env_vars["WORKERS"] = str(config.workers) + if config.threads: + env_vars["THREADS"] = str(config.threads) if env_vars: function.service_config.environment_variables = env_vars - if ingress_settings not in _INGRESS_SETTINGS_MAP: + if config.ingress_settings not in _INGRESS_SETTINGS_MAP: raise bf_formatting.create_exception_with_feedback_link( ValueError, - f"'{ingress_settings}' not one of the supported ingress settings values: {list(_INGRESS_SETTINGS_MAP)}", + 
f"'{config.ingress_settings}' not one of the supported ingress settings values: {list(_INGRESS_SETTINGS_MAP)}", ) function.service_config.ingress_settings = cast( functions_v2.ServiceConfig.IngressSettings, - _INGRESS_SETTINGS_MAP[ingress_settings], + _INGRESS_SETTINGS_MAP[config.ingress_settings], ) function.kms_key_name = self._cloud_function_kms_key_name create_function_request.function = function @@ -577,67 +566,37 @@ def create_cloud_function( # Fetch the endpoint with retries if it wasn't returned by the operation if not endpoint: try: - endpoint = self._get_cloud_function_endpoint_with_retry(random_name) + endpoint = self._get_cloud_function_endpoint_with_retry(name) except Exception as e: raise bf_formatting.create_exception_with_feedback_link( ValueError, f"Couldn't fetch the http endpoint: {e}" ) - logger.info( - f"Successfully created cloud function {random_name} with uri ({endpoint})" - ) + logger.info(f"Successfully created cloud function {name} with uri ({endpoint})") return endpoint def provision_bq_remote_function( self, def_, - input_types, - output_type, - reuse, - name, - package_requirements, - max_batching_rows, - cloud_function_timeout, - cloud_function_max_instance_count, - is_row_processor, - cloud_function_vpc_connector, - cloud_function_vpc_connector_egress_settings, - cloud_function_memory_mib, - cloud_function_cpus, - cloud_function_ingress_settings, - bq_metadata, + func_signature: udf_def.UdfSignature, + reuse: bool, + name: str | None, + package_requirements: tuple[str, ...], + max_batching_rows: int | None, + cloud_function_timeout: int | None, + cloud_function_max_instance_count: int | None, + cloud_function_vpc_connector: str | None, + cloud_function_vpc_connector_egress_settings: str | None, + cloud_function_memory_mib: int | None, + cloud_function_cpus: float | None, + cloud_function_ingress_settings: str, ): """Provision a BigQuery remote function.""" # Augment user package requirements with any internal package # requirements - 
package_requirements = _utils.get_updated_package_requirements( - package_requirements, is_row_processor - ) - - # Compute a unique hash representing the user code - function_hash = _utils.get_hash(def_, package_requirements) - - # If reuse of any existing function with the same name (indicated by the - # same hash of its source code) is not intended, then attach a unique - # suffix to the intended function name to make it unique. - uniq_suffix = None - if not reuse: - # use 4 digits as a unique suffix which should suffice for - # uniqueness per session - uniq_suffix = "".join( - random.choices(string.ascii_lowercase + string.digits, k=4) - ) - - # Derive the name of the cloud function underlying the intended BQ - # remote function. Use the session id to identify the GCF for unnamed - # functions. The named remote functions are treated as a persistant - # artifacts, so let's keep them independent of session id, which also - # makes their naming more stable for the same udf code - session_id = None if name else self._session.session_id - cloud_function_name = _utils.get_cloud_function_name( - function_hash, session_id, uniq_suffix + full_package_requirements = _utils.get_updated_package_requirements( + package_requirements, func_signature.is_row_processor ) - cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) if cloud_function_memory_mib is None: cloud_function_memory_mib = _DEFAULT_FUNCTION_MEMORY_MIB @@ -654,90 +613,132 @@ def provision_bq_remote_function( # max concurrency==1 for vcpus < 1 hard limit from cloud run concurrency = (workers * threads) if (expected_milli_cpus >= 1000) else 1 + cloud_func_spec = udf_def.CloudRunFunctionConfig( + code=udf_def.CodeDef.from_func(def_, full_package_requirements), + signature=func_signature, + timeout_seconds=cloud_function_timeout, + max_instance_count=cloud_function_max_instance_count, + vpc_connector=cloud_function_vpc_connector, + 
vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings + or "private-ranges-only", + memory_mib=cloud_function_memory_mib, + cpus=cloud_function_cpus, + ingress_settings=cloud_function_ingress_settings, + workers=workers, + threads=threads, + concurrency=concurrency, + ) + + # If reuse of any existing function with the same name (indicated by the + # same hash of its source code and config) is not intended, then attach a unique + # suffix to the intended function name to make it unique. + random_suffix = "".join( + random.choices(string.ascii_lowercase + string.digits, k=4) + ) + # Derive the name of the cloud function underlying the intended BQ + # remote function. Use the session id to identify the GCF for unnamed + # functions. The named remote functions are treated as persistent + # artifacts, so let's keep them independent of session id, which also + # makes their naming more stable for the same udf code + cloud_function_name = get_cloud_function_name( + cloud_func_spec, + session_id=self._session.session_id if (name is None) else None, + uniq_suffix=random_suffix if (not reuse) else None, + ) + + cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) # Create the cloud function if it does not exist if not cf_endpoint: cf_endpoint = self.create_cloud_function( - def_, - random_name=cloud_function_name, - input_types=input_types, - output_type=output_type, - package_requirements=package_requirements, - timeout_seconds=cloud_function_timeout, - max_instance_count=cloud_function_max_instance_count, - is_row_processor=is_row_processor, - vpc_connector=cloud_function_vpc_connector, - vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, - memory_mib=cloud_function_memory_mib, - cpus=cloud_function_cpus, - ingress_settings=cloud_function_ingress_settings, - workers=workers, - threads=threads, - concurrency=concurrency, + cloud_function_name, cloud_func_spec ) else: logger.info(f"Cloud function 
{cloud_function_name} already exists.") - # Derive the name of the remote function - remote_function_name = name - if not remote_function_name: - remote_function_name = _utils.get_bigframes_function_name( - function_hash, self._session.session_id, uniq_suffix - ) - rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) - - # Create the BQ remote function in following circumstances: - # 1. It does not exist - # 2. It exists but the existing remote function has different - # configuration than intended - created_new = False - if not rf_endpoint or ( - rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id - ): - input_args = inspect.getargs(def_.__code__).args - if len(input_args) != len(input_types): - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "Exactly one type should be provided for every input arg.", - ) - self.create_bq_remote_function( - input_args, - input_types, - output_type, - cf_endpoint, - remote_function_name, - max_batching_rows, - bq_metadata, - ) + intended_rf_spec = udf_def.RemoteFunctionConfig( + endpoint=cf_endpoint, + connection_id=self._bq_connection_id, + max_batching_rows=max_batching_rows or 1000, + signature=func_signature, + bq_metadata=func_signature.protocol_metadata, + ) + remote_function_name = name or get_bigframes_function_name( + intended_rf_spec, + self._session.session_id, + random_suffix if (not reuse) else None, + ) - created_new = True + if reuse: + existing_rf_spec = self.get_remote_function_specs(remote_function_name) + # Create the BQ remote function in following circumstances: + # 1. It does not exist + # 2. 
It exists but the existing remote function has different + # configuration than intended + created_new = False + if not existing_rf_spec or (existing_rf_spec != intended_rf_spec): + self.create_bq_remote_function(remote_function_name, intended_rf_spec) + created_new = True + else: + logger.info(f"Remote function {remote_function_name} already exists.") + + return remote_function_name, cloud_function_name, created_new else: - logger.info(f"Remote function {remote_function_name} already exists.") + self.create_bq_remote_function(remote_function_name, intended_rf_spec) + return remote_function_name, cloud_function_name, True - return remote_function_name, cloud_function_name, created_new - - def get_remote_function_specs(self, remote_function_name): + def get_remote_function_specs( + self, remote_function_name: str + ) -> udf_def.RemoteFunctionConfig | None: """Check whether a remote function already exists for the udf.""" - http_endpoint = None - bq_connection = None - routines = self._bq_client.list_routines( - f"{self._gcp_project_id}.{self._bq_dataset}" - ) try: - for routine in routines: - routine = cast(bigquery.Routine, routine) - if routine.reference.routine_id == remote_function_name: - rf_options = routine.remote_function_options - if rf_options: - http_endpoint = rf_options.endpoint - bq_connection = rf_options.connection - if bq_connection: - bq_connection = os.path.basename(bq_connection) - break + routine = self._bq_client.get_routine( + f"{self._gcp_project_id}.{self._bq_dataset}.{remote_function_name}" + ) + if routine.reference.routine_id == remote_function_name: + try: + return udf_def.RemoteFunctionConfig.from_bq_routine(routine) + except udf_def.ReturnTypeMissingError: + # The remote function exists, but it's missing a return type. + # Something is wrong with the function, so we should replace it. + return None except google.api_core.exceptions.NotFound: - # The dataset might not exist, in which case the http_endpoint doesn't, either. 
+ # The dataset might not exist, in which case the remote function doesn't, either. # Note: list_routines doesn't make an API request until we iterate on the response object. pass - return (http_endpoint, bq_connection) + return None + + +def get_cloud_function_name( + function_def: udf_def.CloudRunFunctionConfig, session_id=None, uniq_suffix=None +): + "Get a name for the cloud function for the given user defined function." + parts = [_BIGFRAMES_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_def.stable_hash().hex()) + if uniq_suffix: + parts.append(uniq_suffix) + return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) + + +def get_bigframes_function_name( + function: udf_def.RemoteFunctionConfig, session_id, uniq_suffix=None +): + "Get a name for the bigframes function for the given user defined function." + parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function.stable_hash().hex()] + if uniq_suffix: + parts.append(uniq_suffix) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + +def _validate_routine_name(name: str) -> None: + """Validate that the given name is a valid BigQuery routine name.""" + # Routine IDs can contain only letters (a-z, A-Z), numbers (0-9), or underscores (_) + # must also start with a letter or underscore only + if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name): + raise ValueError( + "Routine ID can contain only letters (a-z, A-Z), numbers (0-9), or underscores (_)" + ) def _infer_milli_cpus_from_memory(memory_mib: int) -> int: diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 7541936ede..85753a71ce 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -24,7 +24,6 @@ Any, cast, Dict, - get_origin, Literal, Mapping, Optional, @@ -51,7 +50,6 @@ if TYPE_CHECKING: from bigframes.session import Session -import pandas from bigframes.functions import _function_client, _utils @@ -241,7 +239,7 @@ def 
remote_function( cloud_function_service_account: str, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, - max_batching_rows: Optional[int] = 1000, + max_batching_rows: Optional[int] = None, cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, @@ -580,13 +578,6 @@ def wrapper(func): warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) py_sig = py_sig.replace(return_annotation=output_type) - # The function will actually be receiving a pandas Series, but allow - # both BigQuery DataFrames and pandas object types for compatibility. - is_row_processor = False - if new_sig := _convert_row_processor_sig(py_sig): - py_sig = new_sig - is_row_processor = True - remote_function_client = _function_client.FunctionClient( dataset_ref.project, bq_location, @@ -605,25 +596,9 @@ def wrapper(func): session=session, # type: ignore ) - # resolve the output type that can be supported in the bigframes, - # ibis, BQ remote functions and cloud functions integration. - bqrf_metadata = None - post_process_routine = None - if get_origin(py_sig.return_annotation) is list: - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the cloud function - # and BQ level, and parse that to the intended output type at - # the bigframes level. 
- bqrf_metadata = _utils.get_bigframes_metadata( - python_output_type=py_sig.return_annotation - ) - post_process_routine = _utils.build_unnest_post_routine( - py_sig.return_annotation - ) - py_sig = py_sig.replace(return_annotation=str) - - udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) + udf_sig = udf_def.UdfSignature.from_py_signature( + py_sig + ).to_remote_function_compatible() ( rf_name, @@ -631,21 +606,18 @@ def wrapper(func): created_new, ) = remote_function_client.provision_bq_remote_function( func, - input_types=udf_sig.sql_input_types, - output_type=udf_sig.sql_output_type, - reuse=reuse, + func_signature=udf_sig, + reuse=reuse or False, name=name, - package_requirements=packages, - max_batching_rows=max_batching_rows, + package_requirements=tuple(packages) if packages else tuple(), + max_batching_rows=max_batching_rows or 1000, cloud_function_timeout=cloud_function_timeout, cloud_function_max_instance_count=cloud_function_max_instances, - is_row_processor=is_row_processor, cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_cpus=cloud_function_cpus, cloud_function_ingress_settings=cloud_function_ingress_settings, - bq_metadata=bqrf_metadata, ) bigframes_cloud_function = ( @@ -676,12 +648,13 @@ def wrapper(func): signature=udf_sig, ) decorator = functools.wraps(func) - if is_row_processor: + if udf_sig.is_row_processor: + msg = bfe.format_message("input_types=Series is in preview.") + warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) return decorator( bq_functions.BigqueryCallableRowRoutine( udf_definition, session, - post_routine=post_process_routine, cloud_function_ref=bigframes_cloud_function, local_func=func, is_managed=False, @@ -692,7 +665,6 @@ def wrapper(func): bq_functions.BigqueryCallableRoutine( udf_definition, session, - post_routine=post_process_routine, 
cloud_function_ref=bigframes_cloud_function, local_func=func, is_managed=False, @@ -892,10 +864,6 @@ def wrapper(func): # The function will actually be receiving a pandas Series, but allow # both BigQuery DataFrames and pandas object types for compatibility. - is_row_processor = False - if new_sig := _convert_row_processor_sig(py_sig): - py_sig = new_sig - is_row_processor = True udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) @@ -911,14 +879,14 @@ def wrapper(func): bq_function_name = managed_function_client.provision_bq_managed_function( func=func, - input_types=udf_sig.sql_input_types, - output_type=udf_sig.sql_output_type, + input_types=tuple(arg.sql_type for arg in udf_sig.inputs), + output_type=udf_sig.output.sql_type, name=name, packages=packages, max_batching_rows=max_batching_rows, container_cpu=container_cpu, container_memory=container_memory, - is_row_processor=is_row_processor, + is_row_processor=udf_sig.is_row_processor, bq_connection_id=bq_connection_id, ) full_rf_name = ( @@ -936,7 +904,9 @@ def wrapper(func): self._update_temp_artifacts(full_rf_name, "") decorator = functools.wraps(func) - if is_row_processor: + if udf_sig.is_row_processor: + msg = bfe.format_message("input_types=Series is in preview.") + warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) return decorator( bq_functions.BigqueryCallableRowRoutine( udf_definition, session, local_func=func, is_managed=True @@ -979,33 +949,3 @@ def deploy_udf( # TODO(tswast): If we update udf to defer deployment, update this method # to deploy immediately. 
return self.udf(**kwargs)(func) - - -def _convert_row_processor_sig( - signature: inspect.Signature, -) -> Optional[inspect.Signature]: - import bigframes.series as bf_series - - if len(signature.parameters) >= 1: - first_param = next(iter(signature.parameters.values())) - param_type = first_param.annotation - # Type hints for Series inputs should use pandas.Series because the - # underlying serialization process converts the input to a string - # representation of a pandas Series (not bigframes Series). Using - # bigframes Series will lead to TypeError when creating the function - # remotely. See more from b/445182819. - if param_type == bf_series.Series: - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "Argument type hint must be Pandas Series, not BigFrames Series.", - ) - if param_type == pandas.Series: - msg = bfe.format_message("input_types=Series is in preview.") - warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) - return signature.replace( - parameters=[ - p.replace(annotation=str) if i == 0 else p - for i, p in enumerate(signature.parameters.values()) - ] - ) - return None diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py index b6dedeac50..c197ed14fc 100644 --- a/bigframes/functions/_utils.py +++ b/bigframes/functions/_utils.py @@ -75,12 +75,12 @@ def _package_existed(package_requirements: list[str], package: str) -> bool: def get_updated_package_requirements( - package_requirements=None, - is_row_processor=False, - capture_references=True, - ignore_package_version=False, -): - requirements = [] + package_requirements: Sequence[str] = (), + is_row_processor: bool = False, + capture_references: bool = True, + ignore_package_version: bool = False, +) -> Sequence[str]: + requirements: list[str] = [] if capture_references: requirements.append(f"cloudpickle=={cloudpickle.__version__}") @@ -110,13 +110,12 @@ def get_updated_package_requirements( if not requirements: return package_requirements - if 
not package_requirements: - package_requirements = [] + result = list(package_requirements) for package in requirements: - if not _package_existed(package_requirements, package): - package_requirements.append(package) + if not _package_existed(result, package): + result.append(package) - return sorted(package_requirements) + return sorted(result) def clean_up_by_session_id( @@ -183,6 +182,23 @@ def clean_up_by_session_id( pass +def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: + return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" + + +def get_managed_function_name( + function_hash: str, + session_id: str | None = None, +): + """Get a name for the bigframes managed function for the given user defined function.""" + parts = [_BIGFRAMES_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_hash) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + +# Deprecated: Use CodeDef.stable_hash() instead. def get_hash(def_, package_requirements=None): "Get hash (32 digits alphanumeric) of a function." # There is a known cell-id sensitivity of the cloudpickle serialization in @@ -208,46 +224,28 @@ def get_hash(def_, package_requirements=None): return hashlib.md5(def_repr).hexdigest() -def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: - return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" - - -def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): - "Get a name for the cloud function for the given user defined function." - parts = [_BIGFRAMES_FUNCTION_PREFIX] - if session_id: - parts.append(session_id) - parts.append(function_hash) - if uniq_suffix: - parts.append(uniq_suffix) - return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) - - -def get_bigframes_function_name(function_hash, session_id, uniq_suffix=None): - "Get a name for the bigframes function for the given user defined function." 
- parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function_hash] - if uniq_suffix: - parts.append(uniq_suffix) - return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) - - -def get_python_output_type_from_bigframes_metadata( +def get_python_output_type_str_from_bigframes_metadata( metadata_text: str, -) -> Optional[type]: +) -> Optional[str]: try: metadata_dict = json.loads(metadata_text) except (TypeError, json.decoder.JSONDecodeError): return None - try: - output_type = metadata_dict["value"]["python_array_output_type"] + return metadata_dict["value"]["python_array_output_type"] except KeyError: return None + +def get_python_output_type_from_bigframes_metadata( + metadata_text: str, +) -> Optional[type]: + output_type_str = get_python_output_type_str_from_bigframes_metadata(metadata_text) + for ( python_output_array_type ) in function_typing.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: - if python_output_array_type.__name__ == output_type: + if python_output_array_type.__name__ == output_type_str: return list[python_output_array_type] # type: ignore return None @@ -293,20 +291,6 @@ def get_python_version(is_compat: bool = False) -> str: return f"python{major}{minor}" if is_compat else f"python-{major}.{minor}" -def build_unnest_post_routine(py_list_type: type[list]): - sdk_type = function_typing.sdk_array_output_type_from_python_type(py_list_type) - assert sdk_type.array_element_type is not None - inner_sdk_type = sdk_type.array_element_type - result_dtype = function_typing.sdk_type_to_bf_type(inner_sdk_type) - - def post_process(input): - import bigframes.bigquery as bbq - - return bbq.json_extract_string_array(input, value_dtype=result_dtype) - - return post_process - - def has_conflict_input_type( signature: inspect.Signature, input_types: Sequence[Any], diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 4e06cb1663..18a000c722 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -15,7 +15,7 @@ from 
__future__ import annotations import logging -from typing import Callable, cast, get_origin, Optional, TYPE_CHECKING +from typing import Callable, Optional, TYPE_CHECKING if TYPE_CHECKING: from bigframes.session import Session @@ -26,7 +26,7 @@ import bigframes.formatting_helpers as bf_formatting from bigframes.functions import _function_session as bff_session -from bigframes.functions import _utils, function_typing, udf_def +from bigframes.functions import function_typing, udf_def logger = logging.getLogger(__name__) @@ -82,39 +82,30 @@ def _try_import_routine( routine: bigquery.Routine, session: bigframes.Session ) -> BigqueryCallableRoutine: udf_def = _routine_as_udf_def(routine) - override_type = _get_output_type_override(routine) is_remote = ( hasattr(routine, "remote_function_options") and routine.remote_function_options ) - if override_type is not None: - return BigqueryCallableRoutine( - udf_def, - session, - post_routine=_utils.build_unnest_post_routine(override_type), - ) return BigqueryCallableRoutine(udf_def, session, is_managed=not is_remote) def _try_import_row_routine( routine: bigquery.Routine, session: bigframes.Session ) -> BigqueryCallableRowRoutine: - udf_def = _routine_as_udf_def(routine) - override_type = _get_output_type_override(routine) + udf_def = _routine_as_udf_def(routine, is_row_processor=True) + is_remote = ( hasattr(routine, "remote_function_options") and routine.remote_function_options ) - if override_type is not None: - return BigqueryCallableRowRoutine( - udf_def, - session, - post_routine=_utils.build_unnest_post_routine(override_type), - ) return BigqueryCallableRowRoutine(udf_def, session, is_managed=not is_remote) -def _routine_as_udf_def(routine: bigquery.Routine) -> udf_def.BigqueryUdf: +def _routine_as_udf_def( + routine: bigquery.Routine, is_row_processor: bool = False +) -> udf_def.BigqueryUdf: try: - return udf_def.BigqueryUdf.from_routine(routine) + return udf_def.BigqueryUdf.from_routine( + routine, 
is_row_processor=is_row_processor + ) except udf_def.ReturnTypeMissingError: raise bf_formatting.create_exception_with_feedback_link( ValueError, "Function return type must be specified." @@ -126,30 +117,6 @@ def _routine_as_udf_def(routine: bigquery.Routine) -> udf_def.BigqueryUdf: ) -def _get_output_type_override(routine: bigquery.Routine) -> Optional[type[list]]: - if routine.description is not None and isinstance(routine.description, str): - if python_output_type := _utils.get_python_output_type_from_bigframes_metadata( - routine.description - ): - bq_return_type = cast(bigquery.StandardSqlDataType, routine.return_type) - - if bq_return_type is None or bq_return_type.type_kind != "STRING": - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "An explicit output_type should be provided only for a BigQuery function with STRING output.", - ) - if get_origin(python_output_type) is list: - return python_output_type - else: - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "Currently only list of " - "a type is supported as python output type.", - ) - - return None - - # TODO(b/399894805): Support managed function. def read_gbq_function( function_name: str, @@ -178,6 +145,7 @@ def read_gbq_function( ValueError, f"Unknown function '{routine_ref}'." ) + # TODO(493293086): Deprecate is_row_processor. 
if is_row_processor: return _try_import_row_routine(routine, session) else: @@ -198,14 +166,10 @@ def __init__( *, local_func: Optional[Callable] = None, cloud_function_ref: Optional[str] = None, - post_routine: Optional[ - Callable[[bigframes.series.Series], bigframes.series.Series] - ] = None, is_managed: bool = False, ): self._udf_def = udf_def self._session = session - self._post_routine = post_routine self._local_fun = local_func self._cloud_function = cloud_function_ref self._is_managed = is_managed @@ -250,22 +214,15 @@ def bigframes_cloud_function(self) -> Optional[str]: @property def input_dtypes(self): - return self.udf_def.signature.bf_input_types + return tuple(arg.bf_type for arg in self.udf_def.signature.inputs) @property def output_dtype(self): - return self.udf_def.signature.bf_output_type + return self.udf_def.signature.output.bf_type @property def bigframes_bigquery_function_output_dtype(self): - return self.output_dtype - - def _post_process_series( - self, series: bigframes.series.Series - ) -> bigframes.series.Series: - if self._post_routine is not None: - return self._post_routine(series) - return series + return self.udf_def.signature.output.emulating_type.bf_type class BigqueryCallableRowRoutine: @@ -282,14 +239,11 @@ def __init__( *, local_func: Optional[Callable] = None, cloud_function_ref: Optional[str] = None, - post_routine: Optional[ - Callable[[bigframes.series.Series], bigframes.series.Series] - ] = None, is_managed: bool = False, ): + assert udf_def.signature.is_row_processor self._udf_def = udf_def self._session = session - self._post_routine = post_routine self._local_fun = local_func self._cloud_function = cloud_function_ref self._is_managed = is_managed @@ -334,19 +288,12 @@ def bigframes_cloud_function(self) -> Optional[str]: @property def input_dtypes(self): - return self.udf_def.signature.bf_input_types + return tuple(arg.bf_type for arg in self.udf_def.signature.inputs) @property def output_dtype(self): - return 
self.udf_def.signature.bf_output_type + return self.udf_def.signature.output.bf_type @property def bigframes_bigquery_function_output_dtype(self): - return self.output_dtype - - def _post_process_series( - self, series: bigframes.series.Series - ) -> bigframes.series.Series: - if self._post_routine is not None: - return self._post_routine(series) - return series + return self.udf_def.signature.output.emulating_type.bf_type diff --git a/bigframes/functions/function_template.py b/bigframes/functions/function_template.py index e48ffda8ed..31b5b20520 100644 --- a/bigframes/functions/function_template.py +++ b/bigframes/functions/function_template.py @@ -19,16 +19,12 @@ import os import re import textwrap -from typing import Tuple import cloudpickle -logger = logging.getLogger(__name__) - +from bigframes.functions import udf_def -# Protocol version 4 is available in python version 3.4 and above -# https://docs.python.org/3/library/pickle.html#data-stream-format -_pickle_protocol_version = 4 +logger = logging.getLogger(__name__) # Placeholder variables for testing. 
@@ -228,38 +224,38 @@ def udf_http_row_processor(request): return jsonify({"errorMessage": traceback.format_exc()}), 400 -def generate_udf_code(def_, directory): +def generate_udf_code(code_def: udf_def.CodeDef, directory: str): """Generate serialized code using cloudpickle given a udf.""" udf_code_file_name = "udf.py" udf_pickle_file_name = "udf.cloudpickle" # original code, only for debugging purpose - udf_code = textwrap.dedent(inspect.getsource(def_)) udf_code_file_path = os.path.join(directory, udf_code_file_name) with open(udf_code_file_path, "w") as f: - f.write(udf_code) + f.write(code_def.function_source) # serialized udf udf_pickle_file_path = os.path.join(directory, udf_pickle_file_name) # TODO(b/345433300): try io.BytesIO to avoid writing to the file system with open(udf_pickle_file_path, "wb") as f: - cloudpickle.dump(def_, f, protocol=_pickle_protocol_version) + f.write(code_def.pickled_code) return udf_code_file_name, udf_pickle_file_name def generate_cloud_function_main_code( - def_, - directory, + code_def: udf_def.CodeDef, + directory: str, *, - input_types: Tuple[str], - output_type: str, - is_row_processor=False, + udf_signature: udf_def.UdfSignature, ): """Get main.py code for the cloud function for the given user defined function.""" # Pickle the udf with all its dependencies - udf_code_file, udf_pickle_file = generate_udf_code(def_, directory) + udf_code_file, udf_pickle_file = generate_udf_code(code_def, directory) + + input_types = tuple(arg.sql_type for arg in udf_signature.inputs) + output_type = udf_signature.output.sql_type code_blocks = [ f"""\ @@ -278,7 +274,7 @@ def generate_cloud_function_main_code( # For converting scalar outputs to the correct type. 
code_blocks.append(inspect.getsource(convert_to_bq_json)) - if is_row_processor: + if udf_signature.is_row_processor: code_blocks.append(inspect.getsource(get_pd_series)) handler_func_name = "udf_http_row_processor" code_blocks.append(inspect.getsource(udf_http_row_processor)) @@ -308,8 +304,6 @@ def generate_managed_function_code( # This code path ensures that if the udf body contains any # references to variables and/or imports outside the body, they are # captured as well. - import cloudpickle - pickled = cloudpickle.dumps(def_) func_code = textwrap.dedent( f""" diff --git a/bigframes/functions/function_typing.py b/bigframes/functions/function_typing.py index 30804f317c..43ccfe9b25 100644 --- a/bigframes/functions/function_typing.py +++ b/bigframes/functions/function_typing.py @@ -81,7 +81,7 @@ def __init__(self, type_, supported_types): def sdk_type_from_python_type( - t: type, allow_lists: bool = False + t: type, allow_lists: bool = True ) -> bigquery.StandardSqlDataType: if (get_origin(t) is list) and allow_lists: return sdk_array_output_type_from_python_type(t) diff --git a/bigframes/functions/udf_def.py b/bigframes/functions/udf_def.py index 078e45f32d..f02f289ef6 100644 --- a/bigframes/functions/udf_def.py +++ b/bigframes/functions/udf_def.py @@ -14,160 +14,499 @@ from __future__ import annotations import dataclasses +import functools import inspect -from typing import cast, Optional +import io +import os +import textwrap +from typing import Any, cast, get_args, get_origin, Sequence, Type import warnings +import cloudpickle from google.cloud import bigquery +import google_crc32c +import pandas as pd import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting from bigframes.functions import function_typing +# Protocol version 4 is available in python version 3.4 and above +# https://docs.python.org/3/library/pickle.html#data-stream-format +_pickle_protocol_version = 4 + class 
ReturnTypeMissingError(ValueError): pass @dataclasses.dataclass(frozen=True) -class UdfField: +class UdfArg: name: str = dataclasses.field() - dtype: bigquery.StandardSqlDataType = dataclasses.field(hash=False, compare=False) + dtype: DirectScalarType | RowSeriesInputFieldV1 + + def __post_init__(self): + assert isinstance(self.name, str) + assert isinstance(self.dtype, (DirectScalarType, RowSeriesInputFieldV1)) + + @classmethod + def from_py_param(cls, param: inspect.Parameter) -> UdfArg: + if param.annotation == pd.Series: + return cls(param.name, RowSeriesInputFieldV1()) + return cls(param.name, DirectScalarType(param.annotation)) @classmethod - def from_sdk(cls, arg: bigquery.RoutineArgument) -> UdfField: + def from_sdk(cls, arg: bigquery.RoutineArgument) -> UdfArg: assert arg.name is not None - assert arg.data_type is not None - return cls(arg.name, arg.data_type) + + if arg.data_type is None: + msg = bfe.format_message( + "The function has one or more missing input data types. BigQuery DataFrames " + f"will assume default data type {function_typing.DEFAULT_RF_TYPE} for them." + ) + warnings.warn(msg, category=bfe.UnknownDataTypeWarning) + sdk_type = function_typing.DEFAULT_RF_TYPE + else: + sdk_type = arg.data_type + return cls(arg.name, DirectScalarType.from_sdk_type(sdk_type)) + + @property + def py_type(self) -> type: + return self.dtype.py_type + + @property + def bf_type(self) -> bigframes.dtypes.Dtype: + return self.dtype.bf_type + + @property + def sql_type(self) -> str: + return self.dtype.sql_type + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.name.encode()) + hash_val.update(self.dtype.stable_hash()) + return hash_val.digest() @dataclasses.dataclass(frozen=True) -class UdfSignature: - input_types: tuple[UdfField, ...] 
= dataclasses.field()
-    output_bq_type: bigquery.StandardSqlDataType = dataclasses.field(
-        hash=False, compare=False
-    )
+class DirectScalarType:
+    """
+    Represents a scalar value that is passed directly to the remote function.
+
+    For these values, BigQuery handles the serialization and deserialization without any additional processing.
+    """
+
+    _py_type: type
 
     @property
-    def bf_input_types(self) -> tuple[bigframes.dtypes.Dtype, ...]:
-        return tuple(
-            function_typing.sdk_type_to_bf_type(arg.dtype) for arg in self.input_types
+    def py_type(self) -> type:
+        return self._py_type
+
+    @property
+    def bf_type(self) -> bigframes.dtypes.Dtype:
+        return function_typing.sdk_type_to_bf_type(
+            function_typing.sdk_type_from_python_type(self._py_type)
         )
 
     @property
-    def bf_output_type(self) -> bigframes.dtypes.Dtype:
-        return function_typing.sdk_type_to_bf_type(self.output_bq_type)
+    def sql_type(self) -> str:
+        sdk_type = function_typing.sdk_type_from_python_type(self._py_type)
+        return function_typing.sdk_type_to_sql_string(sdk_type)
+
+    def stable_hash(self) -> bytes:
+        hash_val = google_crc32c.Checksum()
+        hash_val.update(self._py_type.__name__.encode())
+        return hash_val.digest()
+
+    @classmethod
+    def from_sdk_type(cls, sdk_type: bigquery.StandardSqlDataType) -> DirectScalarType:
+        return cls(function_typing.sdk_type_to_py_type(sdk_type))
 
     @property
-    def py_input_types(self) -> tuple[type, ...]:
-        return tuple(
-            function_typing.sdk_type_to_py_type(arg.dtype) for arg in self.input_types
-        )
+    def emulating_type(self) -> DirectScalarType:
+        return self
+
+
+@dataclasses.dataclass(frozen=True)
+class VirtualListTypeV1:
+    """
+    Represents a list of scalar values that is emulated as a JSON array string in the remote function.
+
+    Only works as an output parameter right now, where array -> string in function runtime, and then string -> array in SQL post-processing (defined in out_expr()).
+ """ + + _PROTOCOL_ID = "virtual_list_v1" + + inner_dtype: DirectScalarType + + @property + def py_type(self) -> Type[list[Any]]: + return list[self.inner_dtype.py_type] # type: ignore + + @property + def bf_type(self) -> bigframes.dtypes.Dtype: + return bigframes.dtypes.list_type(self.inner_dtype.bf_type) + + @property + def emulating_type(self) -> DirectScalarType: + # Regardless of list inner type, string is used to emulate the list in the remote function. + return DirectScalarType(str) + + def out_expr( + self, expr: bigframes.core.expression.Expression + ) -> bigframes.core.expression.Expression: + # essentially we are undoing json.dumps in sql + import bigframes.operations as ops + + as_str_list = ops.JSONValueArray(json_path="$").as_expr(expr) + if self.inner_dtype.py_type is str: + return as_str_list + elif self.inner_dtype.py_type is bool: + # hack so we don't need to make ArrayMap support general expressions yet + # with b/495513753 we can map the equality operator instead + return ops.ArrayMapOp(ops.IsInOp(values=("true",))).as_expr(as_str_list) + else: + return ops.ArrayMapOp(ops.AsTypeOp(self.inner_dtype.bf_type)).as_expr( + as_str_list + ) + + @property + def sql_type(self) -> str: + return f"ARRAY<{self.inner_dtype.sql_type}>" + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self._PROTOCOL_ID.encode()) + hash_val.update(self.inner_dtype.stable_hash()) + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class RowSeriesInputFieldV1: + """ + Used to handle functions that logically take a series as an input, but handled via a string protocol in the remote function. + + For these, the serialization is dependent on index metadata, which must be provided by the caller. 
+ """ + + _PROTOCOL_ID = "row_series_input_v1" + + @property + def py_type(self) -> type: + return pd.Series @property - def py_output_type(self) -> type: - return function_typing.sdk_type_to_py_type(self.output_bq_type) + def bf_type(self) -> bigframes.dtypes.Dtype: + # Code paths shouldn't hit this. + raise ValueError("Series does not have a corresponding BigFrames type.") @property - def sql_input_types(self) -> tuple[str, ...]: - return tuple( - function_typing.sdk_type_to_sql_string(arg.dtype) - for arg in self.input_types + def sql_type(self) -> str: + return "STRING" + + @property + def emulating_type(self) -> DirectScalarType: + # Regardless of list inner type, string is used to emulate the list in the remote function. + return DirectScalarType(str) + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self._PROTOCOL_ID.encode()) + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class UdfSignature: + """ + Represents the mapping of input types from bigframes to sql to python and back. + """ + + inputs: tuple[UdfArg, ...] = dataclasses.field() + output: DirectScalarType | VirtualListTypeV1 + + def __post_init__(self): + # Validate inputs and outputs are of the correct types. 
+ assert all(isinstance(arg, UdfArg) for arg in self.inputs) + assert isinstance(self.output, (DirectScalarType, VirtualListTypeV1)) + + def to_sql_input_signature(self) -> str: + return ",".join( + f"{field.name} {field.sql_type}" + for field in self.with_devirtualize().inputs ) @property - def sql_output_type(self) -> str: - return function_typing.sdk_type_to_sql_string(self.output_bq_type) + def protocol_metadata(self) -> str | None: + import bigframes.functions._utils + + if isinstance(self.output, VirtualListTypeV1): + return bigframes.functions._utils.get_bigframes_metadata( + python_output_type=self.output.py_type + ) + return None + @property + def is_virtual(self) -> bool: + dtypes = (self.output,) + tuple(arg.dtype for arg in self.inputs) + return not all(isinstance(dtype, DirectScalarType) for dtype in dtypes) + + @property + def is_row_processor(self) -> bool: + return any(isinstance(arg.dtype, RowSeriesInputFieldV1) for arg in self.inputs) + + def with_devirtualize(self) -> UdfSignature: + return UdfSignature( + inputs=tuple( + UdfArg(arg.name, arg.dtype.emulating_type) for arg in self.inputs + ), + output=self.output.emulating_type, + ) + + # TODO(493293086): Deprecate is_row_processor. @classmethod - def from_routine(cls, routine: bigquery.Routine) -> UdfSignature: + def from_routine( + cls, routine: bigquery.Routine, is_row_processor: bool = False + ) -> UdfSignature: + import bigframes.functions._utils + + ## Handle return type if routine.return_type is None: - raise ReturnTypeMissingError + raise ReturnTypeMissingError( + f"Routine {routine} has no return type. 
Routine properties: {routine._properties}" + ) + bq_return_type = cast(bigquery.StandardSqlDataType, routine.return_type) - if ( - bq_return_type.type_kind is None - or bq_return_type.type_kind - not in function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + return_type: DirectScalarType | VirtualListTypeV1 = ( + DirectScalarType.from_sdk_type(bq_return_type) + ) + if python_output_type := bigframes.functions._utils.get_python_output_type_from_bigframes_metadata( + routine.description ): - raise ValueError( - f"Remote function must have one of the following supported output types: {function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS}" - ) - - udf_fields = [] - for argument in routine.arguments: - if argument.data_type is None: - msg = bfe.format_message( - "The function has one or more missing input data types. BigQuery DataFrames " - f"will assume default data type {function_typing.DEFAULT_RF_TYPE} for them." - ) - warnings.warn(msg, category=bfe.UnknownDataTypeWarning) - assert argument.name is not None - udf_fields.append( - UdfField(argument.name, function_typing.DEFAULT_RF_TYPE) + if bq_return_type.type_kind != "STRING": + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "An explicit output_type should be provided only for a BigQuery function with STRING output.", ) + + if get_origin(python_output_type) is list: + inner_type = get_args(python_output_type)[0] + return_type = VirtualListTypeV1(DirectScalarType(inner_type)) else: - udf_fields.append(UdfField.from_sdk(argument)) + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "Currently only list of " + "a type is supported as python output type.", + ) + + ## Handle input types + udf_fields = [] + + for i, argument in enumerate(routine.arguments): + if is_row_processor and i == 0: + if argument.data_type.type_kind == "STRING": + udf_fields.append(UdfArg(argument.name, RowSeriesInputFieldV1())) + else: + raise ValueError( + "Row processor functions must have STRING input 
type as first argument." + ) + udf_fields.append(UdfArg.from_sdk(argument)) return cls( - input_types=tuple(udf_fields), - output_bq_type=bq_return_type, + inputs=tuple(udf_fields), + output=return_type, ) @classmethod def from_py_signature(cls, signature: inspect.Signature): - input_types: list[UdfField] = [] + import bigframes.series + + input_types: list[UdfArg] = [] for parameter in signature.parameters.values(): if parameter.annotation is inspect.Signature.empty: raise bf_formatting.create_exception_with_feedback_link( ValueError, "'input_types' was not set and parameter " f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function.", + "Types are required to use udfs.", + ) + if parameter.annotation is bigframes.series.Series: + raise TypeError( + "Argument type hint must be Pandas Series, not BigFrames Series." ) - bq_type = function_typing.sdk_type_from_python_type(parameter.annotation) - input_types.append(UdfField(parameter.name, bq_type)) + + input_types.append(UdfArg.from_py_param(parameter)) if signature.return_annotation is inspect.Signature.empty: raise bf_formatting.create_exception_with_feedback_link( ValueError, "'output_type' was not set and function is missing a " "return type annotation. 
Types are required to use " - "@remote_function.", + "udfs.", ) - output_bq_type = function_typing.sdk_type_from_python_type( - signature.return_annotation, - allow_lists=True, - ) - return cls(tuple(input_types), output_bq_type) + + output_type = DirectScalarType(signature.return_annotation) + return cls(tuple(input_types), output_type) + + def to_remote_function_compatible(self) -> UdfSignature: + # need to virtualize list outputs + if isinstance(self.output, DirectScalarType): + if get_origin(self.output.py_type) is list: + inner_py_type = get_args(self.output.py_type)[0] + return UdfSignature( + inputs=self.inputs, + output=VirtualListTypeV1(DirectScalarType(inner_py_type)), + ) + return self + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + for input_type in self.inputs: + hash_val.update(input_type.stable_hash()) + hash_val.update(self.output.stable_hash()) + return hash_val.digest() @dataclasses.dataclass(frozen=True) class BigqueryUdf: + """ + Represents the information needed to call a BigQuery remote function - not a full spec. 
+ """ + routine_ref: bigquery.RoutineReference = dataclasses.field() signature: UdfSignature - # Used to provide alternative interpretations of output bq type, eg interpret int as timestamp - output_type_override: Optional[bigframes.dtypes.Dtype] = dataclasses.field( - default=None - ) - @property - def bigframes_output_type(self) -> bigframes.dtypes.Dtype: - return self.output_type_override or function_typing.sdk_type_to_bf_type( - self.signature.output_bq_type + def with_devirtualize(self) -> BigqueryUdf: + if not self.signature.is_virtual: + return self + return BigqueryUdf( + routine_ref=self.routine_ref, + signature=self.signature.with_devirtualize(), ) @classmethod - def from_routine(cls, routine: bigquery.Routine) -> BigqueryUdf: - signature = UdfSignature.from_routine(routine) - - if ( - signature.output_bq_type.type_kind is None - or signature.output_bq_type.type_kind - not in function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS - ): - raise ValueError( - f"Remote function must have one of the following supported output types: {function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS}" - ) + def from_routine( + cls, routine: bigquery.Routine, is_row_processor: bool = False + ) -> BigqueryUdf: + signature = UdfSignature.from_routine( + routine, is_row_processor=is_row_processor + ) return cls(routine.reference, signature=signature) + + +@dataclasses.dataclass(frozen=True) +class CodeDef: + # Produced by cloudpickle, not compatible across python versions + pickled_code: bytes + # This is just the function itself, and does not include referenced objects/functions/modules + function_source: str + package_requirements: tuple[str, ...] 
+ + @classmethod + def from_func(cls, func, package_requirements: Sequence[str] | None = None): + bytes_io = io.BytesIO() + cloudpickle.dump(func, bytes_io, protocol=_pickle_protocol_version) + # this is hacky, but works for some nested functions + source = textwrap.dedent(inspect.getsource(func)) + return cls( + pickled_code=bytes_io.getvalue(), + function_source=source, + package_requirements=tuple(package_requirements or []), + ) + + @functools.cache + def stable_hash(self) -> bytes: + # There is a known cell-id sensitivity of the cloudpickle serialization in + # notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of + # this, if a cell contains a udf decorated with @remote_function, a unique + # cloudpickle code is generated every time the cell is run, creating new + # cloud artifacts every time. This is slow and wasteful. + # A workaround of the same can be achieved by replacing the filename in the + # code object to a static value + # https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661. + # + # To respect the user code/environment let's make this modification on a + # copy of the udf, not on the original udf itself. 
+ def_copy = cloudpickle.loads(self.pickled_code) + def_copy.__code__ = def_copy.__code__.replace( + co_filename="bigframes_place_holder_filename" + ) + + normalized_pickled_code = cloudpickle.dumps( + def_copy, protocol=_pickle_protocol_version + ) + + hash_val = google_crc32c.Checksum() + hash_val.update(normalized_pickled_code) + + if self.package_requirements: + for p in sorted(self.package_requirements): + hash_val.update(p.encode()) + + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class CloudRunFunctionConfig: + code: CodeDef + signature: UdfSignature + timeout_seconds: int | None + max_instance_count: int | None + vpc_connector: str | None + vpc_connector_egress_settings: str + memory_mib: int | None + cpus: float | None + ingress_settings: str + workers: int | None + threads: int | None + concurrency: int | None + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.code.stable_hash()) + hash_val.update(self.signature.stable_hash()) + hash_val.update(str(self.timeout_seconds).encode()) + hash_val.update(str(self.max_instance_count).encode()) + hash_val.update(str(self.vpc_connector).encode()) + hash_val.update(str(self.vpc_connector_egress_settings).encode()) + hash_val.update(str(self.memory_mib).encode()) + hash_val.update(str(self.cpus).encode()) + hash_val.update(str(self.ingress_settings).encode()) + hash_val.update(str(self.workers).encode()) + hash_val.update(str(self.threads).encode()) + hash_val.update(str(self.concurrency).encode()) + return hash_val.digest() + + +@dataclasses.dataclass(frozen=True) +class RemoteFunctionConfig: + """ + Represents the information needed to create a BigQuery remote function. 
+ """ + + endpoint: str + signature: UdfSignature + connection_id: str + max_batching_rows: int + bq_metadata: str | None = None + + @classmethod + def from_bq_routine(cls, routine: bigquery.Routine) -> RemoteFunctionConfig: + return cls( + endpoint=routine.remote_function_options.endpoint, + connection_id=os.path.basename(routine.remote_function_options.connection), + signature=UdfSignature.from_routine(routine), + max_batching_rows=routine.remote_function_options.max_batching_rows, + bq_metadata=routine.description, + ) + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.endpoint.encode()) + hash_val.update(self.signature.stable_hash()) + hash_val.update(self.connection_id.encode()) + hash_val.update(str(self.max_batching_rows).encode()) + hash_val.update(str(self.bq_metadata).encode()) + return hash_val.digest() diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a1c7754ab5..9f585843b8 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -25,6 +25,7 @@ ) from bigframes.operations.array_ops import ( ArrayIndexOp, + ArrayMapOp, ArrayReduceOp, ArraySliceOp, ArrayToStringOp, @@ -440,4 +441,5 @@ "NUMPY_TO_OP", "ToArrayOp", "ArrayReduceOp", + "ArrayMapOp", ] diff --git a/bigframes/operations/array_ops.py b/bigframes/operations/array_ops.py index 61ada59cc7..c5694e50ba 100644 --- a/bigframes/operations/array_ops.py +++ b/bigframes/operations/array_ops.py @@ -88,3 +88,17 @@ def output_type(self, *input_types): assert dtypes.is_array_like(input_type) inner_type = dtypes.get_array_inner_type(input_type) return self.aggregation.output_type(inner_type) + + +@dataclasses.dataclass(frozen=True) +class ArrayMapOp(base_ops.UnaryOp): + name: typing.ClassVar[str] = "array_map" + # TODO(b/495513753): Generalize to chained expressions + map_op: base_ops.UnaryOp + + def output_type(self, *input_types): + input_type = input_types[0] + assert 
dtypes.is_array_like(input_type) + inner_type = dtypes.get_array_inner_type(input_type) + out_inner_type = self.map_op.output_type(inner_type) + return dtypes.list_type(out_inner_type) diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index e610ce61d6..9c51210df0 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -31,7 +31,7 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - return self.function_def.bigframes_output_type + return self.function_def.signature.output.bf_type @dataclasses.dataclass(frozen=True) @@ -44,7 +44,7 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - return self.function_def.bigframes_output_type + return self.function_def.signature.output.bf_type @dataclasses.dataclass(frozen=True) @@ -57,4 +57,4 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - return self.function_def.bigframes_output_type + return self.function_def.signature.output.bf_type diff --git a/bigframes/series.py b/bigframes/series.py index 23799a0a43..7eb30beb82 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2042,7 +2042,6 @@ def apply( result_series = self._apply_unary_op( ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True) ) - result_series = func._post_process_series(result_series) return result_series @@ -2095,7 +2094,6 @@ def combine( result_series = self._apply_binary_op( other, ops.BinaryRemoteFunctionOp(function_def=func.udf_def) ) - result_series = func._post_process_series(result_series) return result_series bf_op = python_ops.python_callable_to_op(func) diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index 26a944d760..5f4a8d2627 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -513,7 +513,7 @@ def get_function_name(func, package_requirements=None, is_row_processor=False): # Augment 
user package requirements with any internal package # requirements. package_requirements = bff_utils.get_updated_package_requirements( - package_requirements, is_row_processor + package_requirements or [], is_row_processor ) # Compute a unique hash representing the user code. diff --git a/setup.py b/setup.py index 2179fe3e96..e22b52442d 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ "google-cloud-bigquery-connection >=1.12.0", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", + "google-crc32c >=1.0.0,<2.0.0", "grpc-google-iam-v1 >= 0.14.2", "numpy >=1.24.0", "pandas >=1.5.3", diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 1ab4987302..9848d36096 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -20,6 +20,7 @@ import shutil import tempfile import textwrap +import uuid import warnings import google.api_core.exceptions @@ -32,7 +33,6 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.exceptions -import bigframes.functions._utils as bff_utils import bigframes.pandas as bpd import bigframes.series from bigframes.testing.utils import ( @@ -526,24 +526,6 @@ def add_one(x): # Make a unique udf add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one) - # Expected cloud function name for the unique udf - package_requirements = bff_utils.get_updated_package_requirements() - add_one_uniq_hash = bff_utils.get_hash(add_one_uniq, package_requirements) - add_one_uniq_cf_name = bff_utils.get_cloud_function_name( - add_one_uniq_hash, session.session_id - ) - - # There should be no cloud function yet for the unique udf - cloud_functions = list( - get_cloud_functions( - session.cloudfunctionsclient, - session.bqclient.project, - session.bqclient.location, - name=add_one_uniq_cf_name, - ) - ) - assert len(cloud_functions) == 0 - # The first time both the cloud function and the bq 
remote function don't # exist and would be created remote_add_one = session.remote_function( @@ -555,6 +537,9 @@ def add_one(x): cloud_function_service_account="default", )(add_one_uniq) + assert remote_add_one.bigframes_cloud_function is not None + add_one_uniq_cf_name = remote_add_one.bigframes_cloud_function.split("/")[-1] + # There should have been excactly one cloud function created at this point cloud_functions = list( get_cloud_functions( @@ -1562,7 +1547,9 @@ def square(x): bq_routine = session.bqclient.get_routine( square_remote.bigframes_bigquery_function ) - assert bq_routine.remote_function_options.max_batching_rows == max_batching_rows + assert bq_routine.remote_function_options.max_batching_rows == ( + max_batching_rows or 1000 + ) scalars_df, scalars_pandas_df = scalars_dfs @@ -1682,6 +1669,51 @@ def square(x): ) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_reflects_config_change_with_reuse(session): + square_remote = None + square_remote_2 = None + try: + + def square(x): + return x * x + + # random alphanumeric name starting with a letter + deploy_name = "a" + str(uuid.uuid4().hex) + square_remote = session.remote_function( + input_types=[int], + name=deploy_name, + output_type=int, + reuse=True, + cloud_function_service_account="default", + cloud_function_cpus=1, + )(square) + square_remote_2 = session.remote_function( + input_types=[int], + name=deploy_name, + output_type=int, + reuse=True, + cloud_function_service_account="default", + cloud_function_cpus=2, + )(square) + + # Assert that the GCF is created with the intended max instance count + gcf = session.cloudfunctionsclient.get_function( + name=square_remote_2.bigframes_cloud_function + ) + assert float(gcf.service_config.available_cpu) == 2.0 + finally: + # clean up the gcp assets created for the remote function + if square_remote is not None: + cleanup_function_assets( + square_remote, session.bqclient, session.cloudfunctionsclient + ) + if square_remote_2 is not 
None: + cleanup_function_assets( + square_remote_2, session.bqclient, session.cloudfunctionsclient + ) + + @pytest.mark.flaky(retries=2, delay=120) def test_df_apply_axis_1(session, scalars_dfs): columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index c2e9036eed..eabd36ab38 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -155,7 +155,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): ) -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_xgbclassifier_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): diff --git a/tests/system/large/ml/test_llm.py b/tests/system/large/ml/test_llm.py index 6e51d14a3a..dba5dc8e4d 100644 --- a/tests/system/large/ml/test_llm.py +++ b/tests/system/large/ml/test_llm.py @@ -63,7 +63,7 @@ def test_create_load_gemini_text_generator_model( "gemini-2.5-flash-lite", ), ) -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_default_params_success( llm_text_df, model_name, session, bq_connection ): diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 1ee60dafd6..643f503c05 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -34,7 +34,11 @@ from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff import bigframes.session._io.bigquery -from bigframes.testing.utils import assert_frame_equal, get_function_name +from bigframes.testing.utils import ( + assert_frame_equal, + assert_series_equal, + get_function_name, +) _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -102,7 +106,7 @@ def get_bq_connection_id_path_format(connection_id_dot_format): return 
f"projects/{fields[0]}/locations/{fields[1]}/connections/{fields[2]}" -# @pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, bigqueryconnection_client, @@ -617,7 +621,7 @@ def bytes_to_hex(mybytes: bytes) -> bytes: )(bytes_to_hex) bf_result = scalars_df.bytes_col.map(remote_bytes_to_hex).to_pandas() - pd.testing.assert_series_equal( + assert_series_equal( bf_result, pd_result, ) @@ -785,7 +789,7 @@ def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_un pd_result = pd_s.apply(func) bf_result = bf_s.apply(func) assert bigframes.dtypes.is_array_string_like(bf_result.dtype) - pd.testing.assert_series_equal( + assert_series_equal( pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False ) @@ -826,7 +830,7 @@ def test_read_gbq_function_runs_existing_udf_2_params_array_output( pd_result = pd_df["col0"].combine(pd_df["col1"], func) bf_result = bf_df["col0"].combine(bf_df["col1"], func) assert bigframes.dtypes.is_array_string_like(bf_result.dtype) - pd.testing.assert_series_equal( + assert_series_equal( pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False ) @@ -881,7 +885,7 @@ def test_read_gbq_function_runs_existing_udf_4_params_array_output( ) bf_result = bf_df.apply(func, axis=1) assert bigframes.dtypes.is_array_string_like(bf_result.dtype) - pd.testing.assert_series_equal( + assert_series_equal( pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False ) @@ -1060,9 +1064,7 @@ def test_read_gbq_function_respects_python_output_type( actual = s.apply(func).to_pandas() # ignore type disparities, e.g. 
"int64" in pandas v/s "Int64" in bigframes - pd.testing.assert_series_equal( - expected, actual, check_dtype=False, check_index_type=False - ) + assert_series_equal(expected, actual, check_dtype=False, check_index_type=False) @pytest.mark.parametrize( @@ -1200,9 +1202,7 @@ def add_ints(row: pandas.Series) -> int: # bf_result.to_numpy() produces an array of numpy.float64's # (in system_prerelease tests), while pd_result.to_numpy() produces an # array of ints, ignore this mismatch by using check_exact=False. - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_exact=False - ) + assert_series_equal(pd_result, bf_result, check_dtype=False, check_exact=False) # Read back the deployed BQ remote function using read_gbq_function. func_ref = session.read_gbq_function( @@ -1215,9 +1215,7 @@ def add_ints(row: pandas.Series) -> int: assert func_ref.bigframes_remote_function == func_ref.bigframes_bigquery_function # type: ignore bf_result_gbq = scalars_df[columns].apply(func_ref, axis=1).to_pandas() - pd.testing.assert_series_equal( - pd_result, bf_result_gbq, check_dtype=False, check_exact=False - ) + assert_series_equal(pd_result, bf_result_gbq, check_dtype=False, check_exact=False) @pytest.mark.flaky(retries=2, delay=120) @@ -1253,9 +1251,7 @@ def add_ints(row: pandas.Series) -> int: # bf_result.to_numpy() produces an array of numpy.float64's # (in system_prerelease tests), while pd_result.to_numpy() produces an # array of ints, ignore this mismatch by using check_exact=False. - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_exact=False - ) + assert_series_equal(pd_result, bf_result, check_dtype=False, check_exact=False) @pytest.mark.flaky(retries=2, delay=120) @@ -1286,9 +1282,7 @@ def add_numbers(row): # bf_result.index[0].dtype is 'string[pyarrow]' while # pd_result.index[0].dtype is 'object', ignore this mismatch by using # check_index_type=False. 
- pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_apply_axis_1_unsupported_callable(scalars_dfs): @@ -1452,7 +1446,7 @@ def is_odd(x: int) -> bool: bf_result = bf_method(is_odd_remote).to_pandas() # ignore any dtype difference - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.flaky(retries=2, delay=120) @@ -1501,7 +1495,7 @@ def add(x: int, y: int) -> int: ) # ignore any dtype difference - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.flaky(retries=2, delay=120) @@ -1563,7 +1557,7 @@ def add_pandas(s: pd.Series) -> float: bf_result = bf_df[bf_filter].apply(add_remote, axis=1).to_pandas() # ignore any dtype difference - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( diff --git a/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py index 2667e482c8..da5baea524 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py @@ -178,17 +178,13 @@ def test_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): "my_project.my_dataset.my_routine" ), signature=udf_def.UdfSignature( - input_types=( - udf_def.UdfField( + inputs=( + udf_def.UdfArg( "x", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.INT64 - ), + udf_def.DirectScalarType(int), ), ), - output_bq_type=bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + output=udf_def.DirectScalarType(float), ), ) ops_map = { @@ -211,23 +207,17 
@@ def test_binary_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): "my_project.my_dataset.my_routine" ), signature=udf_def.UdfSignature( - input_types=( - udf_def.UdfField( + inputs=( + udf_def.UdfArg( "x", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.INT64 - ), + udf_def.DirectScalarType(int), ), - udf_def.UdfField( + udf_def.UdfArg( "y", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + udf_def.DirectScalarType(float), ), ), - output_bq_type=bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + output=udf_def.DirectScalarType(float), ), ) ) @@ -244,29 +234,21 @@ def test_nary_remote_function_op(scalar_types_df: bpd.DataFrame, snapshot): "my_project.my_dataset.my_routine" ), signature=udf_def.UdfSignature( - input_types=( - udf_def.UdfField( + inputs=( + udf_def.UdfArg( "x", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.INT64 - ), + udf_def.DirectScalarType(int), ), - udf_def.UdfField( + udf_def.UdfArg( "y", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + udf_def.DirectScalarType(float), ), - udf_def.UdfField( + udf_def.UdfArg( "z", - bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.STRING - ), + udf_def.DirectScalarType(str), ), ), - output_bq_type=bigquery.StandardSqlDataType( - type_kind=bigquery.StandardSqlTypeNames.FLOAT64 - ), + output=udf_def.DirectScalarType(float), ), ) ) diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index e9e0d0df67..bfb6192a2c 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -12,36 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import re - -import pandas import pytest -import bigframes.exceptions import bigframes.functions.function as bff from bigframes.testing import mocks -def test_series_input_types_to_str(): - """Check that is_row_processor=True uses str as the input type to serialize a row.""" - session = mocks.create_bigquery_session() - remote_function_decorator = bff.remote_function( - session=session, cloud_function_service_account="default" - ) - - with pytest.warns( - bigframes.exceptions.PreviewWarning, - match=re.escape("input_types=Series is in preview."), - ): - - @remote_function_decorator - def axis_1_function(myparam: pandas.Series) -> str: # type: ignore - return "Hello, " + myparam["str_col"] + "!" # type: ignore - - # Still works as a normal function. - assert axis_1_function(pandas.Series({"str_col": "World"})) == "Hello, World!" - - def test_missing_input_types(): session = mocks.create_bigquery_session() remote_function_decorator = bff.remote_function( @@ -78,36 +54,6 @@ def function_without_return_annotation(myparam: int): remote_function_decorator(function_without_return_annotation) -def test_deploy_remote_function(): - session = mocks.create_bigquery_session() - - def my_remote_func(x: int) -> int: - return x * 2 - - deployed = session.deploy_remote_function( - my_remote_func, cloud_function_service_account="test_sa@example.com" - ) - - # Test that the function would have been deployed somewhere. - assert deployed.bigframes_bigquery_function - - -def test_deploy_remote_function_with_name(): - session = mocks.create_bigquery_session() - - def my_remote_func(x: int) -> int: - return x * 2 - - deployed = session.deploy_remote_function( - my_remote_func, - name="my_custom_name", - cloud_function_service_account="test_sa@example.com", - ) - - # Test that the function would have been deployed somewhere. 
- assert "my_custom_name" in deployed.bigframes_bigquery_function - - def test_deploy_udf(): session = mocks.create_bigquery_session() diff --git a/tests/unit/functions/test_remote_function_utils.py b/tests/unit/functions/test_remote_function_utils.py index e200e7c12a..dcf6058767 100644 --- a/tests/unit/functions/test_remote_function_utils.py +++ b/tests/unit/functions/test_remote_function_utils.py @@ -41,82 +41,6 @@ def test_get_remote_function_locations( assert cf_region == expected_cf_region -@pytest.mark.parametrize( - "func_hash, session_id, uniq_suffix, expected_name", - [ - ( - "hash123", - None, - None, - "bigframes-hash123", - ), - ( - "hash456", - "session789", - None, - "bigframes-session789-hash456", - ), - ( - "hash123", - None, - "suffixABC", - "bigframes-hash123-suffixABC", - ), - ( - "hash456", - "session789", - "suffixDEF", - "bigframes-session789-hash456-suffixDEF", - ), - ], -) -def test_get_cloud_function_name(func_hash, session_id, uniq_suffix, expected_name): - """Tests the construction of the cloud function name from its parts.""" - result = _utils.get_cloud_function_name(func_hash, session_id, uniq_suffix) - - assert result == expected_name - - -@pytest.mark.parametrize( - "function_hash, session_id, uniq_suffix, expected_name", - [ - ( - "hash123", - "session456", - None, - "bigframes_session456_hash123", - ), - ( - "hash789", - "sessionABC", - "suffixDEF", - "bigframes_sessionABC_hash789_suffixDEF", - ), - ], -) -def test_get_bigframes_function_name( - function_hash, session_id, uniq_suffix, expected_name -): - """Tests the construction of the BigQuery function name from its parts.""" - result = _utils.get_bigframes_function_name(function_hash, session_id, uniq_suffix) - - assert result == expected_name - - -def test_get_updated_package_requirements_no_extra_package(): - """Tests with no extra package.""" - result = _utils.get_updated_package_requirements(capture_references=False) - - assert result is None - - initial_packages = 
["xgboost"] - result = _utils.get_updated_package_requirements( - initial_packages, capture_references=False - ) - - assert result == initial_packages - - @patch("bigframes.functions._utils.numpy.__version__", "1.24.4") @patch("bigframes.functions._utils.pyarrow.__version__", "14.0.1") @patch("bigframes.functions._utils.pandas.__version__", "2.0.3") @@ -162,7 +86,7 @@ def test_get_updated_package_requirements_capture_references_false(): # Case 1: Only capture_references=False. result_1 = _utils.get_updated_package_requirements(capture_references=False) - assert result_1 is None + assert len(result_1) == 0 # Case 2: capture_references=False but is_row_processor=True. expected_2 = ["numpy==1.24.4", "pandas==2.0.3", "pyarrow==14.0.1"] From 17ecc65e1c0397ef349fca4afcf5a77af72aa798 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 26 Mar 2026 09:35:01 -0700 Subject: [PATCH 15/25] fix: to_gbq may swap data columns when replace table (#2532) Fixes https://github.com/googleapis/python-bigquery-dataframes/issues/2502 --- bigframes/session/bq_caching_executor.py | 16 ++++++++-------- tests/system/small/test_dataframe_io.py | 7 +++++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index c5d6fe3e5f..7cf9d9bd6d 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -247,7 +247,7 @@ def _export_gbq( ) sql = compiled.sql - if (existing_table is not None) and _if_schema_match( + if (existing_table is not None) and _is_schema_match( existing_table.schema, array_value.schema ): # b/409086472: Uses DML for table appends and replacements to avoid @@ -690,16 +690,16 @@ def _result_schema( ) -def _if_schema_match( - table_schema: Tuple[bigquery.SchemaField, ...], schema: schemata.ArraySchema +def _is_schema_match( + table_schema: Tuple[bigquery.SchemaField, ...], + schema: schemata.ArraySchema, ) -> bool: if len(table_schema) != 
len(schema.items): return False - for field in table_schema: - if field.name not in schema.names: + for field, schema_item in zip(table_schema, schema.items): + if field.name != schema_item.column: return False - if bigframes.dtypes.convert_schema_field(field)[1] != schema.get_type( - field.name - ): + _, field_dtype = bigframes.dtypes.convert_schema_field(field) + if field_dtype != schema_item.dtype: return False return True diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 3da3544cbb..b40dcca7d7 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -631,6 +631,13 @@ def test_to_gbq_if_exists_is_replace(scalars_dfs, dataset_id): assert len(gcs_df) == len(scalars_pandas_df) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) + # When replacing a table with same schema but different column order + reordered_df = scalars_df[scalars_df.columns[::-1]] + reordered_df.to_gbq(destination_table, if_exists="replace") + gcs_df = pandas_gbq.read_gbq(destination_table, index_col="rowindex") + assert len(gcs_df) == len(scalars_pandas_df) + pd.testing.assert_index_equal(gcs_df.columns, reordered_df.columns) + # When replacing a table with different schema partitial_scalars_df = scalars_df.drop(columns=["string_col"]) partitial_scalars_df.to_gbq(destination_table, if_exists="replace") From 69fe317612a69aa92f06f0c418c67aa1f9488bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 26 Mar 2026 18:47:58 +0000 Subject: [PATCH 16/25] feat: expose DataFrame.bigquery in both pandas and bigframes DataFrames (#2533) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/bigquery/_operations/ai.py | 19 +++ bigframes/bigquery/_operations/sql.py | 19 +++ bigframes/dataframe.py | 41 ++++- bigframes/extensions/bigframes/__init__.py | 20 +++ .../bigframes/dataframe_accessor.py | 71 +++++++++ bigframes/extensions/core/__init__.py | 20 +++ .../extensions/core/dataframe_accessor.py | 125 ++++++++++++++++ .../extensions/pandas/dataframe_accessor.py | 141 +++++------------- docs/reference/index.rst | 3 +- .../test_bigframes_sql_scalar/out.sql | 4 + .../sqlglot/test_dataframe_accessor.py | 41 +++++ .../bigframes_vendored/pandas/core/frame.py | 9 +- .../pandas/core/tools/datetimes.py | 2 + 13 files changed, 395 insertions(+), 120 deletions(-) create mode 100644 bigframes/extensions/bigframes/__init__.py create mode 100644 bigframes/extensions/bigframes/dataframe_accessor.py create mode 100644 bigframes/extensions/core/__init__.py create mode 100644 bigframes/extensions/core/dataframe_accessor.py create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_bigframes_sql_scalar/out.sql diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 055a5cda79..bb73d02609 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -893,6 +893,25 @@ def forecast( and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). 
+ **Examples:** + + Forecast using a pandas DataFrame: + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> df = pd.DataFrame({"value": [1, 2, 3], "time": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"])}) + >>> bpd.options.display.progress_bar = None # doctest: +SKIP + >>> forecasted_pandas_df = df.bigquery.ai.forecast(data_col="value", timestamp_col="time", horizon=2) # doctest: +SKIP + >>> type(forecasted_pandas_df) # doctest: +SKIP + + + Forecast using a BigFrames DataFrame: + + >>> bf_df = bpd.DataFrame({"value": [1, 2, 3], "time": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"])}) + >>> forecasted_bf_df = bf_df.bigquery.ai.forecast(data_col="value", timestamp_col="time", horizon=2) # doctest: +SKIP + >>> type(forecasted_bf_df) # doctest: +SKIP + + Args: df (DataFrame): The dataframe that contains the data that you want to forecast. It could be either a BigFrames Dataframe or diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index b65dfd2d16..649c7364dd 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -71,6 +71,25 @@ def sql_scalar( 2 4.000000000 dtype: decimal128(38, 9)[pyarrow] + You can also use the `.bigquery` DataFrame accessor to apply a SQL scalar function. 
+ + Compute SQL scalar using a pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame({"x": [1, 2, 3]}) + >>> bpd.options.display.progress_bar = None # doctest: +SKIP + >>> pandas_s = df.bigquery.sql_scalar("POW({0}, 2)") # doctest: +SKIP + >>> type(pandas_s) # doctest: +SKIP + + + Compute SQL scalar using a BigFrames DataFrame: + + >>> bf_df = bpd.DataFrame({"x": [1, 2, 3]}) + >>> bf_s = bf_df.bigquery.sql_scalar("POW({0}, 2)") # doctest: +SKIP + >>> type(bf_s) # doctest: +SKIP + + + Args: sql_template (str): A SQL format string with Python-style {0} placeholders for each of diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 08c2c85e64..a8eb29e50c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -37,6 +37,7 @@ overload, Sequence, Tuple, + TYPE_CHECKING, TypeVar, Union, ) @@ -47,6 +48,8 @@ import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import google.api_core.exceptions import google.cloud.bigquery as bigquery +import google.cloud.bigquery.job +import google.cloud.bigquery.table import numpy import pandas from pandas.api import extensions as pd_ext @@ -91,9 +94,10 @@ import bigframes.session._io.bigquery import bigframes.session.execution_spec as ex_spec -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from _typeshed import SupportsRichComparison + import bigframes.extensions.bigframes.dataframe_accessor as bigquery_accessor import bigframes.session SingleItemValue = Union[ @@ -144,7 +148,7 @@ def __init__( ): global bigframes - self._query_job: Optional[bigquery.QueryJob] = None + self._query_job: Optional[google.cloud.bigquery.job.QueryJob] = None if copy is not None and not copy: raise ValueError( @@ -376,6 +380,25 @@ def bqclient(self) -> bigframes.Session: def _session(self) -> bigframes.Session: return self._get_block().expr.session + @property + def bigquery( + self, + ) -> bigquery_accessor.BigframesBigQueryDataFrameAccessor: + """ + Accessor for BigQuery functionality. 
+ + Returns: + bigframes.extensions.core.dataframe_accessor.BigQueryDataFrameAccessor: + Accessor that exposes BigQuery functionality on a DataFrame, + with method names closer to SQL. + """ + # Import the accessor here to avoid circular imports. + import bigframes.extensions.bigframes.dataframe_accessor + + return bigframes.extensions.bigframes.dataframe_accessor.BigframesBigQueryDataFrameAccessor( + self + ) + @property def _has_index(self) -> bool: return len(self._block.index_columns) > 0 @@ -438,7 +461,9 @@ def _should_sql_have_index(self) -> bool: self.index.name is not None or len(self.index.names) > 1 ) - def _to_placeholder_table(self, dry_run: bool = False) -> bigquery.TableReference: + def _to_placeholder_table( + self, dry_run: bool = False + ) -> google.cloud.bigquery.table.TableReference: """Compiles this DataFrame's expression tree to SQL and saves it to a (temporary) view or table (in the case of a dry run). """ @@ -488,11 +513,11 @@ def sql(self) -> str: ) from e @property - def query_job(self) -> Optional[bigquery.QueryJob]: + def query_job(self) -> Optional[google.cloud.bigquery.job.QueryJob]: """BigQuery job metadata for the most recent query. Returns: - None or google.cloud.bigquery.QueryJob: + None or google.cloud.bigquery.job.QueryJob: The most recent `QueryJob `_. 
""" @@ -606,7 +631,9 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ) return DataFrame(self._block.select_columns(selected_columns)) - def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]): + def _set_internal_query_job( + self, query_job: Optional[google.cloud.bigquery.job.QueryJob] + ): self._query_job = query_job def __getitem__( @@ -1782,7 +1809,7 @@ def _to_pandas_batches( allow_large_results=allow_large_results, ) - def _compute_dry_run(self) -> bigquery.QueryJob: + def _compute_dry_run(self) -> google.cloud.bigquery.job.QueryJob: _, query_job = self._block._compute_dry_run() return query_job diff --git a/bigframes/extensions/bigframes/__init__.py b/bigframes/extensions/bigframes/__init__.py new file mode 100644 index 0000000000..859b51d71c --- /dev/null +++ b/bigframes/extensions/bigframes/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from bigframes.extensions.bigframes.dataframe_accessor import ( + BigframesAIAccessor, + BigframesBigQueryDataFrameAccessor, +) + +__all__ = ["BigframesAIAccessor", "BigframesBigQueryDataFrameAccessor"] diff --git a/bigframes/extensions/bigframes/dataframe_accessor.py b/bigframes/extensions/bigframes/dataframe_accessor.py new file mode 100644 index 0000000000..f58f0d4838 --- /dev/null +++ b/bigframes/extensions/bigframes/dataframe_accessor.py @@ -0,0 +1,71 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import cast, TypeVar + +from bigframes.core.logging import log_adapter +import bigframes.dataframe +import bigframes.extensions.core.dataframe_accessor as core_accessor +import bigframes.series + +T = TypeVar("T", bound="bigframes.dataframe.DataFrame") +S = TypeVar("S", bound="bigframes.series.Series") + + +@log_adapter.class_logger +class BigframesAIAccessor(core_accessor.AIAccessor[T, S]): + """ + BigFrames DataFrame accessor for BigQuery AI functions. 
+ """ + + def __init__(self, bf_obj: T): + super().__init__(bf_obj) + + def _bf_from_dataframe( + self, session: bigframes.session.Session | None + ) -> bigframes.dataframe.DataFrame: + return self._obj + + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + return cast(T, bf_df) + + def _to_series(self, bf_series: bigframes.series.Series) -> S: + return cast(S, bf_series) + + +@log_adapter.class_logger +class BigframesBigQueryDataFrameAccessor(core_accessor.BigQueryDataFrameAccessor[T, S]): + """ + BigFrames DataFrame accessor for BigQuery DataFrames functionality. + """ + + def __init__(self, bf_obj: T): + super().__init__(bf_obj) + + @property + def ai(self) -> BigframesAIAccessor: + return BigframesAIAccessor(self._obj) + + def _bf_from_dataframe( + self, session: bigframes.session.Session | None + ) -> bigframes.dataframe.DataFrame: + return self._obj + + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + return cast(T, bf_df) + + def _to_series(self, bf_series: bigframes.series.Series) -> S: + return cast(S, bf_series) diff --git a/bigframes/extensions/core/__init__.py b/bigframes/extensions/core/__init__.py new file mode 100644 index 0000000000..41b554c99e --- /dev/null +++ b/bigframes/extensions/core/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from bigframes.extensions.core.dataframe_accessor import ( + AIAccessor, + BigQueryDataFrameAccessor, +) + +__all__ = ["AIAccessor", "BigQueryDataFrameAccessor"] diff --git a/bigframes/extensions/core/dataframe_accessor.py b/bigframes/extensions/core/dataframe_accessor.py new file mode 100644 index 0000000000..02c13e4555 --- /dev/null +++ b/bigframes/extensions/core/dataframe_accessor.py @@ -0,0 +1,125 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import abc +from typing import Generic, Iterable, TYPE_CHECKING, TypeVar + +if TYPE_CHECKING: + import bigframes.dataframe + import bigframes.session + +T = TypeVar("T") +S = TypeVar("S") + + +class AbstractBigQueryDataFrameAccessor(abc.ABC, Generic[T, S]): + @abc.abstractmethod + def _bf_from_dataframe( + self, session: bigframes.session.Session | None + ) -> bigframes.dataframe.DataFrame: + """Convert the accessor's object to a BigFrames DataFrame.""" + + @abc.abstractmethod + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + """Convert a BigFrames DataFrame to the accessor's object type.""" + + @abc.abstractmethod + def _to_series(self, bf_series: bigframes.series.Series) -> S: + """Convert a BigFrames Series to the accessor's object type.""" + + +class AIAccessor(AbstractBigQueryDataFrameAccessor[T, S]): + """ + DataFrame accessor for BigQuery AI functions. 
+ """ + + def __init__(self, obj: T): + self._obj = obj + + def forecast( + self, + *, + data_col: str, + timestamp_col: str, + model: str = "TimesFM 2.0", + id_cols: Iterable[str] | None = None, + horizon: int = 10, + confidence_level: float = 0.95, + context_window: int | None = None, + output_historical_time_series: bool = False, + session: bigframes.session.Session | None = None, + ) -> T: + """ + Forecast time series at future horizon using BigQuery AI.FORECAST. + + This is an accessor for :func:`bigframes.bigquery.ai.forecast`. See that + function's documentation for detailed parameter descriptions and examples. + """ + import bigframes.bigquery.ai + + bf_df = self._bf_from_dataframe(session) + result = bigframes.bigquery.ai.forecast( + bf_df, + data_col=data_col, + timestamp_col=timestamp_col, + model=model, + id_cols=id_cols, + horizon=horizon, + confidence_level=confidence_level, + context_window=context_window, + output_historical_time_series=output_historical_time_series, + ) + return self._to_dataframe(result) + + +class BigQueryDataFrameAccessor(AbstractBigQueryDataFrameAccessor[T, S]): + """ + DataFrame accessor for BigQuery DataFrames functionality. + """ + + def __init__(self, obj: T): + self._obj = obj + + @property + @abc.abstractmethod + def ai(self) -> AIAccessor: + """ + Accessor for BigQuery AI functions. + + Returns: + AIAccessor: Accessor for BigQuery AI functions. + """ + + def sql_scalar( + self, + sql_template: str, + *, + output_dtype=None, + session: bigframes.session.Session | None = None, + ) -> S: + """ + Compute a new Series by applying a SQL scalar function to the DataFrame. + + This is an accessor for :func:`bigframes.bigquery.sql_scalar`. See that + function's documentation for detailed parameter descriptions and examples. 
+ """ + import bigframes.bigquery + + bf_df = self._bf_from_dataframe(session) + result = bigframes.bigquery.sql_scalar( + sql_template, bf_df, output_dtype=output_dtype + ) + return self._to_series(result) diff --git a/bigframes/extensions/pandas/dataframe_accessor.py b/bigframes/extensions/pandas/dataframe_accessor.py index ad75386f1c..3edb8ebe14 100644 --- a/bigframes/extensions/pandas/dataframe_accessor.py +++ b/bigframes/extensions/pandas/dataframe_accessor.py @@ -12,145 +12,72 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import cast, Iterable, Optional +from typing import cast, TypeVar import pandas import pandas.api.extensions import bigframes.core.global_session as bf_session from bigframes.core.logging import log_adapter +import bigframes.dataframe +from bigframes.extensions.core.dataframe_accessor import ( + AIAccessor, + BigQueryDataFrameAccessor, +) import bigframes.pandas as bpd +T = TypeVar("T", bound="pandas.DataFrame") +S = TypeVar("S", bound="pandas.Series") + @log_adapter.class_logger -class PandasAIAccessor: +class PandasAIAccessor(AIAccessor[T, S]): """ Pandas DataFrame accessor for BigQuery AI functions. """ - def __init__(self, pandas_obj: pandas.DataFrame): - self._obj = pandas_obj - - def forecast( - self, - *, - data_col: str, - timestamp_col: str, - model: str = "TimesFM 2.0", - id_cols: Optional[Iterable[str]] = None, - horizon: int = 10, - confidence_level: float = 0.95, - context_window: Optional[int] = None, - output_historical_time_series: bool = False, - session=None, - ) -> pandas.DataFrame: - """ - Forecast time series at future horizon using BigQuery AI.FORECAST. - - See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-forecast - - Args: - data_col (str): - A str value that specifies the name of the data column. The data column contains the data to forecast. 
- The data column must use one of the following data types: INT64, NUMERIC and FLOAT64 - timestamp_col (str): - A str value that specified the name of the time points column. - The time points column provides the time points used to generate the forecast. - The time points column must use one of the following data types: TIMESTAMP, DATE and DATETIME - model (str, default "TimesFM 2.0"): - A str value that specifies the name of the model. "TimesFM 2.0" and "TimesFM 2.5" are supported. - id_cols (Iterable[str], optional): - An iterable of str value that specifies the names of one or more ID columns. Each ID identifies a unique time series to forecast. - Specify one or more values for this argument in order to forecast multiple time series using a single query. - The columns that you specify must use one of the following data types: STRING, INT64, ARRAY and ARRAY - horizon (int, default 10): - An int value that specifies the number of time points to forecast. The default value is 10. The valid input range is [1, 10,000]. - confidence_level (float, default 0.95): - A FLOAT64 value that specifies the percentage of the future values that fall in the prediction interval. - The default value is 0.95. The valid input range is [0, 1). - context_window (int, optional): - An int value that specifies the context window length used by BigQuery ML's built-in TimesFM model. - The context window length determines how many of the most recent data points from the input time series are use by the model. - If you don't specify a value, the AI.FORECAST function automatically chooses the smallest possible context window length to use - that is still large enough to cover the number of time series data points in your input data. - output_historical_time_series (bool, default False): - A boolean value that determines whether to include the input time series history in the forecast. - session (bigframes.session.Session, optional): - The BigFrames session to use. 
If not provided, the default global session is used. - - Returns: - pandas.DataFrame: - The forecast DataFrame result. - """ - import bigframes.bigquery.ai + def __init__(self, pandas_obj: T): + super().__init__(pandas_obj) + def _bf_from_dataframe( + self, session: bigframes.session.Session | None + ) -> bigframes.dataframe.DataFrame: if session is None: session = bf_session.get_global_session() - bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj)) - result = bigframes.bigquery.ai.forecast( - bf_df, - data_col=data_col, - timestamp_col=timestamp_col, - model=model, - id_cols=id_cols, - horizon=horizon, - confidence_level=confidence_level, - context_window=context_window, - output_historical_time_series=output_historical_time_series, - ) - return result.to_pandas(ordered=True) + return cast(bpd.DataFrame, session.read_pandas(self._obj)) + + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + return cast(T, bf_df.to_pandas(ordered=True)) + + def _to_series(self, bf_series: bigframes.series.Series) -> S: + return cast(S, bf_series.to_pandas(ordered=True)) @pandas.api.extensions.register_dataframe_accessor("bigquery") @log_adapter.class_logger -class PandasBigQueryDataFrameAccessor: +class PandasBigQueryDataFrameAccessor(BigQueryDataFrameAccessor[T, S]): """ Pandas DataFrame accessor for BigQuery DataFrames functionality. This accessor is registered under the ``bigquery`` namespace on pandas DataFrame objects. """ - def __init__(self, pandas_obj: pandas.DataFrame): - self._obj = pandas_obj + def __init__(self, pandas_obj: T): + super().__init__(pandas_obj) @property - def ai(self) -> "PandasAIAccessor": - """ - Accessor for BigQuery AI functions. - """ + def ai(self) -> PandasAIAccessor: return PandasAIAccessor(self._obj) - def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None): - """ - Compute a new pandas Series by applying a SQL scalar function to the DataFrame. 
- - The DataFrame is converted to BigFrames by calling ``read_pandas``, then the SQL - template is applied using ``bigframes.bigquery.sql_scalar``, and the result is - converted back to a pandas Series using ``to_pandas``. - - Args: - sql_template (str): - A SQL format string with Python-style {0}, {1}, etc. placeholders for each of - the columns in the DataFrame (in the order they appear in ``df.columns``). - output_dtype (a BigQuery DataFrames compatible dtype, optional): - If provided, BigQuery DataFrames uses this to determine the output - of the returned Series. This avoids a dry run query. - session (bigframes.session.Session, optional): - The BigFrames session to use. If not provided, the default global session is used. - - Returns: - pandas.Series: - The result of the SQL scalar function as a pandas Series. - """ - # Import bigframes.bigquery here to avoid circular imports - import bigframes.bigquery - + def _bf_from_dataframe(self, session) -> bigframes.dataframe.DataFrame: if session is None: session = bf_session.get_global_session() - bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj)) - result = bigframes.bigquery.sql_scalar( - sql_template, bf_df, output_dtype=output_dtype - ) - return result.to_pandas(ordered=True) + return cast(bpd.DataFrame, session.read_pandas(self._obj)) + + def _to_dataframe(self, bf_df: bigframes.dataframe.DataFrame) -> T: + return cast(T, bf_df.to_pandas(ordered=True)) + + def _to_series(self, bf_series: bigframes.series.Series) -> S: + return cast(S, bf_series.to_pandas(ordered=True)) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 0de668c4fa..0ddfa5f0e3 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -27,7 +27,8 @@ BigQuery DataFrames provides extensions to pandas DataFrame objects. .. 
autosummary:: :toctree: api - bigframes.extensions.pandas + bigframes.extensions.core.dataframe_accessor.BigQueryDataFrameAccessor + bigframes.extensions.core.dataframe_accessor.AIAccessor ML APIs ~~~~~~~ diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_bigframes_sql_scalar/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_bigframes_sql_scalar/out.sql new file mode 100644 index 0000000000..14853067c7 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_bigframes_sql_scalar/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + ROUND(`int64_col` + `int64_too`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py index 364f738353..7b3ee5f922 100644 --- a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py +++ b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py @@ -82,3 +82,44 @@ def mock_ai_forecast(df, **kwargs): "context_window": None, "output_historical_time_series": False, } + + +def test_bigframes_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot): + session = mock.create_autospec(bigframes.session.Session) + + result = scalar_types_df.bigquery.sql_scalar( + "ROUND({int64_col} + {int64_too})", + output_dtype=pd.Int64Dtype(), + session=session, + ) + + session.read_pandas.assert_not_called() + # Bigframes implementation returns a bigframes.series.Series + sql, _, _ = result.to_frame()._to_sql_query(include_index=True) + snapshot.assert_match(sql, "out.sql") + + +def test_bigframes_ai_forecast(snapshot, monkeypatch): + import bigframes.bigquery.ai + import bigframes.session + + session = mock.create_autospec(bigframes.session.Session) + bf_df = mock.create_autospec(bpd.DataFrame) + + def mock_ai_forecast(df, **kwargs): + assert df is bf_df + result_df = 
mock.create_autospec(bpd.DataFrame) + return result_df + + monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_ai_forecast) + + result = bf_df.bigquery.ai.forecast( + timestamp_col="date", + data_col="value", + horizon=5, + session=session, + ) + + session.read_pandas.assert_not_called() + # BigFrames accessor returns the bf_df directly without calling to_pandas + assert result is not None diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f04d9989dd..09b2a4045f 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2026,7 +2026,6 @@ def where(self, cond, other): **Examples:** - >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2097,14 +2096,14 @@ def where(self, cond, other): with corresponding value from other. If cond is callable, it is computed on the Series/DataFrame and returns boolean Series/DataFrame or array. The callable must not change input - Series/DataFrame (though pandas doesn’t check it). + Series/DataFrame. other (scalar, DataFrame, or callable): Entries where cond is False are replaced with corresponding value from other. If other is callable, it is computed on the DataFrame and returns scalar or DataFrame. The callable must not - change input DataFrame (though pandas doesn’t check it). If not - specified, entries will be filled with the corresponding NULL - value (np.nan for numpy dtypes, pd.NA for extension dtypes). + change input DataFrame. If not specified, entries will be filled + with the corresponding NULL value (np.nan for numpy dtypes, + pd.NA for extension dtypes). Returns: DataFrame: DataFrame after the replacement. 
diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 655f801b3d..c5f9f8330f 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -1,5 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py +from __future__ import annotations + from datetime import date, datetime from typing import List, Mapping, Tuple, Union From 44e0ffd947e9db66ab612f92de6e31f1085e7968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 26 Mar 2026 20:57:33 +0000 Subject: [PATCH 17/25] docs: Rename Blob column references to ObjectRef column (#2535) Updates documentation and internal comments to use the term "ObjectRef column" instead of "Blob column", as per the official BigQuery documentation. Links to the documentation are included in user-facing docstrings. 
--- *PR created automatically by Jules for task [15739234298342142432](https://jules.google.com/task/15739234298342142432) started by @tswast* Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: tswast <247555+tswast@users.noreply.github.com> --- bigframes/dataframe.py | 2 +- bigframes/ml/llm.py | 8 ++++---- bigframes/operations/ai.py | 2 +- bigframes/operations/semantics.py | 4 ++-- bigframes/session/__init__.py | 12 ++++++------ 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a8eb29e50c..1ac80a4e6a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -817,7 +817,7 @@ def __repr__(self) -> str: ) def _get_display_df_and_blob_cols(self) -> tuple[DataFrame, list[str]]: - """Process blob columns for display.""" + """Process ObjectRef columns for display.""" df = self blob_cols = [] if bigframes.options.display.blob_display: diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 68842961e3..78f592a10e 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -286,7 +286,7 @@ class MultimodalEmbeddingGenerator(base.RetriableRemotePredictor): """Multimodal embedding generator LLM model. .. note:: - BigFrames Blob is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + BigFrames ObjectRef is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). @@ -374,7 +374,7 @@ def predict( Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. 
If multiple columns are in the DataFrame, it must contain a "content" column for prediction. - The content column must be of string type or BigFrames Blob of image or video. + The content column must be of string type or BigFrames `ObjectRef `_ of image or video. max_retries (int, default 0): Max number of retries if the prediction for any rows failed. Each try needs to make progress (i.e. has successfully predicted rows) to continue the retry. @@ -668,13 +668,13 @@ def predict( prompt (Iterable of str or bigframes.series.Series, or None, default None): .. note:: - BigFrames Blob is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + BigFrames ObjectRef is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). Construct a prompt struct column for prediction based on the input. The input must be an Iterable that can take string literals, - such as "summarize", string column(s) of X, such as X["str_col"], or blob column(s) of X, such as X["blob_col"]. + such as "summarize", string column(s) of X, such as X["str_col"], or `ObjectRef column(s) `_ of X, such as X["objectref_col"]. It creates a struct column of the items of the iterable, and use the concatenated result as the input prompt. No-op if set to None. output_schema (Mapping[str, str] or None, default None): The schema used to generate structured output as a bigframes DataFrame. The schema is a string key-value pair of :. 
diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 6921299acd..6516620b53 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -203,7 +203,7 @@ def map( has_blob_column = False for column in columns: if df[column].dtype == dtypes.OBJ_REF_DTYPE: - # Don't cast blob columns to string + # Don't cast ObjectRef columns to string has_blob_column = True continue diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index f237959d0d..b445698630 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -382,7 +382,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals has_blob_column = False for column in columns: if df[column].dtype == dtypes.OBJ_REF_DTYPE: - # Don't cast blob columns to string + # Don't cast ObjectRef columns to string has_blob_column = True continue @@ -501,7 +501,7 @@ def map( has_blob_column = False for column in columns: if df[column].dtype == dtypes.OBJ_REF_DTYPE: - # Don't cast blob columns to string + # Don't cast ObjectRef columns to string has_blob_column = True continue diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0a2f2db189..710b3701fa 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -2234,12 +2234,12 @@ def _create_temp_table( def from_glob_path( self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None ) -> dataframe.DataFrame: - r"""Create a BigFrames DataFrame that contains a BigFrames Blob column from a global wildcard path. + r"""Create a BigFrames DataFrame that contains a BigFrames `ObjectRef column `_ from a global wildcard path. This operation creates a temporary BQ Object Table under the hood and requires bigquery.connections.delegate permission or BigQuery Connection Admin role. If you have an existing BQ Object Table, use read_gbq_object_table(). .. 
note:: - BigFrames Blob is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + BigFrames ObjectRef is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). @@ -2252,7 +2252,7 @@ def from_glob_path( If None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach permission if the connection isn't fully set up. name (str): - The column name of the Blob column. + The column name of the ObjectRef column. Returns: bigframes.pandas.DataFrame: Result BigFrames DataFrame. @@ -2295,18 +2295,18 @@ def _create_bq_connection( def read_gbq_object_table( self, object_table: str, *, name: Optional[str] = None ) -> dataframe.DataFrame: - """Read an existing object table to create a BigFrames Blob DataFrame. Use the connection of the object table for the connection of the blob. + """Read an existing object table to create a BigFrames `ObjectRef `_ DataFrame. Use the connection of the object table for the connection of the ObjectRef. This function doesn't retrieve the object table data. If you want to read the data, use read_gbq() instead. .. note:: - BigFrames Blob is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + BigFrames ObjectRef is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). Args: object_table (str): name of the object table of form ... 
- name (str or None): the returned blob column name. + name (str or None): the returned ObjectRef column name. Returns: bigframes.pandas.DataFrame: From 677d6cc8d510dcf62a5ca20e781d1792d743e661 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 27 Mar 2026 11:06:03 -0700 Subject: [PATCH 18/25] refactor: Simplify @udf code (#2539) --- bigframes/functions/_function_client.py | 90 ++++++++--------- bigframes/functions/_function_session.py | 96 +++++++++---------- bigframes/functions/_utils.py | 12 --- bigframes/functions/function_template.py | 34 +++---- bigframes/functions/udf_def.py | 40 +++++++- bigframes/testing/utils.py | 14 --- .../small/functions/test_remote_function.py | 20 +++- .../functions/test_remote_function_utils.py | 72 -------------- 8 files changed, 151 insertions(+), 227 deletions(-) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index 4b368f48cc..fc06465327 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -15,7 +15,6 @@ from __future__ import annotations -import inspect import logging import os import random @@ -25,7 +24,7 @@ import tempfile import textwrap import types -from typing import Any, cast, Optional, Sequence, TYPE_CHECKING +from typing import Any, cast, Optional, TYPE_CHECKING import warnings import requests @@ -87,7 +86,6 @@ def __init__( bq_location, bq_dataset, bq_client, - bq_connection_id, bq_connection_manager, cloud_function_region=None, cloud_functions_client=None, @@ -102,7 +100,6 @@ def __init__( self._bq_location = bq_location self._bq_dataset = bq_dataset self._bq_client = bq_client - self._bq_connection_id = bq_connection_id self._bq_connection_manager = bq_connection_manager self._session = session @@ -114,12 +111,12 @@ def __init__( self._cloud_function_docker_repository = cloud_function_docker_repository self._cloud_build_service_account = cloud_build_service_account - def _create_bq_connection(self) -> None: + def 
_create_bq_connection(self, connection_id: str) -> None: if self._bq_connection_manager: self._bq_connection_manager.create_bq_connection( self._gcp_project_id, self._bq_location, - self._bq_connection_id, + connection_id, "run.invoker", ) @@ -174,7 +171,7 @@ def create_bq_remote_function( ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" - self._create_bq_connection() + self._create_bq_connection(udf_def.connection_id) # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 @@ -202,7 +199,7 @@ def create_bq_remote_function( create_function_ddl = f""" CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name_escaped}({udf_def.signature.to_sql_input_signature()}) RETURNS {udf_def.signature.with_devirtualize().output.sql_type} - REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` + REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{udf_def.connection_id}` OPTIONS ({remote_function_options_str})""" logger.info(f"Creating BQ remote function: {create_function_ddl}") @@ -212,26 +209,15 @@ def create_bq_remote_function( def provision_bq_managed_function( self, - func, - input_types: Sequence[str], - output_type: str, name: Optional[str], - packages: Optional[Sequence[str]], - max_batching_rows: Optional[int], - container_cpu: Optional[float], - container_memory: Optional[str], - is_row_processor: bool, - bq_connection_id, - *, - capture_references: bool = False, + config: udf_def.ManagedFunctionConfig, ): """Create a BigQuery managed function.""" # TODO(b/406283812): Expose the capability to pass down # capture_references=True in the public udf API. - # TODO(b/495508827): Include all config in the value hash. 
if ( - capture_references + config.capture_references and (python_version := _utils.get_python_version()) != _MANAGED_FUNC_PYTHON_VERSION ): @@ -241,31 +227,26 @@ def provision_bq_managed_function( ) # Create BQ managed function. - bq_function_args = [] - bq_function_return_type = output_type - - input_args = inspect.getargs(func.__code__).args - # We expect the input type annotations to be 1:1 with the input args. - for name_, type_ in zip(input_args, input_types): - bq_function_args.append(f"{name_} {type_}") + bq_function_args = config.signature.to_sql_input_signature() + bq_function_return_type = config.signature.with_devirtualize().output.sql_type managed_function_options: dict[str, Any] = { "runtime_version": _MANAGED_FUNC_PYTHON_VERSION, "entry_point": "bigframes_handler", } - if max_batching_rows: - managed_function_options["max_batching_rows"] = max_batching_rows - if container_cpu: - managed_function_options["container_cpu"] = container_cpu - if container_memory: - managed_function_options["container_memory"] = container_memory + if config.max_batching_rows: + managed_function_options["max_batching_rows"] = config.max_batching_rows + if config.container_cpu: + managed_function_options["container_cpu"] = config.container_cpu + if config.container_memory: + managed_function_options["container_memory"] = config.container_memory # Augment user package requirements with any internal package # requirements. packages = _utils.get_updated_package_requirements( - packages or [], - is_row_processor, - capture_references, + config.code.package_requirements or [], + config.signature.is_row_processor, + config.capture_references, ignore_package_version=True, ) if packages: @@ -276,26 +257,20 @@ def provision_bq_managed_function( bq_function_name = name if not bq_function_name: - # Compute a unique hash representing the user code. 
- function_hash = _utils.get_hash(func, packages) - bq_function_name = _utils.get_managed_function_name( - function_hash, - # session-scope in absensce of name from user - # name indicates permanent allocation - None if name else self._session.session_id, + # Compute a unique hash representing the artifact definition. + bq_function_name = get_managed_function_name( + config, self._session.session_id ) persistent_func_id = ( f"`{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}" ) - udf_name = func.__name__ - with_connection_clause = ( ( - f"WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}`" + f"WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{config.bq_connection_id}`" ) - if bq_connection_id + if config.bq_connection_id else "" ) @@ -303,13 +278,13 @@ def provision_bq_managed_function( # including the user's function, necessary imports, and the BigQuery # handler wrapper. python_code_block = bff_template.generate_managed_function_code( - func, udf_name, is_row_processor, capture_references + config.code, config.signature, config.capture_references ) create_function_ddl = ( textwrap.dedent( f""" - CREATE OR REPLACE FUNCTION {persistent_func_id}({','.join(bq_function_args)}) + CREATE OR REPLACE FUNCTION {persistent_func_id}({bq_function_args}) RETURNS {bq_function_return_type} LANGUAGE python {with_connection_clause} @@ -590,6 +565,7 @@ def provision_bq_remote_function( cloud_function_memory_mib: int | None, cloud_function_cpus: float | None, cloud_function_ingress_settings: str, + bq_connection_id: str, ): """Provision a BigQuery remote function.""" # Augment user package requirements with any internal package @@ -657,7 +633,7 @@ def provision_bq_remote_function( intended_rf_spec = udf_def.RemoteFunctionConfig( endpoint=cf_endpoint, - connection_id=self._bq_connection_id, + connection_id=bq_connection_id, max_batching_rows=max_batching_rows or 1000, signature=func_signature, 
bq_metadata=func_signature.protocol_metadata, @@ -731,6 +707,18 @@ def get_bigframes_function_name( return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) +def get_managed_function_name( + function_def: udf_def.ManagedFunctionConfig, + session_id: str | None = None, +): + """Get a name for the bigframes managed function for the given user defined function.""" + parts = [_BIGFRAMES_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_def.stable_hash().hex()) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + def _validate_routine_name(name: str) -> None: """Validate that the given name is a valid BigQuery routine name.""" # Routine IDs can contain only letters (a-z, A-Z), numbers (0-9), or underscores (_) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 85753a71ce..fe7889e955 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -556,34 +556,13 @@ def wrapper(func): func, **signature_kwargs, ) - if input_types is not None: - if not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - if _utils.has_conflict_input_type(py_sig, input_types): - msg = bfe.format_message( - "Conflicting input types detected, using the one from the decorator." - ) - warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) - py_sig = py_sig.replace( - parameters=[ - par.replace(annotation=itype) - for par, itype in zip(py_sig.parameters.values(), input_types) - ] - ) - if output_type: - if _utils.has_conflict_output_type(py_sig, output_type): - msg = bfe.format_message( - "Conflicting return type detected, using the one from the decorator." 
- ) - warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) - py_sig = py_sig.replace(return_annotation=output_type) + py_sig = _resolve_signature(py_sig, input_types, output_type) remote_function_client = _function_client.FunctionClient( dataset_ref.project, bq_location, dataset_ref.dataset_id, bigquery_client, - bq_connection_id, bq_connection_manager, cloud_function_region, cloud_functions_client, @@ -618,6 +597,7 @@ def wrapper(func): cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_cpus=cloud_function_cpus, cloud_function_ingress_settings=cloud_function_ingress_settings, + bq_connection_id=bq_connection_id, ) bigframes_cloud_function = ( @@ -840,27 +820,7 @@ def wrapper(func): func, **signature_kwargs, ) - if input_types is not None: - if not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - if _utils.has_conflict_input_type(py_sig, input_types): - msg = bfe.format_message( - "Conflicting input types detected, using the one from the decorator." - ) - warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) - py_sig = py_sig.replace( - parameters=[ - par.replace(annotation=itype) - for par, itype in zip(py_sig.parameters.values(), input_types) - ] - ) - if output_type: - if _utils.has_conflict_output_type(py_sig, output_type): - msg = bfe.format_message( - "Conflicting return type detected, using the one from the decorator." - ) - warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) - py_sig = py_sig.replace(return_annotation=output_type) + py_sig = _resolve_signature(py_sig, input_types, output_type) # The function will actually be receiving a pandas Series, but allow # both BigQuery DataFrames and pandas object types for compatibility. 
@@ -872,22 +832,22 @@ def wrapper(func): bq_location, dataset_ref.dataset_id, bigquery_client, - bq_connection_id, bq_connection_manager, session=session, # type: ignore ) - - bq_function_name = managed_function_client.provision_bq_managed_function( - func=func, - input_types=tuple(arg.sql_type for arg in udf_sig.inputs), - output_type=udf_sig.output.sql_type, - name=name, - packages=packages, + config = udf_def.ManagedFunctionConfig( + code=udf_def.CodeDef.from_func(func), + signature=udf_sig, max_batching_rows=max_batching_rows, container_cpu=container_cpu, container_memory=container_memory, - is_row_processor=udf_sig.is_row_processor, bq_connection_id=bq_connection_id, + capture_references=False, + ) + + bq_function_name = managed_function_client.provision_bq_managed_function( + name=name, + config=config, ) full_rf_name = ( managed_function_client.get_remote_function_fully_qualilfied_name( @@ -907,12 +867,14 @@ def wrapper(func): if udf_sig.is_row_processor: msg = bfe.format_message("input_types=Series is in preview.") warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) + assert session is not None # appease mypy return decorator( bq_functions.BigqueryCallableRowRoutine( udf_definition, session, local_func=func, is_managed=True ) ) else: + assert session is not None # appease mypy return decorator( bq_functions.BigqueryCallableRoutine( udf_definition, @@ -949,3 +911,33 @@ def deploy_udf( # TODO(tswast): If we update udf to defer deployment, update this method # to deploy immediately. 
return self.udf(**kwargs)(func) + + +def _resolve_signature( + py_sig: inspect.Signature, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, +) -> inspect.Signature: + if input_types is not None: + if not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + if _utils.has_conflict_input_type(py_sig, input_types): + msg = bfe.format_message( + "Conflicting input types detected, using the one from the decorator." + ) + warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) + py_sig = py_sig.replace( + parameters=[ + par.replace(annotation=itype) + for par, itype in zip(py_sig.parameters.values(), input_types) + ] + ) + if output_type: + if _utils.has_conflict_output_type(py_sig, output_type): + msg = bfe.format_message( + "Conflicting return type detected, using the one from the decorator." + ) + warnings.warn(msg, category=bfe.FunctionConflictTypeHintWarning) + py_sig = py_sig.replace(return_annotation=output_type) + + return py_sig diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py index c197ed14fc..e02cd94fb1 100644 --- a/bigframes/functions/_utils.py +++ b/bigframes/functions/_utils.py @@ -186,18 +186,6 @@ def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> s return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" -def get_managed_function_name( - function_hash: str, - session_id: str | None = None, -): - """Get a name for the bigframes managed function for the given user defined function.""" - parts = [_BIGFRAMES_FUNCTION_PREFIX] - if session_id: - parts.append(session_id) - parts.append(function_hash) - return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) - - # Deprecated: Use CodeDef.stable_hash() instead. def get_hash(def_, package_requirements=None): "Get hash (32 digits alphanumeric) of a function." 
diff --git a/bigframes/functions/function_template.py b/bigframes/functions/function_template.py index 31b5b20520..33a3688cf1 100644 --- a/bigframes/functions/function_template.py +++ b/bigframes/functions/function_template.py @@ -20,8 +20,6 @@ import re import textwrap -import cloudpickle - from bigframes.functions import udf_def logger = logging.getLogger(__name__) @@ -230,9 +228,10 @@ def generate_udf_code(code_def: udf_def.CodeDef, directory: str): udf_pickle_file_name = "udf.cloudpickle" # original code, only for debugging purpose - udf_code_file_path = os.path.join(directory, udf_code_file_name) - with open(udf_code_file_path, "w") as f: - f.write(code_def.function_source) + if code_def.function_source: + udf_code_file_path = os.path.join(directory, udf_code_file_name) + with open(udf_code_file_path, "w") as f: + f.write(code_def.function_source) # serialized udf udf_pickle_file_path = os.path.join(directory, udf_pickle_file_name) @@ -293,35 +292,37 @@ def generate_cloud_function_main_code( def generate_managed_function_code( - def_, - udf_name: str, - is_row_processor: bool, + code_def: udf_def.CodeDef, + signature: udf_def.UdfSignature, capture_references: bool, ) -> str: """Generates the Python code block for managed Python UDF.""" + udf_name = "unpickled_udf" if capture_references: # This code path ensures that if the udf body contains any # references to variables and/or imports outside the body, they are # captured as well. - pickled = cloudpickle.dumps(def_) func_code = textwrap.dedent( f""" import cloudpickle - {udf_name} = cloudpickle.loads({pickled}) + {udf_name} = cloudpickle.loads({code_def.pickled_code!r}) """ ) else: # This code path ensures that if the udf body is self contained, # i.e. there are no references to variables or imports outside the # body. 
- func_code = textwrap.dedent(inspect.getsource(def_)) + assert code_def.function_source is not None + assert code_def.entry_point is not None + func_code = code_def.function_source + udf_name = code_def.entry_point match = re.search(r"^def ", func_code, flags=re.MULTILINE) if match is None: raise ValueError("The UDF is not defined correctly.") func_code = func_code[match.start() :] - if is_row_processor: + if signature.is_row_processor: udf_code = textwrap.dedent(inspect.getsource(get_pd_series)) udf_code = udf_code[udf_code.index("def") :] bigframes_handler_code = textwrap.dedent( @@ -331,20 +332,19 @@ def bigframes_handler(str_arg): """ ) - sig = inspect.signature(def_) - params = list(sig.parameters.values()) + params = list(arg.name for arg in signature.inputs) additional_params = params[1:] # Build the parameter list for the new handler function definition. # e.g., "str_arg, y: bool, z" handler_def_parts = ["str_arg"] - handler_def_parts.extend(str(p) for p in additional_params) + handler_def_parts.extend(additional_params) handler_def_str = ", ".join(handler_def_parts) # Build the argument list for the call to the original UDF. # e.g., "get_pd_series(str_arg), y, z" udf_call_parts = [f"{get_pd_series.__name__}(str_arg)"] - udf_call_parts.extend(p.name for p in additional_params) + udf_call_parts.extend(additional_params) udf_call_str = ", ".join(udf_call_parts) bigframes_handler_code = textwrap.dedent( @@ -364,7 +364,7 @@ def bigframes_handler(*args): ) udf_code_block = [] - if not capture_references and is_row_processor: + if not capture_references and signature.is_row_processor: # Enable postponed evaluation of type annotations. 
This converts all # type hints to strings at runtime, which is necessary for correctly # handling the type annotation of pandas.Series after the UDF code is diff --git a/bigframes/functions/udf_def.py b/bigframes/functions/udf_def.py index f02f289ef6..3ebf2eeb47 100644 --- a/bigframes/functions/udf_def.py +++ b/bigframes/functions/udf_def.py @@ -19,7 +19,7 @@ import io import os import textwrap -from typing import Any, cast, get_args, get_origin, Sequence, Type +from typing import Any, cast, get_args, get_origin, Optional, Sequence, Type import warnings import cloudpickle @@ -401,18 +401,26 @@ class CodeDef: # Produced by cloudpickle, not compatible across python versions pickled_code: bytes # This is just the function itself, and does not include referenced objects/functions/modules - function_source: str + function_source: Optional[str] + entry_point: Optional[str] package_requirements: tuple[str, ...] @classmethod def from_func(cls, func, package_requirements: Sequence[str] | None = None): bytes_io = io.BytesIO() cloudpickle.dump(func, bytes_io, protocol=_pickle_protocol_version) - # this is hacky, but works for some nested functions - source = textwrap.dedent(inspect.getsource(func)) + source = None + entry_point = None + try: + # dedent is hacky, but works for some nested functions + source = textwrap.dedent(inspect.getsource(func)) + entry_point = func.__name__ + except OSError: + pass return cls( pickled_code=bytes_io.getvalue(), function_source=source, + entry_point=entry_point, package_requirements=tuple(package_requirements or []), ) @@ -448,6 +456,30 @@ def stable_hash(self) -> bytes: return hash_val.digest() +@dataclasses.dataclass(frozen=True) +class ManagedFunctionConfig: + code: CodeDef + signature: UdfSignature + max_batching_rows: Optional[int] + container_cpu: Optional[float] + container_memory: Optional[str] + bq_connection_id: Optional[str] + # capture_references=True -> deploy as cloudpickle + # capture_references=False -> deploy as source + 
capture_references: bool = False + + def stable_hash(self) -> bytes: + hash_val = google_crc32c.Checksum() + hash_val.update(self.code.stable_hash()) + hash_val.update(self.signature.stable_hash()) + hash_val.update(str(self.max_batching_rows).encode()) + hash_val.update(str(self.container_cpu).encode()) + hash_val.update(str(self.container_memory).encode()) + hash_val.update(str(self.bq_connection_id).encode()) + hash_val.update(str(self.capture_references).encode()) + return hash_val.digest() + + @dataclasses.dataclass(frozen=True) class CloudRunFunctionConfig: code: CodeDef diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index 5f4a8d2627..bd2fa41c5e 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -508,20 +508,6 @@ def cleanup_function_assets( pass -def get_function_name(func, package_requirements=None, is_row_processor=False): - """Get a bigframes function name for testing given a udf.""" - # Augment user package requirements with any internal package - # requirements. - package_requirements = bff_utils.get_updated_package_requirements( - package_requirements or [], is_row_processor - ) - - # Compute a unique hash representing the user code. 
- function_hash = bff_utils.get_hash(func, package_requirements) - - return f"bigframes_{function_hash}" - - def _apply_ops_to_sql( obj: bpd.DataFrame, ops_list: Sequence[ex.Expression], diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 643f503c05..0a9875a989 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -34,15 +34,25 @@ from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff import bigframes.session._io.bigquery -from bigframes.testing.utils import ( - assert_frame_equal, - assert_series_equal, - get_function_name, -) +from bigframes.testing.utils import assert_frame_equal, assert_series_equal _prefixer = test_utils.prefixer.Prefixer("bigframes", "") +def get_function_name(func, package_requirements=None, is_row_processor=False): + """Get a bigframes function name for testing given a udf.""" + # Augment user package requirements with any internal package + # requirements. + package_requirements = bff_utils.get_updated_package_requirements( + package_requirements or [], is_row_processor + ) + + # Compute a unique hash representing the user code. 
+ function_hash = bff_utils.get_hash(func, package_requirements) + + return f"bigframes_{function_hash}" + + @pytest.fixture(scope="module") def bq_cf_connection() -> str: """Pre-created BQ connection in the test project in US location, used to diff --git a/tests/unit/functions/test_remote_function_utils.py b/tests/unit/functions/test_remote_function_utils.py index dcf6058767..5ca26fe96f 100644 --- a/tests/unit/functions/test_remote_function_utils.py +++ b/tests/unit/functions/test_remote_function_utils.py @@ -188,78 +188,6 @@ def test_package_existed_helper(): assert not _utils._package_existed([], "pandas") -def _function_add_one(x): - return x + 1 - - -def _function_add_two(x): - return x + 2 - - -@pytest.mark.parametrize( - "func1, func2, should_be_equal, description", - [ - ( - _function_add_one, - _function_add_one, - True, - "Identical functions should have the same hash.", - ), - ( - _function_add_one, - _function_add_two, - False, - "Different functions should have different hashes.", - ), - ], -) -def test_get_hash_without_package_requirements( - func1, func2, should_be_equal, description -): - """Tests function hashes without any requirements.""" - hash1 = _utils.get_hash(func1) - hash2 = _utils.get_hash(func2) - - if should_be_equal: - assert hash1 == hash2, f"FAILED: {description}" - else: - assert hash1 != hash2, f"FAILED: {description}" - - -@pytest.mark.parametrize( - "reqs1, reqs2, should_be_equal, description", - [ - ( - None, - ["pandas>=1.0"], - False, - "Hash with or without requirements should differ from hash.", - ), - ( - ["pandas", "numpy", "scikit-learn"], - ["numpy", "scikit-learn", "pandas"], - True, - "Same requirements should produce the same hash.", - ), - ( - ["pandas==1.0"], - ["pandas==2.0"], - False, - "Different requirement versions should produce different hashes.", - ), - ], -) -def test_get_hash_with_package_requirements(reqs1, reqs2, should_be_equal, description): - """Tests how package requirements affect the final hash.""" 
- hash1 = _utils.get_hash(_function_add_one, package_requirements=reqs1) - hash2 = _utils.get_hash(_function_add_one, package_requirements=reqs2) - - if should_be_equal: - assert hash1 == hash2, f"FAILED: {description}" - else: - assert hash1 != hash2, f"FAILED: {description}" - - # Helper functions for signature inspection tests def _func_one_arg_annotated(x: int) -> int: """A function with one annotated arg and an annotated return type.""" From d1fabcd0e09a790b4803bda70fd514784946b3fe Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 27 Mar 2026 15:11:34 -0700 Subject: [PATCH 19/25] chore: add a notebook about using AI functions for movie posters (#2537) This will be the reference notebook to be used by the tech blog on AI functions in BigFrames --- docs/user_guide/index.rst | 1 + notebooks/generative_ai/ai_movie_poster.ipynb | 732 ++++++++++++++++++ 2 files changed, 733 insertions(+) create mode 100644 notebooks/generative_ai/ai_movie_poster.ipynb diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index af09616e05..a9695cf8c7 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -44,6 +44,7 @@ User Guide :maxdepth: 1 AI Functions <../notebooks/generative_ai/ai_functions.ipynb> + AI Functions for Poster Analysis <../notebooks/generative_ai/ai_movie_poster.ipynb> AI Forecast <../notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb> LLM Code Generation <../notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb> LLM KMeans <../notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb> diff --git a/notebooks/generative_ai/ai_movie_poster.ipynb b/notebooks/generative_ai/ai_movie_poster.ipynb new file mode 100644 index 0000000000..b25e2b556e --- /dev/null +++ b/notebooks/generative_ai/ai_movie_poster.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "XZpKUoHjXw3_" + }, + "outputs": [], + "source": [ + "# Copyright 2026 Google LLC\n", + "#\n", + "# Licensed 
under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEKzWP6jW9Oj" + }, + "source": [ + "# Analyzing movie posters with BigQuery Dataframe AI functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c9CCKXG5XTb-" + }, + "source": [ + "BigQuery Dataframe provides a Pythonic way to use AI functions directly with your dataframes. In this notebook, you will use these functions to analyze old\n", + "movie posters. These posters are images stored in a public Google Cloud Storage bucket: `gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CUJDa_7MPbL9" + }, + "source": [ + "## Set up" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D3iYtBSkYpCK" + }, + "source": [ + "Before you begin, you need to\n", + "\n", + "* Set up your permissions for generative AI functions with [these instructions](https://docs.cloud.google.com/bigquery/docs/permissions-for-ai-functions)\n", + "* Set up your Cloud Resource connection by following [these instructions](https://docs.cloud.google.com/bigquery/docs/create-cloud-resource-connection)\n", + "\n", + "Once you have the permissions set up, import the `bigframes.pandas` package, and\n", + "set your cloud project ID." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6nqoRHYbPAx3" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "MY_PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n", + "\n", + "bpd.options.bigquery.project = MY_PROJECT_ID" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2XHcNHtvPhNW" + }, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eS-9A7DijfoQ" + }, + "source": [ + "First, you load the data from the GCS bucket to a BigQuery Dataframe with the `from_glob_path` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "ZNPzFjCyPap0", + "outputId": "346d20b2-d615-4094-d24e-2d40e5c90ee2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in a moment of slot time. 
[Job bigframes-dev:US.48a27954-7a4a-4b9e-8176-ea227fd188ad details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in a minute of slot time. [Job bigframes-dev:US.09c48ecb-e041-4c18-a390-ca5a36fd07c3 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
poster
0
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]" + ], + "text/plain": [ + " poster\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0...\n", + "\n", + "[1 rows x 1 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Replace with your own connection name.\n", + "MY_CONNECTION = 'bigframes-default-connection' # @param {type:\"string\"}\n", + "\n", + "movies = bpd.from_glob_path(\n", + " \"gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/*\",\n", + " connection = MY_CONNECTION,\n", + " name='poster')\n", + "movies.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EfkdDH08QnYw" + }, + "source": [ + "## Extract titles from posters" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "6CoZZ5tSQm1r", + "outputId": "1b3915ce-eb83-4be9-b1c1-d9a326dc9408" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. 
Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in 2 minutes of slot time. [Job bigframes-dev:US.4a08a15f-5a2f-463b-bba8-734858ec992b details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitle
0Der Student von Prag
\n", + "

1 rows × 2 columns

\n", + "
[1 rows x 2 columns in total]" + ], + "text/plain": [ + " poster title\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Der Student von Prag\n", + "\n", + "[1 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import bigframes.bigquery as bbq\n", + "\n", + "movies['title'] = bbq.ai.generate(\n", + " (\"What is the movie title for this poster? Name only\", movies['poster']),\n", + " endpoint='gemini-2.5-pro'\n", + ").struct.field(\"result\")\n", + "movies.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cFQHQ9S2lr6t" + }, + "source": [ + "Notice that `ai.generate()` has a `struct` return type, which holds not only the LLM response, but also the status. If you do not provide a field name for your answer, `\"result\"` will be the default name. You can access LLM response content with the struct accessor (e.g. `my_response.struct.field(\"result\")`)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R8kkUhgoS5Xz" + }, + "source": [ + "## Get movie release year\n", + "\n", + "In the example below, you will use `ai.generate_int()` to find the release year for each movie poster:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 976 + }, + "id": "cKZdHq0XS1iW", + "outputId": "72cbad57-4518-4e1e-97bb-333d424dba73" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: 
ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in 4 minutes of slot time. [Job bigframes-dev:US.b60a151a-6cbc-405e-9c40-8a7461981a00 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitleyear
0Der Student von Prag1913
\n", + "

1 rows × 3 columns

\n", + "
[1 rows x 3 columns in total]" + ], + "text/plain": [ + " poster title \\\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Der Student von Prag \n", + "\n", + " year \n", + "0 1913 \n", + "\n", + "[1 rows x 3 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "movies['year'] = bbq.ai.generate_int(\n", + " (\"What is the release year for this movie?\", movies['title']),\n", + " endpoint='gemini-2.5-pro'\n", + ").struct.field(\"result\")\n", + "\n", + "movies.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 250 + }, + "id": "yqRiNRY8_8fs", + "outputId": "efa60107-6883-4f5c-8e40-43c7287ea7fb" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
posterstruct<uri: string, version: string, authorize...
titlestring[pyarrow]
yearInt64
\n", + "

" + ], + "text/plain": [ + "poster structJob bigframes-dev:US.c9bb23f0-5ceb-4d6c-8241-960c496274ae details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitleyear
8Shoulder Arms1918
\n", + "

1 rows × 3 columns

\n", + "
[1 rows x 3 columns in total]" + ], + "text/plain": [ + " poster title year\n", + "8 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Shoulder Arms 1918\n", + "\n", + "[1 rows x 3 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "us_movies = movies[bbq.ai.if_(\n", + " (\"The movie \", movies['title'], \" was made in US\")\n", + ")]\n", + "us_movies.head(1)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 825b7c537fdf356ba319ec1ff763765bd5aa6eeb Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 30 Mar 2026 18:48:48 -0700 Subject: [PATCH 20/25] chore: disable "read_csv_for_names" test (#2544) --- tests/system/small/test_session.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index e8e601cc76..45e98cd960 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1380,6 +1380,9 @@ def test_read_csv_for_gcs_wildcard_path(session, df_and_gcs_csv): bigframes.testing.utils.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) +@pytest.mark.skip( + reason="Unstable test blocking PR submissions. 
Tracking bug b/497970577" +) def test_read_csv_for_names(session, df_and_gcs_csv_for_two_columns): _, path = df_and_gcs_csv_for_two_columns From af49ca29399aa2c63753d9045fd382e30334d134 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 30 Mar 2026 19:39:03 -0700 Subject: [PATCH 21/25] fix: Localize BigQuery log suppression for gbq.py (#2541) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #<496320476> 🦕 --- bigframes/operations/ai.py | 7 +++++-- bigframes/session/__init__.py | 6 ++++-- third_party/bigframes_vendored/pandas/io/gbq.py | 6 ++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 6516620b53..456ef3ecda 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -612,8 +612,11 @@ def sim_join( >>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']}) >>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']}) - >>> df1.ai.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) - animal animal_1 + >>> res = df1.ai.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) + >>> print("---"); print(res) # doctest: +ELLIPSIS + --- + ... + animal animal_1 0 monkey baboon 1 spider scorpion diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 710b3701fa..75be3022d7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -664,9 +664,11 @@ def read_gbq_query( ... WHERE year = 2016 ... GROUP BY pitcherFirstName, pitcherLastName ... ''', index_col="rowindex") - >>> df.head(2) + >>> print("START_OF_OUTPUT"); df.head(2) # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE + START_OF_OUTPUT + ... pitcherFirstName pitcherLastName averagePitchSpeed - rowindex + ... 
1 Albertin Chapman 96.514113 2 Zachary Britton 94.591039 diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 3190c92b92..79faa53de2 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -86,9 +86,11 @@ def read_gbq( ... WHERE year = 2016 ... GROUP BY pitcherFirstName, pitcherLastName ... ''', index_col="rowindex") - >>> df.head(2) + >>> print("START_OF_OUTPUT"); df.head(2) # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE + START_OF_OUTPUT + ... pitcherFirstName pitcherLastName averagePitchSpeed - rowindex + ... 1 Albertin Chapman 96.514113 2 Zachary Britton 94.591039 From b589de98daa9b7aa8fe68b5cb1ba527c36d5f4ab Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 31 Mar 2026 13:09:35 -0700 Subject: [PATCH 22/25] chore: fix null literal check in sqlglot compiler (#2538) --- .../compile/sqlglot/expressions/bool_ops.py | 13 ++++++------ .../sqlglot/expressions/comparison_ops.py | 8 +++---- .../sqlglot/expressions/numeric_ops.py | 21 +++++++++++++++---- .../compile/sqlglot/expressions/string_ops.py | 10 +++++++-- .../test_numeric_ops/test_div_numeric/out.sql | 2 +- .../test_floordiv_numeric/out.sql | 1 + .../test_numeric_ops/test_mod_numeric/out.sql | 3 ++- .../test_numeric_ops/test_pow/out.sql | 4 +++- .../test_string_ops/test_str_slice/out.sql | 2 +- .../sqlglot/expressions/test_numeric_ops.py | 6 ++++++ 10 files changed, 50 insertions(+), 20 deletions(-) diff --git a/bigframes/core/compile/sqlglot/expressions/bool_ops.py b/bigframes/core/compile/sqlglot/expressions/bool_ops.py index cd7f9da408..3b4ecf5431 100644 --- a/bigframes/core/compile/sqlglot/expressions/bool_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/bool_ops.py @@ -18,6 +18,7 @@ from bigframes import dtypes from bigframes import operations as ops +from bigframes.core.compile.sqlglot import sql import bigframes.core.compile.sqlglot.expression_compiler as expression_compiler 
from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr @@ -29,10 +30,10 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: # For AND, when we encounter a NULL value, we only know when the result is FALSE, # otherwise the result is unknown (NULL). See: truth table at # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR - if left.expr == sge.null(): + if sql.is_null_literal(left.expr): condition = sge.EQ(this=right.expr, expression=sge.convert(False)) return sge.If(this=condition, true=right.expr, false=sge.null()) - if right.expr == sge.null(): + if sql.is_null_literal(right.expr): condition = sge.EQ(this=left.expr, expression=sge.convert(False)) return sge.If(this=condition, true=left.expr, false=sge.null()) @@ -46,10 +47,10 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: # For OR, when we encounter a NULL value, we only know when the result is TRUE, # otherwise the result is unknown (NULL). See: truth table at # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR - if left.expr == sge.null(): + if sql.is_null_literal(left.expr): condition = sge.EQ(this=right.expr, expression=sge.convert(True)) return sge.If(this=condition, true=right.expr, false=sge.null()) - if right.expr == sge.null(): + if sql.is_null_literal(right.expr): condition = sge.EQ(this=left.expr, expression=sge.convert(True)) return sge.If(this=condition, true=left.expr, false=sge.null()) @@ -64,12 +65,12 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: # maintains the boolean data type. 
left_expr = left.expr left_dtype = left.dtype - if left_expr == sge.null(): + if sql.is_null_literal(left_expr): left_expr = sge.Cast(this=sge.convert(None), to="BOOLEAN") left_dtype = dtypes.BOOL_DTYPE right_expr = right.expr right_dtype = right.dtype - if right_expr == sge.null(): + if sql.is_null_literal(right_expr): right_expr = sge.Cast(this=sge.convert(None), to="BOOLEAN") right_dtype = dtypes.BOOL_DTYPE diff --git a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py index 7177f9de84..82c264da50 100644 --- a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py @@ -102,7 +102,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.ge_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -112,7 +112,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.gt_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -122,7 +122,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.lt_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -132,7 +132,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.le_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or 
right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index d70ec2ef3f..c5fdbe3c84 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -19,6 +19,7 @@ from bigframes import dtypes from bigframes import operations as ops +from bigframes.core.compile.sqlglot import sql import bigframes.core.compile.sqlglot.expression_compiler as expression_compiler from bigframes.core.compile.sqlglot.expressions.common import round_towards_zero import bigframes.core.compile.sqlglot.expressions.constants as constants @@ -260,6 +261,9 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: def _int_pow_op( left_expr: sge.Expression, right_expr: sge.Expression ) -> sge.Expression: + if sql.is_null_literal(left_expr) or sql.is_null_literal(right_expr): + return sge.null() + overflow_cond = sge.and_( sge.NEQ(this=left_expr, expression=sge.convert(0)), sge.GT( @@ -292,6 +296,9 @@ def _int_pow_op( def _float_pow_op( left_expr: sge.Expression, right_expr: sge.Expression ) -> sge.Expression: + if sql.is_null_literal(left_expr) or sql.is_null_literal(right_expr): + return sge.null() + # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors. 
# See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow overflow_cond = sge.and_( @@ -425,7 +432,7 @@ def _(expr: TypedExpr) -> sge.Expression: @register_binary_op(ops.add_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: @@ -463,6 +470,9 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.div_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): + return sge.null() + left_expr = _coerce_bool_to_int(left) right_expr = _coerce_bool_to_int(right) @@ -482,7 +492,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.floordiv_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -525,6 +535,9 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.mod_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): + return sge.null() + # In BigQuery returned value has the same sign as X. 
In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) left_expr = _coerce_bool_to_int(left) right_expr = _coerce_bool_to_int(right) @@ -568,7 +581,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: @register_binary_op(ops.mul_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() left_expr = _coerce_bool_to_int(left) @@ -594,7 +607,7 @@ def _(expr: TypedExpr, n_digits: TypedExpr) -> sge.Expression: @register_binary_op(ops.sub_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.expr == sge.null() or right.expr == sge.null(): + if sql.is_null_literal(left.expr) or sql.is_null_literal(right.expr): return sge.null() if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): diff --git a/bigframes/core/compile/sqlglot/expressions/string_ops.py b/bigframes/core/compile/sqlglot/expressions/string_ops.py index 3bfec04b3e..f8938b1486 100644 --- a/bigframes/core/compile/sqlglot/expressions/string_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/string_ops.py @@ -366,10 +366,16 @@ def string_slice( column_length + sge.convert(start + 1), ] ) - length_expr = sge.convert(op_end) - sge.Greatest( + length_expr = sge.Greatest( expressions=[ sge.convert(0), - column_length + sge.convert(start), + sge.convert(op_end) + - sge.Greatest( + expressions=[ + sge.convert(0), + column_length + sge.convert(start), + ] + ), ] ) else: diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql index 3f5ff73326..e2ccf96410 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql +++ 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_numeric/out.sql @@ -6,7 +6,7 @@ SELECT IEEE_DIVIDE(`int64_col`, `int64_col`) AS `int_div_int`, IEEE_DIVIDE(`int64_col`, 1) AS `int_div_1`, IEEE_DIVIDE(`int64_col`, 0.0) AS `int_div_0`, - IEEE_DIVIDE(`int64_col`, NULL) AS `int_div_null`, + NULL AS `int_div_null`, IEEE_DIVIDE(`int64_col`, `float64_col`) AS `int_div_float`, IEEE_DIVIDE(`float64_col`, `int64_col`) AS `float_div_int`, IEEE_DIVIDE(`float64_col`, 0.0) AS `float_div_0`, diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql index c7fa74e48f..8307b1b8ad 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_floordiv_numeric/out.sql @@ -34,6 +34,7 @@ SELECT THEN CAST('Infinity' AS FLOAT64) * `float64_col` ELSE CAST(FLOOR(IEEE_DIVIDE(`float64_col`, 0.0)) AS INT64) END AS `float_div_0`, + NULL AS `float_div_null`, CASE WHEN CAST(`bool_col` AS INT64) = CAST(0 AS INT64) THEN CAST(0 AS INT64) * `int64_col` diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql index 2a79820635..78107415b4 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql @@ -189,5 +189,6 @@ SELECT MOD(CAST(`float64_col` AS BIGNUMERIC), CAST(0 AS BIGNUMERIC)) ) ELSE MOD(CAST(`float64_col` AS BIGNUMERIC), CAST(0 AS BIGNUMERIC)) - END AS `float_mod_0` + END AS `float_mod_0`, + NULL AS `float_mod_null` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ 
No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_pow/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_pow/out.sql index 8f72522262..7202903ebe 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_pow/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_pow/out.sql @@ -241,5 +241,7 @@ SELECT ELSE 1 END ) - END AS `float_pow_1` + END AS `float_pow_1`, + NULL AS `float_pow_null`, + NULL AS `null_pow_float` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql index b10f4b29e6..f011480ad3 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_string_ops/test_str_slice/out.sql @@ -13,6 +13,6 @@ SELECT SUBSTRING( `string_col`, GREATEST(1, LENGTH(`string_col`) + -2), - 5 - GREATEST(0, LENGTH(`string_col`) + -3) + GREATEST(0, 5 - GREATEST(0, LENGTH(`string_col`) + -3)) ) AS `m3_5` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py index 17c2ff98bc..1d2f0a5b44 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py @@ -220,6 +220,9 @@ def test_pow(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_pow_1"] = bf_df["int64_col"] ** 1 bf_df["float_pow_1"] = bf_df["float64_col"] ** 1 + bf_df["float_pow_null"] = bf_df["float64_col"] ** pd.NA + bf_df["null_pow_float"] = pd.NA ** 
bf_df["float64_col"] + snapshot.assert_match(bf_df.sql, "out.sql") @@ -370,6 +373,7 @@ def test_floordiv_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_div_float"] = bf_df["int64_col"] // bf_df["float64_col"] bf_df["float_div_int"] = bf_df["float64_col"] // bf_df["int64_col"] bf_df["float_div_0"] = bf_df["float64_col"] // 0.0 + bf_df["float_div_null"] = bf_df["float64_col"] // pd.NA bf_df["int_div_bool"] = bf_df["int64_col"] // bf_df["bool_col"] bf_df["bool_div_int"] = bf_df["bool_col"] // bf_df["int64_col"] @@ -437,6 +441,8 @@ def test_mod_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["float_mod_1"] = bf_df["float64_col"] % 1 bf_df["float_mod_0"] = bf_df["float64_col"] % 0 + bf_df["float_mod_null"] = bf_df["float64_col"] % pd.NA + snapshot.assert_match(bf_df.sql, "out.sql") From 34fb5daa93726d0d3ff364912a3c1de0fc535fb2 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 31 Mar 2026 13:39:11 -0700 Subject: [PATCH 23/25] fix: handle aggregate operations on empty selections (#2510) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SQL generator output fallbacks (SELECT 1 placeholder). 
Fixes #<452681068> 🦕 --- tests/system/small/test_dataframe.py | 17 +++++++++++++++++ .../ibis/backends/sql/compilers/base.py | 14 +++++++++++--- .../backends/sql/compilers/bigquery/__init__.py | 9 +++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index bc6095d434..db8842bd32 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -6296,3 +6296,20 @@ def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): with pytest.raises(KeyError): bf_df.agg(agg_funcs) + + +def test_empty_agg_projection_succeeds(): + # Tests that the compiler generates a SELECT 1 fallback for empty aggregations, + # protecting against BigQuery syntax errors when both groups and metrics are empty. + import importlib + + bq = importlib.import_module( + "bigframes_vendored.ibis.backends.sql.compilers.bigquery" + ) + sg = importlib.import_module("bigframes_vendored.sqlglot") + + compiler = bq.BigQueryCompiler() + res = compiler.visit_Aggregate( + "op", parent=sg.table("parent_table"), groups=[], metrics=[] + ) + assert "SELECT 1" in res.sql() diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index b95e428053..341b25ca1c 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -1394,9 +1394,17 @@ def _generate_groups(groups): return map(sge.convert, range(1, len(groups) + 1)) def visit_Aggregate(self, op, *, parent, groups, metrics): - sel = sg.select( - *self._cleanup_names(groups), *self._cleanup_names(metrics), copy=False - ).from_(parent, copy=False) + exprs = [] + if groups: + exprs.extend(self._cleanup_names(groups)) + if metrics: + exprs.extend(self._cleanup_names(metrics)) + + if not exprs: + # Empty aggregated projections are invalid in 
BigQuery + exprs = [sge.Literal.number(1)] + + sel = sg.select(*exprs, copy=False).from_(parent, copy=False) if groups: sel = sel.group_by(*self._generate_groups(groups.values()), copy=False) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 1fa5432a16..cd462f9e8f 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -540,6 +540,15 @@ def visit_TimestampFromUNIX(self, op, *, arg, unit): def visit_Cast(self, op, *, arg, to): from_ = op.arg.dtype + if to.is_null(): + return sge.Null() + if arg is NULL or ( + isinstance(arg, sge.Cast) + and getattr(arg, "to", None) is not None + and str(arg.to).upper() == "NULL" + ): + if to.is_struct() or to.is_array(): + return sge.Cast(this=NULL, to=self.type_mapper.from_ibis(to)) if from_.is_timestamp() and to.is_integer(): return self.f.unix_micros(arg) elif from_.is_integer() and to.is_timestamp(): From 56b17daf2d0b550eba5cb6033d73da3e9d628e3d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 31 Mar 2026 15:08:07 -0700 Subject: [PATCH 24/25] deps: exclude gcsfs 2026.3.0 (#2546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes internal issue 497970577🦕 --- setup.py | 2 +- tests/system/small/test_session.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/setup.py b/setup.py index e22b52442d..4ff5ec4587 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ # please keep these in sync with the minimum versions in testing/constraints-3.10.txt "cloudpickle >= 2.0.0", "fsspec >=2023.3.0", - "gcsfs >=2023.3.0, !=2025.5.0, !=2026.2.0", + "gcsfs >=2023.3.0, !=2025.5.0, !=2026.2.0, !=2026.3.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", "google-cloud-bigquery[bqstorage,pandas] >=3.36.0", diff --git 
a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 45e98cd960..e8e601cc76 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1380,9 +1380,6 @@ def test_read_csv_for_gcs_wildcard_path(session, df_and_gcs_csv): bigframes.testing.utils.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) -@pytest.mark.skip( - reason="Unstable test blocking PR submissions. Tracking bug b/497970577" -) def test_read_csv_for_names(session, df_and_gcs_csv_for_two_columns): _, path = df_and_gcs_csv_for_two_columns From be3327908591dae410567816dd3c1ab313ce1b92 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 31 Mar 2026 17:17:07 -0700 Subject: [PATCH 25/25] chore: librarian release pull request: 20260331T225508Z (#2547) PR created by the Librarian CLI to initialize a release. Merging this PR will auto trigger a release. Librarian Version: v0.7.0 Language Image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:160860d189ff1c2f7515638478823712fa5b243e27ccc33a2728669fa1e2ed0c
bigframes: 2.39.0 ## [2.39.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.38.0...v2.39.0) (2026-03-31) ### Features * add `df.bigquery.ai.forecast` method to pandas dataframe accessor (#2518) ([1126cec9](https://github.com/googleapis/python-bigquery-dataframes/commit/1126cec9)) * support full round-trip persistence for multimodal reference cols (#2511) ([494a0a11](https://github.com/googleapis/python-bigquery-dataframes/commit/494a0a11)) * expose DataFrame.bigquery in both pandas and bigframes DataFrames (#2533) ([69fe3176](https://github.com/googleapis/python-bigquery-dataframes/commit/69fe3176)) ### Bug Fixes * to_gbq may swap data columns when replace table (#2532) ([17ecc65e](https://github.com/googleapis/python-bigquery-dataframes/commit/17ecc65e)) * handle aggregate operations on empty selections (#2510) ([34fb5daa](https://github.com/googleapis/python-bigquery-dataframes/commit/34fb5daa)) * Localize BigQuery log suppression for gbq.py (#2541) ([af49ca29](https://github.com/googleapis/python-bigquery-dataframes/commit/af49ca29)) * Respect remote function config changes even if logic unchanged (#2512) ([b9524284](https://github.com/googleapis/python-bigquery-dataframes/commit/b9524284)) * support melting empty DataFrames without crashing (#2509) ([e8c46032](https://github.com/googleapis/python-bigquery-dataframes/commit/e8c46032)) ### Performance Improvements * Make executor data uploads async internally (#2529) ([96597f0b](https://github.com/googleapis/python-bigquery-dataframes/commit/96597f0b)) ### Documentation * gemini retouch of the index page for seo (#2514) ([2e5311e2](https://github.com/googleapis/python-bigquery-dataframes/commit/2e5311e2)) * Rename Blob column references to ObjectRef column (#2535) ([44e0ffd9](https://github.com/googleapis/python-bigquery-dataframes/commit/44e0ffd9))
--- .librarian/state.yaml | 2 +- CHANGELOG.md | 24 +++++++++++++++++++++++ bigframes/version.py | 4 ++-- third_party/bigframes_vendored/version.py | 4 ++-- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/.librarian/state.yaml b/.librarian/state.yaml index ebc26d98ad..0344252107 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:160860d189ff1c2f7515638478823712fa5b243e27ccc33a2728669fa1e2ed0c libraries: - id: bigframes - version: 2.38.0 + version: 2.39.0 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fc39e4803..ab25756d9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,30 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.39.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.38.0...v2.39.0) (2026-03-31) + + +### Documentation + +* Rename Blob column references to ObjectRef column (#2535) ([44e0ffd947e9db66ab612f92de6e31f1085e7968](https://github.com/googleapis/python-bigquery-dataframes/commit/44e0ffd947e9db66ab612f92de6e31f1085e7968)) +* gemini retouch of the index page for seo (#2514) ([2e5311e2242b039da4c8e37b7b48942fa8ed34c2](https://github.com/googleapis/python-bigquery-dataframes/commit/2e5311e2242b039da4c8e37b7b48942fa8ed34c2)) + + +### Features + +* expose DataFrame.bigquery in both pandas and bigframes DataFrames (#2533) ([69fe317612a69aa92f06f0c418c67aa1f9488bd2](https://github.com/googleapis/python-bigquery-dataframes/commit/69fe317612a69aa92f06f0c418c67aa1f9488bd2)) +* support full round-trip persistence for multimodal reference cols (#2511) ([494a0a113b1ba6dcdc9f9b85a4f750d093f5652f](https://github.com/googleapis/python-bigquery-dataframes/commit/494a0a113b1ba6dcdc9f9b85a4f750d093f5652f)) +* add `df.bigquery.ai.forecast` method to pandas dataframe accessor (#2518) 
([1126cec9cdfcc1ec1062c60e5affbe1b60223767](https://github.com/googleapis/python-bigquery-dataframes/commit/1126cec9cdfcc1ec1062c60e5affbe1b60223767)) + + +### Bug Fixes + +* handle aggregate operations on empty selections (#2510) ([34fb5daa93726d0d3ff364912a3c1de0fc535fb2](https://github.com/googleapis/python-bigquery-dataframes/commit/34fb5daa93726d0d3ff364912a3c1de0fc535fb2)) +* Localize BigQuery log suppression for gbq.py (#2541) ([af49ca29399aa2c63753d9045fd382e30334d134](https://github.com/googleapis/python-bigquery-dataframes/commit/af49ca29399aa2c63753d9045fd382e30334d134)) +* to_gbq may swap data columns when replace table (#2532) ([17ecc65e1c0397ef349fca4afcf5a77af72aa798](https://github.com/googleapis/python-bigquery-dataframes/commit/17ecc65e1c0397ef349fca4afcf5a77af72aa798)) +* Respect remote function config changes even if logic unchanged (#2512) ([b9524284ad3b457b15598f546bac04c76b3e27b8](https://github.com/googleapis/python-bigquery-dataframes/commit/b9524284ad3b457b15598f546bac04c76b3e27b8)) +* support melting empty DataFrames without crashing (#2509) ([e8c46032154e186042314d97aa813301413d8a13](https://github.com/googleapis/python-bigquery-dataframes/commit/e8c46032154e186042314d97aa813301413d8a13)) + ## [2.38.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.37.0...v2.38.0) (2026-03-16) diff --git a/bigframes/version.py b/bigframes/version.py index 4928dd5c20..8352be131d 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.38.0" +__version__ = "2.39.0" # {x-release-please-start-date} -__release_date__ = "2026-03-16" +__release_date__ = "2026-03-31" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 4928dd5c20..8352be131d 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.38.0" +__version__ = "2.39.0" # {x-release-please-start-date} -__release_date__ = "2026-03-16" +__release_date__ = "2026-03-31" # {x-release-please-end}