From c1193d1609015fb2f5b0e8429bac97c7be29a225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 10 Mar 2026 19:16:16 +0000 Subject: [PATCH 1/8] feat: add `df.bigquery` pandas accessor --- bigframes/__init__.py | 3 + bigframes/extensions/__init__.py | 13 +++ bigframes/extensions/pandas/__init__.py | 13 +++ .../extensions/pandas/dataframe_accessor.py | 65 +++++++++++++++ docs/reference/index.rst | 10 +++ docs/user_guide/index.rst | 1 + .../getting_started/pandas_extensions.ipynb | 83 +++++++++++++++++++ tests/unit/extensions/__init__.py | 13 +++ tests/unit/extensions/pandas/__init__.py | 13 +++ .../pandas/test_dataframe_accessor.py | 67 +++++++++++++++ .../extensions/pandas/test_registration.py | 27 ++++++ 11 files changed, 308 insertions(+) create mode 100644 bigframes/extensions/__init__.py create mode 100644 bigframes/extensions/pandas/__init__.py create mode 100644 bigframes/extensions/pandas/dataframe_accessor.py create mode 100644 notebooks/getting_started/pandas_extensions.ipynb create mode 100644 tests/unit/extensions/__init__.py create mode 100644 tests/unit/extensions/pandas/__init__.py create mode 100644 tests/unit/extensions/pandas/test_dataframe_accessor.py create mode 100644 tests/unit/extensions/pandas/test_registration.py diff --git a/bigframes/__init__.py b/bigframes/__init__.py index a3a9b4e4c7..9b0d6bb00c 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -32,6 +32,9 @@ ) import bigframes.enums as enums # noqa: E402 import bigframes.exceptions as exceptions # noqa: E402 + +# Register pandas extensions +import bigframes.extensions.pandas.dataframe_accessor # noqa: F401, E402 from bigframes.session import connect, Session # noqa: E402 from bigframes.version import __version__ # noqa: E402 diff --git a/bigframes/extensions/__init__.py b/bigframes/extensions/__init__.py new file mode 100644 index 0000000000..58d482ea38 --- /dev/null +++ b/bigframes/extensions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/extensions/pandas/__init__.py b/bigframes/extensions/pandas/__init__.py new file mode 100644 index 0000000000..58d482ea38 --- /dev/null +++ b/bigframes/extensions/pandas/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/extensions/pandas/dataframe_accessor.py b/bigframes/extensions/pandas/dataframe_accessor.py new file mode 100644 index 0000000000..9c5a766791 --- /dev/null +++ b/bigframes/extensions/pandas/dataframe_accessor.py @@ -0,0 +1,65 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import cast + +import pandas +import pandas.api.extensions + +import bigframes.core.global_session as bf_session +import bigframes.pandas as bpd + + +@pandas.api.extensions.register_dataframe_accessor("bigquery") +class BigQueryDataFrameAccessor: + """ + Pandas DataFrame accessor for BigQuery DataFrames functionality. + + This accessor is registered under the ``bigquery`` namespace on pandas DataFrame objects. + """ + + def __init__(self, pandas_obj: pandas.DataFrame): + self._obj = pandas_obj + + def sql_scalar(self, sql_template: str, session=None): + """ + Compute a new pandas Series by applying a SQL scalar function to the DataFrame. + + The DataFrame is converted to BigFrames by calling ``read_pandas``, then the SQL + template is applied using ``bigframes.bigquery.sql_scalar``, and the result is + converted back to a pandas Series using ``to_pandas``. + + Args: + sql_template (str): + A SQL format string with Python-style {0}, {1}, etc. placeholders for each of + the columns in the DataFrame (in the order they appear in ``df.columns``). + session (bigframes.session.Session, optional): + The BigFrames session to use. If not provided, the default global session is used. + + Returns: + pandas.Series: + The result of the SQL scalar function as a pandas Series. 
+ """ + if session is None: + session = bf_session.get_global_session() + + bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj)) + + # Import bigframes.bigquery here to avoid circular imports + import bigframes.bigquery + + columns = [cast(bpd.Series, bf_df[col]) for col in bf_df.columns] + result = bigframes.bigquery.sql_scalar(sql_template, columns) + + return result.to_pandas() diff --git a/docs/reference/index.rst b/docs/reference/index.rst index bdf38e977d..cb295a4309 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -19,6 +19,16 @@ packages. bigframes.pandas.api.typing bigframes.streaming +Pandas Extensions +~~~~~~~~~~~~~~~~~ + +BigQuery DataFrames provides extensions to pandas DataFrame objects. + +.. autosummary:: + :toctree: api + + bigframes.extensions.pandas.dataframe_accessor.BigQueryDataFrameAccessor + ML APIs ~~~~~~~ diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index 02ab82d7d6..af09616e05 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -18,6 +18,7 @@ User Guide Getting Started <../notebooks/getting_started/getting_started_bq_dataframes.ipynb> Magics <../notebooks/getting_started/magics.ipynb> ML Fundamentals <../notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb> + Pandas Extensions <../notebooks/getting_started/pandas_extensions.ipynb> .. toctree:: :caption: DataFrames diff --git a/notebooks/getting_started/pandas_extensions.ipynb b/notebooks/getting_started/pandas_extensions.ipynb new file mode 100644 index 0000000000..80de060818 --- /dev/null +++ b/notebooks/getting_started/pandas_extensions.ipynb @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas Extension for BigQuery DataFrames\n", + "\n", + "BigQuery DataFrames provides a pandas extension to execute BigQuery SQL scalar functions directly on pandas DataFrames." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import bigframes.pandas as bpd\n", + "import bigframes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using `sql_scalar`\n", + "\n", + "The `bigquery.sql_scalar` method allows you to apply a SQL scalar function to a pandas DataFrame by converting it to BigFrames, executing the SQL in BigQuery, and returning the result as a pandas Series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({\"a\": [1.5, 2.5, 3.5]})\n", + "result = df.bigquery.sql_scalar(\"ROUND({0}, 0)\")\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also use multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({\"a\": [1, 2, 3], \"b\": [10, 20, 30]})\n", + "result = df.bigquery.sql_scalar(\"{0} + {1}\")\n", + "result" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/unit/extensions/__init__.py b/tests/unit/extensions/__init__.py new file mode 100644 index 0000000000..58d482ea38 --- /dev/null +++ b/tests/unit/extensions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/extensions/pandas/__init__.py b/tests/unit/extensions/pandas/__init__.py new file mode 100644 index 0000000000..58d482ea38 --- /dev/null +++ b/tests/unit/extensions/pandas/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/extensions/pandas/test_dataframe_accessor.py b/tests/unit/extensions/pandas/test_dataframe_accessor.py new file mode 100644 index 0000000000..77e509544d --- /dev/null +++ b/tests/unit/extensions/pandas/test_dataframe_accessor.py @@ -0,0 +1,67 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest.mock as mock + +import pandas as pd + +# Importing bigframes registers the accessor. +import bigframes # noqa: F401 + + +def test_dataframe_accessor_sql_scalar(): + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + with mock.patch("bigframes.pandas.io.api.read_pandas") as mock_read_pandas: + with mock.patch("bigframes.bigquery.sql_scalar") as mock_sql_scalar: + mock_bf_df = mock.MagicMock() + mock_bf_df.columns = ["a", "b"] + mock_bf_df.__getitem__.side_effect = lambda x: f"series_{x}" + mock_read_pandas.return_value = mock_bf_df + + mock_result_series = mock.MagicMock() + mock_sql_scalar.return_value = mock_result_series + mock_result_series.to_pandas.return_value = pd.Series([4, 6]) + + # This should trigger the accessor + result = df.bigquery.sql_scalar("ROUND({0} + {1})") + + mock_read_pandas.assert_called_once() + # check it was called with df + assert mock_read_pandas.call_args[0][0] is df + + mock_sql_scalar.assert_called_once_with( + "ROUND({0} + {1})", ["series_a", "series_b"] + ) + + pd.testing.assert_series_equal(result, pd.Series([4, 6])) + + +def test_dataframe_accessor_sql_scalar_with_session(): + df = pd.DataFrame({"a": [1]}) + mock_session = mock.MagicMock() + + with mock.patch("bigframes.pandas.io.api.read_pandas") as mock_read_pandas: + with mock.patch("bigframes.bigquery.sql_scalar") as mock_sql_scalar: + mock_bf_df = mock.MagicMock() + mock_bf_df.columns = ["a"] + mock_bf_df.__getitem__.side_effect = lambda x: f"series_{x}" + mock_read_pandas.return_value = mock_bf_df + + mock_result_series = mock.MagicMock() + mock_sql_scalar.return_value = mock_result_series + + df.bigquery.sql_scalar("template", session=mock_session) + + mock_read_pandas.assert_called_once_with(df, session=mock_session) diff --git a/tests/unit/extensions/pandas/test_registration.py b/tests/unit/extensions/pandas/test_registration.py new file mode 100644 
index 0000000000..1258098091 --- /dev/null +++ b/tests/unit/extensions/pandas/test_registration.py @@ -0,0 +1,27 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +# Importing bigframes registers the accessor. +import bigframes # noqa: F401 + + +def test_bigframes_import_registers_accessor(): + df = pd.DataFrame({"a": [1]}) + # If bigframes was imported, df.bigquery should exist + assert hasattr(df, "bigquery") + from bigframes.extensions.pandas.dataframe_accessor import BigQueryDataFrameAccessor + + assert isinstance(df.bigquery, BigQueryDataFrameAccessor) From 2505296642f7c901bcbfb9e2636766683733a38b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 13 Mar 2026 13:49:55 +0000 Subject: [PATCH 2/8] docs: run the notebook --- .../getting_started/pandas_extensions.ipynb | 73 ++++++++++++++++--- 1 file changed, 64 insertions(+), 9 deletions(-) diff --git a/notebooks/getting_started/pandas_extensions.ipynb b/notebooks/getting_started/pandas_extensions.ipynb index 80de060818..8d048712dc 100644 --- a/notebooks/getting_started/pandas_extensions.ipynb +++ b/notebooks/getting_started/pandas_extensions.ipynb @@ -4,20 +4,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Pandas Extension for BigQuery DataFrames\n", + "# BigQuery extension for pandas\n", "\n", "BigQuery DataFrames provides a pandas extension to execute BigQuery SQL scalar functions directly on pandas DataFrames." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", - "import bigframes.pandas as bpd\n", - "import bigframes" + "import bigframes # This import registers the bigquery accessor." ] }, { @@ -31,9 +30,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 2.0\n", + "1 3.0\n", + "2 4.0\n", + "dtype: Float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = pd.DataFrame({\"a\": [1.5, 2.5, 3.5]})\n", "result = df.bigquery.sql_scalar(\"ROUND({0}, 0)\")\n", @@ -49,9 +76,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 11\n", + "1 22\n", + "2 33\n", + "dtype: Int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = pd.DataFrame({\"a\": [1, 2, 3], \"b\": [10, 20, 30]})\n", "result = df.bigquery.sql_scalar(\"{0} + {1}\")\n", @@ -61,7 +116,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", "language": "python", "name": "python3" }, From 38c4d6f9677b3612e8e695706d54ae1f9bc01370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 13 Mar 2026 16:34:14 +0000 Subject: [PATCH 3/8] fix unit tests --- bigframes/bigquery/_operations/sql.py | 73 +++++++++++++++---- bigframes/core/blocks.py | 
10 ++- bigframes/dataframe.py | 6 +- .../extensions/pandas/dataframe_accessor.py | 20 ++--- .../getting_started/pandas_extensions.ipynb | 34 +++++++-- .../compile/sqlglot/extensions/__init__.py | 13 ++++ .../sqlglot/extensions/pandas/__init__.py | 13 ++++ .../test_compile_aggregate/out.sql | 4 + .../pandas/test_dataframe_accessor.py | 41 +++++++++++ .../pandas/test_dataframe_accessor.py | 67 ----------------- 10 files changed, 179 insertions(+), 102 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/extensions/__init__.py create mode 100644 tests/unit/core/compile/sqlglot/extensions/pandas/__init__.py create mode 100644 tests/unit/core/compile/sqlglot/extensions/pandas/snapshots/test_dataframe_accessor/test_compile_aggregate/out.sql create mode 100644 tests/unit/core/compile/sqlglot/extensions/pandas/test_dataframe_accessor.py delete mode 100644 tests/unit/extensions/pandas/test_dataframe_accessor.py diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index c3846b8335..b65dfd2d16 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -16,19 +16,31 @@ from __future__ import annotations -from typing import Sequence +from typing import cast, Optional, Sequence, Union import google.cloud.bigquery from bigframes.core.compile.sqlglot import sql +import bigframes.dataframe import bigframes.dtypes import bigframes.operations import bigframes.series +def _format_names(sql_template: str, dataframe: bigframes.dataframe.DataFrame): + """Turn sql_template from a template that uses names to one that uses + numbers. 
+ """ + names_to_numbers = {name: f"{{{i}}}" for i, name in enumerate(dataframe.columns)} + numbers = [f"{{{i}}}" for i in range(len(dataframe.columns))] + return sql_template.format(*numbers, **names_to_numbers) + + def sql_scalar( sql_template: str, - columns: Sequence[bigframes.series.Series], + columns: Union[bigframes.dataframe.DataFrame, Sequence[bigframes.series.Series]], + *, + output_dtype: Optional[bigframes.dtypes.Dtype] = None, ) -> bigframes.series.Series: """Create a Series from a SQL template. @@ -37,6 +49,9 @@ def sql_scalar( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + Either pass in a sequence of series, in which case use integers in the + format strings. + >>> s = bpd.Series(["1.5", "2.5", "3.5"]) >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9))) >>> bbq.sql_scalar("ROUND({0}, 0, 'ROUND_HALF_EVEN')", [s]) @@ -45,13 +60,29 @@ def sql_scalar( 2 4.000000000 dtype: decimal128(38, 9)[pyarrow] + Or pass in a DataFrame, in which case use the column names in the format + strings. + + >>> df = bpd.DataFrame({"a": ["1.5", "2.5", "3.5"]}) + >>> df = df.astype({"a": pd.ArrowDtype(pa.decimal128(38, 9))}) + >>> bbq.sql_scalar("ROUND({a}, 0, 'ROUND_HALF_EVEN')", df) + 0 2.000000000 + 1 2.000000000 + 2 4.000000000 + dtype: decimal128(38, 9)[pyarrow] + Args: sql_template (str): A SQL format string with Python-style {0} placeholders for each of the Series objects in ``columns``. - columns (Sequence[bigframes.pandas.Series]): + columns ( + Sequence[bigframes.pandas.Series] | bigframes.pandas.DataFrame + ): Series objects representing the column inputs to the ``sql_template``. Must contain at least one Series. + output_dtype (a BigQuery DataFrames compatible dtype, optional): + If provided, BigQuery DataFrames uses this to determine the output + of the returned Series. This avoids a dry run query. Returns: bigframes.pandas.Series: @@ -60,28 +91,38 @@ def sql_scalar( Raises: ValueError: If ``columns`` is empty. 
""" + if isinstance(columns, bigframes.dataframe.DataFrame): + sql_template = _format_names(sql_template, columns) + columns = [ + cast(bigframes.series.Series, columns[column]) for column in columns.columns + ] + if len(columns) == 0: raise ValueError("Must provide at least one column in columns") + base_series = columns[0] + # To integrate this into our expression trees, we need to get the output # type, so we do some manual compilation and a dry run query to get that. # Another benefit of this is that if there is a syntax error in the SQL # template, then this will fail with an error earlier in the process, # aiding users in debugging. - literals_sql = [sql.to_sql(sql.literal(None, column.dtype)) for column in columns] - select_sql = sql_template.format(*literals_sql) - dry_run_sql = f"SELECT {select_sql}" - - # Use the executor directly, because we want the original column IDs, not - # the user-friendly column names that block.to_sql_query() would produce. - base_series = columns[0] - bqclient = base_series._session.bqclient - job = bqclient.query( - dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True) - ) - _, output_type = bigframes.dtypes.convert_schema_field(job.schema[0]) + if output_dtype is None: + literals_sql = [ + sql.to_sql(sql.literal(None, column.dtype)) for column in columns + ] + select_sql = sql_template.format(*literals_sql) + dry_run_sql = f"SELECT {select_sql}" + + # Use the executor directly, because we want the original column IDs, not + # the user-friendly column names that block.to_sql_query() would produce. 
+ bqclient = base_series._session.bqclient + job = bqclient.query( + dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True) + ) + _, output_dtype = bigframes.dtypes.convert_schema_field(job.schema[0]) op = bigframes.operations.SqlScalarOp( - _output_type=output_type, sql_template=sql_template + _output_type=output_dtype, sql_template=sql_template ) return base_series._apply_nary_op(op, columns[1:]) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 239eedf6d3..680cb373a8 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2666,7 +2666,11 @@ def _array_value_for_output( ) def to_sql_query( - self, include_index: bool, enable_cache: bool = True + self, + include_index: bool, + enable_cache: bool = True, + *, + ordered=False, ) -> Tuple[str, list[str], list[Label]]: """ Compiles this DataFrame's expression tree to SQL, optionally @@ -2688,7 +2692,9 @@ def to_sql_query( # Note: this uses the sql from the executor, so is coupled tightly to execution # implementaton. It will reference cached tables instead of original data sources. # Maybe should just compile raw BFET? Depends on user intent. - sql = self.session._executor.to_sql(array_value, enable_cache=enable_cache) + sql = self.session._executor.to_sql( + array_value, enable_cache=enable_cache, ordered=ordered + ) return ( sql, idx_ids, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 25cedda8f4..303c2c14b6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -447,7 +447,7 @@ def _to_placeholder_table(self, dry_run: bool = False) -> bigquery.TableReferenc ) def _to_sql_query( - self, include_index: bool, enable_cache: bool = True + self, include_index: bool, enable_cache: bool = True, *, ordered: bool = False ) -> Tuple[str, list[str], list[blocks.Label]]: """Compiles this DataFrame's expression tree to SQL, optionally including index columns. 
@@ -461,7 +461,9 @@ def _to_sql_query( If include_index is set to False, index_column_id_list and index_column_label_list return empty lists. """ - return self._block.to_sql_query(include_index, enable_cache=enable_cache) + return self._block.to_sql_query( + include_index, enable_cache=enable_cache, ordered=ordered + ) @property def sql(self) -> str: diff --git a/bigframes/extensions/pandas/dataframe_accessor.py b/bigframes/extensions/pandas/dataframe_accessor.py index 9c5a766791..2cb44fe3c5 100644 --- a/bigframes/extensions/pandas/dataframe_accessor.py +++ b/bigframes/extensions/pandas/dataframe_accessor.py @@ -32,7 +32,7 @@ class BigQueryDataFrameAccessor: def __init__(self, pandas_obj: pandas.DataFrame): self._obj = pandas_obj - def sql_scalar(self, sql_template: str, session=None): + def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None): """ Compute a new pandas Series by applying a SQL scalar function to the DataFrame. @@ -44,6 +44,9 @@ def sql_scalar(self, sql_template: str, session=None): sql_template (str): A SQL format string with Python-style {0}, {1}, etc. placeholders for each of the columns in the DataFrame (in the order they appear in ``df.columns``). + output_dtype (a BigQuery DataFrames compatible dtype, optional): + If provided, BigQuery DataFrames uses this to determine the output dtype + of the returned Series. This avoids a dry run query. session (bigframes.session.Session, optional): The BigFrames session to use. If not provided, the default global session is used. @@ -51,15 +54,14 @@ def sql_scalar(self, sql_template: str, session=None): pandas.Series: The result of the SQL scalar function as a pandas Series. 
""" - if session is None: - session = bf_session.get_global_session() - - bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj)) - # Import bigframes.bigquery here to avoid circular imports import bigframes.bigquery - columns = [cast(bpd.Series, bf_df[col]) for col in bf_df.columns] - result = bigframes.bigquery.sql_scalar(sql_template, columns) + if session is None: + session = bf_session.get_global_session() - return result.to_pandas() + bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj)) + result = bigframes.bigquery.sql_scalar( + sql_template, bf_df, output_dtype=output_dtype + ) + return result.to_pandas(ordered=True) diff --git a/notebooks/getting_started/pandas_extensions.ipynb b/notebooks/getting_started/pandas_extensions.ipynb index 8d048712dc..c511eab9b4 100644 --- a/notebooks/getting_started/pandas_extensions.ipynb +++ b/notebooks/getting_started/pandas_extensions.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -19,6 +19,28 @@ "import bigframes # This import registers the bigquery accessor." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, BigQuery DataFrames selects a location to process data based on the\n", + "data location, but using a pandas object doesn't provide such informat. If\n", + "processing location is important to you, configure the location before using the\n", + "accessor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "bpd.reset_session()\n", + "bpd.options.bigquery.location = \"US\"" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -30,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -56,7 +78,7 @@ "dtype: Float64" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -76,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -102,14 +124,14 @@ "dtype: Int64" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\"a\": [1, 2, 3], \"b\": [10, 20, 30]})\n", - "result = df.bigquery.sql_scalar(\"{0} + {1}\")\n", + "result = df.bigquery.sql_scalar(\"{a} + {b}\")\n", "result" ] } diff --git a/tests/unit/core/compile/sqlglot/extensions/__init__.py b/tests/unit/core/compile/sqlglot/extensions/__init__.py new file mode 100644 index 0000000000..58d482ea38 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/extensions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/unit/core/compile/sqlglot/extensions/pandas/__init__.py b/tests/unit/core/compile/sqlglot/extensions/pandas/__init__.py new file mode 100644 index 0000000000..58d482ea38 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/extensions/pandas/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/core/compile/sqlglot/extensions/pandas/snapshots/test_dataframe_accessor/test_compile_aggregate/out.sql b/tests/unit/core/compile/sqlglot/extensions/pandas/snapshots/test_dataframe_accessor/test_compile_aggregate/out.sql new file mode 100644 index 0000000000..14853067c7 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/extensions/pandas/snapshots/test_dataframe_accessor/test_compile_aggregate/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + ROUND(`int64_col` + `int64_too`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/extensions/pandas/test_dataframe_accessor.py b/tests/unit/core/compile/sqlglot/extensions/pandas/test_dataframe_accessor.py new file mode 100644 index 0000000000..210e7e78c3 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/extensions/pandas/test_dataframe_accessor.py @@ -0,0 +1,41 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest.mock as mock + +import pandas as pd + +import bigframes.pandas as bpd +import bigframes.session + + +def test_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot, monkeypatch): + session = mock.create_autospec(bigframes.session.Session) + session.read_pandas.return_value = scalar_types_df + + def to_pandas(series, ordered=True): + sql, _, _ = series.to_frame()._to_sql_query(include_index=True, ordered=ordered) + return sql + + monkeypatch.setattr(bpd.Series, "to_pandas", to_pandas) + + df = pd.DataFrame({"int64_col": [1, 2], "int64_too": [3, 4]}) + result = df.bigquery.sql_scalar( + "ROUND({int64_col} + {int64_too})", + output_dtype=pd.Int64Dtype(), + session=session, + ) + + session.read_pandas.assert_called_once() + snapshot.assert_match(result, "out.sql") diff --git a/tests/unit/extensions/pandas/test_dataframe_accessor.py b/tests/unit/extensions/pandas/test_dataframe_accessor.py deleted file mode 100644 index 77e509544d..0000000000 --- a/tests/unit/extensions/pandas/test_dataframe_accessor.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest.mock as mock - -import pandas as pd - -# Importing bigframes registers the accessor. -import bigframes # noqa: F401 - - -def test_dataframe_accessor_sql_scalar(): - df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - - with mock.patch("bigframes.pandas.io.api.read_pandas") as mock_read_pandas: - with mock.patch("bigframes.bigquery.sql_scalar") as mock_sql_scalar: - mock_bf_df = mock.MagicMock() - mock_bf_df.columns = ["a", "b"] - mock_bf_df.__getitem__.side_effect = lambda x: f"series_{x}" - mock_read_pandas.return_value = mock_bf_df - - mock_result_series = mock.MagicMock() - mock_sql_scalar.return_value = mock_result_series - mock_result_series.to_pandas.return_value = pd.Series([4, 6]) - - # This should trigger the accessor - result = df.bigquery.sql_scalar("ROUND({0} + {1})") - - mock_read_pandas.assert_called_once() - # check it was called with df - assert mock_read_pandas.call_args[0][0] is df - - mock_sql_scalar.assert_called_once_with( - "ROUND({0} + {1})", ["series_a", "series_b"] - ) - - pd.testing.assert_series_equal(result, pd.Series([4, 6])) - - -def test_dataframe_accessor_sql_scalar_with_session(): - df = pd.DataFrame({"a": [1]}) - mock_session = mock.MagicMock() - - with mock.patch("bigframes.pandas.io.api.read_pandas") as mock_read_pandas: - with mock.patch("bigframes.bigquery.sql_scalar") as mock_sql_scalar: - mock_bf_df = mock.MagicMock() - mock_bf_df.columns = ["a"] - mock_bf_df.__getitem__.side_effect = lambda x: f"series_{x}" - mock_read_pandas.return_value = mock_bf_df - - mock_result_series = mock.MagicMock() - mock_sql_scalar.return_value = mock_result_series - - df.bigquery.sql_scalar("template", session=mock_session) - - mock_read_pandas.assert_called_once_with(df, session=mock_session) From 01b719efdc58461efed6abad18a22050da2a8fe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 13 Mar 2026 
16:45:48 +0000 Subject: [PATCH 4/8] try again --- .../core/compile/sqlglot/extensions/__init__.py | 13 ------------- .../compile/sqlglot/extensions/pandas/__init__.py | 13 ------------- .../test_sql_scalar}/out.sql | 0 .../pandas => }/test_dataframe_accessor.py | 0 4 files changed, 26 deletions(-) delete mode 100644 tests/unit/core/compile/sqlglot/extensions/__init__.py delete mode 100644 tests/unit/core/compile/sqlglot/extensions/pandas/__init__.py rename tests/unit/core/compile/sqlglot/{extensions/pandas/snapshots/test_dataframe_accessor/test_compile_aggregate => snapshots/test_dataframe_accessor/test_sql_scalar}/out.sql (100%) rename tests/unit/core/compile/sqlglot/{extensions/pandas => }/test_dataframe_accessor.py (100%) diff --git a/tests/unit/core/compile/sqlglot/extensions/__init__.py b/tests/unit/core/compile/sqlglot/extensions/__init__.py deleted file mode 100644 index 58d482ea38..0000000000 --- a/tests/unit/core/compile/sqlglot/extensions/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/tests/unit/core/compile/sqlglot/extensions/pandas/__init__.py b/tests/unit/core/compile/sqlglot/extensions/pandas/__init__.py deleted file mode 100644 index 58d482ea38..0000000000 --- a/tests/unit/core/compile/sqlglot/extensions/pandas/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/unit/core/compile/sqlglot/extensions/pandas/snapshots/test_dataframe_accessor/test_compile_aggregate/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_sql_scalar/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/extensions/pandas/snapshots/test_dataframe_accessor/test_compile_aggregate/out.sql rename to tests/unit/core/compile/sqlglot/snapshots/test_dataframe_accessor/test_sql_scalar/out.sql diff --git a/tests/unit/core/compile/sqlglot/extensions/pandas/test_dataframe_accessor.py b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py similarity index 100% rename from tests/unit/core/compile/sqlglot/extensions/pandas/test_dataframe_accessor.py rename to tests/unit/core/compile/sqlglot/test_dataframe_accessor.py From b34cf3d6c01e77578a95ef6ac214c5644cbde6c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 13 Mar 2026 17:07:00 +0000 Subject: [PATCH 5/8] revert: to_sql_query changes --- bigframes/core/blocks.py | 10 ++-------- bigframes/dataframe.py | 6 ++---- .../core/compile/sqlglot/test_dataframe_accessor.py 
| 5 +++-- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 680cb373a8..239eedf6d3 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2666,11 +2666,7 @@ def _array_value_for_output( ) def to_sql_query( - self, - include_index: bool, - enable_cache: bool = True, - *, - ordered=False, + self, include_index: bool, enable_cache: bool = True ) -> Tuple[str, list[str], list[Label]]: """ Compiles this DataFrame's expression tree to SQL, optionally @@ -2692,9 +2688,7 @@ def to_sql_query( # Note: this uses the sql from the executor, so is coupled tightly to execution # implementaton. It will reference cached tables instead of original data sources. # Maybe should just compile raw BFET? Depends on user intent. - sql = self.session._executor.to_sql( - array_value, enable_cache=enable_cache, ordered=ordered - ) + sql = self.session._executor.to_sql(array_value, enable_cache=enable_cache) return ( sql, idx_ids, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 303c2c14b6..25cedda8f4 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -447,7 +447,7 @@ def _to_placeholder_table(self, dry_run: bool = False) -> bigquery.TableReferenc ) def _to_sql_query( - self, include_index: bool, enable_cache: bool = True, *, ordered: bool = False + self, include_index: bool, enable_cache: bool = True ) -> Tuple[str, list[str], list[blocks.Label]]: """Compiles this DataFrame's expression tree to SQL, optionally including index columns. @@ -461,9 +461,7 @@ def _to_sql_query( If include_index is set to False, index_column_id_list and index_column_label_list return empty lists. 
""" - return self._block.to_sql_query( - include_index, enable_cache=enable_cache, ordered=ordered - ) + return self._block.to_sql_query(include_index, enable_cache=enable_cache) @property def sql(self) -> str: diff --git a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py index 210e7e78c3..f225d18331 100644 --- a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py +++ b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py @@ -24,8 +24,9 @@ def test_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot, monkeypatch): session = mock.create_autospec(bigframes.session.Session) session.read_pandas.return_value = scalar_types_df - def to_pandas(series, ordered=True): - sql, _, _ = series.to_frame()._to_sql_query(include_index=True, ordered=ordered) + def to_pandas(series, *, ordered): + assert ordered is True + sql, _, _ = series.to_frame()._to_sql_query(include_index=True) return sql monkeypatch.setattr(bpd.Series, "to_pandas", to_pandas) From ec6a4f49be50cca52e05f37b42fbf34eb8dc80dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 13 Mar 2026 18:23:38 +0000 Subject: [PATCH 6/8] test: fix noextras build --- tests/unit/core/compile/sqlglot/test_dataframe_accessor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py index f225d18331..327b8e4206 100644 --- a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py +++ b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py @@ -15,10 +15,13 @@ import unittest.mock as mock import pandas as pd +import pytest import bigframes.pandas as bpd import bigframes.session +pytest.importorskip("pytest_snapshot") + def test_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot, monkeypatch): session = mock.create_autospec(bigframes.session.Session) From 7cf633ea026ff039d6e50353d3cb7764b3931b22 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 13 Mar 2026 20:30:08 +0000 Subject: [PATCH 7/8] docs: ignore some intersphinx warnings --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 48e214cd86..c0561c6034 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -280,6 +280,8 @@ # See https://github.com/sphinx-doc/sphinx/blob # /2a65ffeef5c107c19084fabdd706cdff3f52d93c/sphinx/domains/python.py#L843 "ref.python", + # Allow external websites to be down occasionally. + "intersphinx.external", ] # -- Options for LaTeX output --------------------------------------------- From 2c60d8c5d43b0af173362e7312aa1f83f7ad0d33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 13 Mar 2026 20:45:42 +0000 Subject: [PATCH 8/8] docs: remove pandas intersphinx config --- docs/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index c0561c6034..b518ac074f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -390,7 +390,8 @@ "grpc": ("https://grpc.github.io/grpc/python/", None), "proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None), "protobuf": ("https://googleapis.dev/python/protobuf/latest/", None), - "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + # TODO(tswast): re-enable if we can get temporary failures to be ignored. + # "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), "pydata-google-auth": ( "https://pydata-google-auth.readthedocs.io/en/latest/", None,