From 3c21993e6fca474c32f3c2371c41ef2be146267e Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 09:44:39 -0600 Subject: [PATCH 01/29] docs: Add code examples to configuration docstrings (#2352) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added code examples to the docstrings of global configuration properties to demonstrate how to set them using `bigframes.pandas.options`. This covers BigQuery options, compute options, sampling options, experiment options, and display options. --- *PR created automatically by Jules for task [15986639753712030034](https://jules.google.com/task/15986639753712030034) started by @tswast* --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Tim Sweña (Swast) Co-authored-by: tswast <247555+tswast@users.noreply.github.com> --- bigframes/_config/bigquery_options.py | 57 +++++++++++++- bigframes/_config/compute_options.py | 44 +++++++++-- bigframes/_config/experiment_options.py | 14 ++++ bigframes/_config/sampling_options.py | 20 +++++ .../pandas/core/config_init.py | 74 +++++++++++++++++-- 5 files changed, 193 insertions(+), 16 deletions(-) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 648b69dea7f..e1e8129ca35 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -127,6 +127,11 @@ def application_name(self) -> Optional[str]: The recommended format is ``"application-name/major.minor.patch_version"`` or ``"(gpn:PartnerName;)"`` for official Google partners. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.application_name = "my-app/1.0.0" # doctest: +SKIP + Returns: None or str: Application name as a string if exists; otherwise None. 
@@ -145,6 +150,13 @@ def application_name(self, value: Optional[str]): def credentials(self) -> Optional[google.auth.credentials.Credentials]: """The OAuth2 credentials to use for this client. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import google.auth + >>> credentials, project = google.auth.default() # doctest: +SKIP + >>> bpd.options.bigquery.credentials = credentials # doctest: +SKIP + Returns: None or google.auth.credentials.Credentials: google.auth.credentials.Credentials if exists; otherwise None. @@ -163,6 +175,11 @@ def location(self) -> Optional[str]: For more information, see https://cloud.google.com/bigquery/docs/locations BigQuery locations. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.location = "US" # doctest: +SKIP + Returns: None or str: Default location as a string; otherwise None. @@ -179,6 +196,11 @@ def location(self, value: Optional[str]): def project(self) -> Optional[str]: """Google Cloud project ID to use for billing and as the default project. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.project = "my-project" # doctest: +SKIP + Returns: None or str: Google Cloud project ID as a string; otherwise None. @@ -206,6 +228,11 @@ def bq_connection(self) -> Optional[str]: If this option isn't provided, or project or location aren't provided, session will use its default project/location/connection_id as default connection. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.bq_connection = "my-project.us.my-connection" # doctest: +SKIP + Returns: None or str: Name of the BigQuery connection as a string; otherwise None. @@ -228,6 +255,11 @@ def skip_bq_connection_check(self) -> bool: necessary permissions set up to support BigQuery DataFrames operations, then a runtime error will be reported. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.skip_bq_connection_check = True # doctest: +SKIP + Returns: bool: A boolean value, where True indicates a BigQuery connection is @@ -300,6 +332,12 @@ def use_regional_endpoints(self) -> bool: does not promise any guarantee on the request remaining within the location during transit. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.location = "europe-west3" # doctest: +SKIP + >>> bpd.options.bigquery.use_regional_endpoints = True # doctest: +SKIP + Returns: bool: A boolean value, where True indicates that regional endpoints @@ -339,6 +377,11 @@ def kms_key_name(self) -> Optional[str]: For more information, see https://cloud.google.com/bigquery/docs/customer-managed-encryption#assign_role Assign the Encrypter/Decrypter. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.kms_key_name = "projects/my-project/locations/us/keyRings/my-ring/cryptoKeys/my-key" # doctest: +SKIP + Returns: None or str: Name of the customer managed encryption key as a string; otherwise None. @@ -356,6 +399,11 @@ def kms_key_name(self, value: str): def ordering_mode(self) -> Literal["strict", "partial"]: """Controls whether total row order is always maintained for DataFrame/Series. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.ordering_mode = "partial" # doctest: +SKIP + Returns: Literal: A literal string value of either strict or partial ordering mode. @@ -432,7 +480,14 @@ def requests_transport_adapters( @property def enable_polars_execution(self) -> bool: - """If True, will use polars to execute some simple query plans locally.""" + """If True, will use polars to execute some simple query plans locally. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.enable_polars_execution = True # doctest: +SKIP + + """ return self._enable_polars_execution @enable_polars_execution.setter diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 7810ee897f5..027566ae075 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -28,30 +28,30 @@ class ComputeOptions: >>> import bigframes.pandas as bpd >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") - >>> bpd.options.compute.maximum_bytes_billed = 500 + >>> bpd.options.compute.maximum_bytes_billed = 500 # doctest: +SKIP >>> df.to_pandas() # this should fail # doctest: +SKIP google.api_core.exceptions.InternalServerError: 500 Query exceeded limit for bytes billed: 500. 10485760 or higher required. - >>> bpd.options.compute.maximum_bytes_billed = None # reset option + >>> bpd.options.compute.maximum_bytes_billed = None # reset option # doctest: +SKIP To add multiple extra labels to a query configuration, use the `assign_extra_query_labels` method with keyword arguments: - >>> bpd.options.compute.assign_extra_query_labels(test1=1, test2="abc") - >>> bpd.options.compute.extra_query_labels + >>> bpd.options.compute.assign_extra_query_labels(test1=1, test2="abc") # doctest: +SKIP + >>> bpd.options.compute.extra_query_labels # doctest: +SKIP {'test1': 1, 'test2': 'abc'} Alternatively, you can add labels individually by directly accessing the `extra_query_labels` dictionary: - >>> bpd.options.compute.extra_query_labels["test3"] = False - >>> bpd.options.compute.extra_query_labels + >>> bpd.options.compute.extra_query_labels["test3"] = False # doctest: +SKIP + >>> bpd.options.compute.extra_query_labels # doctest: +SKIP {'test1': 1, 'test2': 'abc', 'test3': False} To remove a label from the configuration, use the `del` keyword on the desired label key: - >>> del bpd.options.compute.extra_query_labels["test1"] - >>> 
bpd.options.compute.extra_query_labels + >>> del bpd.options.compute.extra_query_labels["test1"] # doctest: +SKIP + >>> bpd.options.compute.extra_query_labels # doctest: +SKIP {'test2': 'abc', 'test3': False} """ @@ -63,6 +63,11 @@ class ComputeOptions: their operations to resume. The default value is 0. Set the value to None to turn off the guard. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.compute.ai_ops_confirmation_threshold = 100 # doctest: +SKIP + Returns: Optional[int]: Number of rows. """ @@ -73,6 +78,11 @@ class ComputeOptions: When set to True, the operation automatically fails without asking for user inputs. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.compute.ai_ops_threshold_autofail = True # doctest: +SKIP + Returns: bool: True if the guard is enabled. """ @@ -85,6 +95,10 @@ class ComputeOptions: 10 GB for potentially faster execution; BigQuery will raise an error if this limit is exceeded. Setting to True removes this result size limit. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.compute.allow_large_results = True # doctest: +SKIP Returns: bool | None: True if results > 10 GB are enabled. @@ -97,6 +111,10 @@ class ComputeOptions: query engine to handle. However this comes at the cost of increase cost and latency. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.compute.enable_multi_query_execution = True # doctest: +SKIP Returns: bool | None: True if enabled. @@ -121,6 +139,11 @@ class ComputeOptions: default. See `maximum_bytes_billed`: https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.compute.maximum_bytes_billed = 1000 # doctest: +SKIP + Returns: int | None: Number of bytes, if set. 
""" @@ -136,6 +159,11 @@ class ComputeOptions: of rows to be downloaded exceeds this limit, a ``bigframes.exceptions.MaximumResultRowsExceeded`` exception is raised. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.compute.maximum_result_rows = 1000 # doctest: +SKIP + Returns: int | None: Number of rows, if set. """ diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index ee54e017fe3..811d6b8bd45 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -31,6 +31,13 @@ def __init__(self): @property def semantic_operators(self) -> bool: + """Deprecated. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.experiments.semantic_operators = True # doctest: +SKIP + """ return self._semantic_operators @semantic_operators.setter @@ -44,6 +51,13 @@ def semantic_operators(self, value: bool): @property def ai_operators(self) -> bool: + """If True, allow using the AI operators. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.experiments.ai_operators = True # doctest: +SKIP + """ return self._ai_operators @ai_operators.setter diff --git a/bigframes/_config/sampling_options.py b/bigframes/_config/sampling_options.py index 107142c3ba9..9746e01f31d 100644 --- a/bigframes/_config/sampling_options.py +++ b/bigframes/_config/sampling_options.py @@ -31,6 +31,11 @@ class SamplingOptions: Download size threshold in MB. Default 500. If value set to None, the download size won't be checked. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.sampling.max_download_size = 1000 # doctest: +SKIP """ enable_downsampling: bool = False @@ -40,6 +45,11 @@ class SamplingOptions: If max_download_size is exceeded when downloading data (e.g., to_pandas()), the data will be downsampled if enable_downsampling is True, otherwise, an error will be raised. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.sampling.enable_downsampling = True # doctest: +SKIP """ sampling_method: Literal["head", "uniform"] = "uniform" @@ -50,6 +60,11 @@ class SamplingOptions: the beginning. It is fast and requires minimal computations to perform the downsampling.; "uniform": This algorithm returns uniform random samples of the data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.sampling.sampling_method = "head" # doctest: +SKIP """ random_state: Optional[int] = None @@ -58,6 +73,11 @@ class SamplingOptions: If provided, the uniform method may take longer to execute and require more computation. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.sampling.random_state = 42 # doctest: +SKIP """ def with_max_download_size(self, max_rows: Optional[int]) -> SamplingOptions: diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 194ec4a8a71..9ffd1ed59f9 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -29,13 +29,13 @@ class DisplayOptions: >>> import bigframes.pandas as bpd >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") - >>> bpd.options.display.repr_mode = "deferred" - >>> df.head(20) # will no longer run the job + >>> bpd.options.display.repr_mode = "deferred" # doctest: +SKIP + >>> df.head(20) # will no longer run the job # doctest: +SKIP Computation deferred. Computation will process 28.9 kB Users can also get a dry run of the job by accessing the query_job property before they've run the job. This will return a dry run instance of the job they can inspect. 
- >>> df.query_job.total_bytes_processed + >>> df.query_job.total_bytes_processed # doctest: +SKIP 28947 User can execute the job by calling .to_pandas() @@ -44,21 +44,21 @@ class DisplayOptions: Reset repr_mode option - >>> bpd.options.display.repr_mode = "head" + >>> bpd.options.display.repr_mode = "head" # doctest: +SKIP Can also set the progress_bar option to see the progress bar in terminal, - >>> bpd.options.display.progress_bar = "terminal" + >>> bpd.options.display.progress_bar = "terminal" # doctest: +SKIP notebook, - >>> bpd.options.display.progress_bar = "notebook" + >>> bpd.options.display.progress_bar = "notebook" # doctest: +SKIP or just remove it. Setting to default value "auto" will detect and show progress bar automatically. - >>> bpd.options.display.progress_bar = "auto" + >>> bpd.options.display.progress_bar = "auto" # doctest: +SKIP """ # Options borrowed from pandas. @@ -67,6 +67,11 @@ class DisplayOptions: Maximum number of columns to display. Default 20. If `max_columns` is exceeded, switch to truncate view. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.max_columns = 50 # doctest: +SKIP """ max_rows: int = 10 @@ -74,6 +79,11 @@ class DisplayOptions: Maximum number of rows to display. Default 10. If `max_rows` is exceeded, switch to truncate view. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.max_rows = 50 # doctest: +SKIP """ precision: int = 6 @@ -81,6 +91,11 @@ class DisplayOptions: Controls the floating point output precision. Defaults to 6. See :attr:`pandas.options.display.precision`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.precision = 2 # doctest: +SKIP """ # Options unique to BigQuery DataFrames. @@ -90,6 +105,11 @@ class DisplayOptions: Valid values are `auto`, `notebook`, and `terminal`. Set to `None` to remove progress bars. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = "terminal" # doctest: +SKIP """ repr_mode: Literal["head", "deferred", "anywidget"] = "head" @@ -105,6 +125,11 @@ class DisplayOptions: Instead, estimated bytes processed will be shown. DataFrame and Series objects can still be computed with methods that explicitly execute and download results. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.repr_mode = "deferred" # doctest: +SKIP """ max_colwidth: Optional[int] = 50 @@ -113,12 +138,22 @@ class DisplayOptions: When the column overflows, a "..." placeholder is embedded in the output. A 'None' value means unlimited. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.max_colwidth = 20 # doctest: +SKIP """ max_info_columns: int = 100 """ Used in DataFrame.info method to decide if information in each column will be printed. Default 100. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.max_info_columns = 50 # doctest: +SKIP """ max_info_rows: Optional[int] = 200_000 @@ -130,6 +165,11 @@ class DisplayOptions: For large frames, this can be quite slow. max_info_rows and max_info_cols limit this null check only to frames with smaller dimensions than specified. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.max_info_rows = 100 # doctest: +SKIP """ memory_usage: bool = True @@ -138,19 +178,39 @@ class DisplayOptions: df.info() is called. Default True. Valid values True, False. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.memory_usage = False # doctest: +SKIP """ blob_display: bool = True """ If True, display the blob content in notebook DataFrame preview. Default True. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.blob_display = True # doctest: +SKIP """ blob_display_width: Optional[int] = None """ Width in pixels that the blob constrained to. Default None.. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.blob_display_width = 100 # doctest: +SKIP """ blob_display_height: Optional[int] = None """ Height in pixels that the blob constrained to. Default None.. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.blob_display_height = 100 # doctest: +SKIP """ From ca9fb138c3718d8668a3c442efc69da3ae3eb610 Mon Sep 17 00:00:00 2001 From: Tomo Suzuki Date: Wed, 18 Feb 2026 15:44:22 -0500 Subject: [PATCH 02/29] chore: replace old partner teams with updated names (#2462) This PR replaces @googleapis/api-bigquery-dataframe with @googleapis/bigquery-dataframe-team and @googleapis/yoshi-python with @googleapis/cloud-sdk-python-team. b/478003109 --- .github/CODEOWNERS | 8 ++++---- .github/blunderbuss.yml | 6 +++--- .repo-metadata.json | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7686a50da62..ef110646417 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,8 +5,8 @@ # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners#codeowners-syntax # Note: This file is autogenerated. To make changes to the codeowner team, please update .repo-metadata.json. 
-# @googleapis/yoshi-python @googleapis/api-bigquery-dataframe are the default owners for changes in this repo -* @googleapis/yoshi-python @googleapis/api-bigquery-dataframe +# @googleapis/cloud-sdk-python-team @googleapis/bigquery-dataframe-team are the default owners for changes in this repo +* @googleapis/cloud-sdk-python-team @googleapis/bigquery-dataframe-team -# @googleapis/python-samples-reviewers @googleapis/api-bigquery-dataframe are the default owners for samples changes -/samples/ @googleapis/python-samples-reviewers @googleapis/api-bigquery-dataframe +# @googleapis/python-samples-reviewers @googleapis/bigquery-dataframe-team are the default owners for samples changes +/samples/ @googleapis/python-samples-reviewers @googleapis/bigquery-dataframe-team diff --git a/.github/blunderbuss.yml b/.github/blunderbuss.yml index 8d9cb1008e7..527a5e8f9a0 100644 --- a/.github/blunderbuss.yml +++ b/.github/blunderbuss.yml @@ -4,14 +4,14 @@ # Note: This file is autogenerated. To make changes to the assignee # team, please update `codeowner_team` in `.repo-metadata.json`. 
assign_issues: - - googleapis/api-bigquery-dataframe + - googleapis/bigquery-dataframe-team assign_issues_by: - labels: - "samples" to: - googleapis/python-samples-reviewers - - googleapis/api-bigquery-dataframe + - googleapis/bigquery-dataframe-team assign_prs: - - googleapis/api-bigquery-dataframe + - googleapis/bigquery-dataframe-team diff --git a/.repo-metadata.json b/.repo-metadata.json index 0efaa967d2c..0dfe79ecb1b 100644 --- a/.repo-metadata.json +++ b/.repo-metadata.json @@ -11,6 +11,6 @@ "distribution_name": "bigframes", "api_id": "bigquery.googleapis.com", "default_version": "", - "codeowner_team": "@googleapis/api-bigquery-dataframe", + "codeowner_team": "@googleapis/bigquery-dataframe-team", "api_shortname": "bigquery" } From f7fd1895e64a133fe63eddeb90f57a42a35c29b2 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 18 Feb 2026 15:51:18 -0800 Subject: [PATCH 03/29] feat: Update bigquery.ai.generate_table output_schema to allow Mapping type (#2463) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/bigquery/_operations/ai.py | 19 +++++++++++++++---- tests/system/large/bigquery/test_ai.py | 17 +++++++++++++++++ tests/unit/bigquery/test_ai.py | 26 ++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 5fe9f306d55..477ca91366f 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -606,7 +606,7 @@ def generate_table( model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], *, - output_schema: str, + output_schema: Union[str, Mapping[str, str]], temperature: Optional[float] = None, top_p: Optional[float] = None, max_output_tokens: Optional[int] = None, @@ -642,8 +642,10 @@ def generate_table( treated as the 'prompt' column. If a DataFrame is provided, it must contain a 'prompt' column, or you must rename the column you wish to generate table to 'prompt'. - output_schema (str): - A string defining the output schema (e.g., "col1 STRING, col2 INT64"). + output_schema (str | Mapping[str, str]): + A string defining the output schema (e.g., "col1 STRING, col2 INT64"), + or a mapping value that specifies the schema of the output, in the form {field_name: data_type}. + Supported data types include `STRING`, `INT64`, `FLOAT64`, `BOOL`, `ARRAY`, and `STRUCT`. temperature (float, optional): A FLOAT64 value that is used for sampling promiscuity. The value must be in the range ``[0.0, 1.0]``. 
@@ -666,8 +668,17 @@ def generate_table( model_name, session = bq_utils.get_model_name_and_session(model, data) table_sql = bq_utils.to_sql(data) + if isinstance(output_schema, Mapping): + output_schema_str = ", ".join( + [f"{name} {sql_type}" for name, sql_type in output_schema.items()] + ) + # Validate user input + output_schemas.parse_sql_fields(output_schema_str) + else: + output_schema_str = output_schema + struct_fields_bq: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = { - "output_schema": output_schema + "output_schema": output_schema_str } if temperature is not None: struct_fields_bq["temperature"] = temperature diff --git a/tests/system/large/bigquery/test_ai.py b/tests/system/large/bigquery/test_ai.py index 86cf4d7f001..668581c627d 100644 --- a/tests/system/large/bigquery/test_ai.py +++ b/tests/system/large/bigquery/test_ai.py @@ -111,3 +111,20 @@ def test_generate_table(text_model): assert "creator" in result.columns # The model may not always return the exact number of rows requested. assert len(result) > 0 + + +def test_generate_table_with_mapping_schema(text_model): + df = bpd.DataFrame( + {"prompt": ["Generate a table of 2 programming languages and their creators."]} + ) + + result = ai.generate_table( + text_model, + df, + output_schema={"language": "STRING", "creator": "STRING"}, + ) + + assert "language" in result.columns + assert "creator" in result.columns + # The model may not always return the exact number of rows requested. 
+ assert len(result) > 0 diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py index 796e86f9245..c73e63b9db1 100644 --- a/tests/unit/bigquery/test_ai.py +++ b/tests/unit/bigquery/test_ai.py @@ -269,6 +269,32 @@ def test_generate_table_with_options(mock_dataframe, mock_session): ) +def test_generate_table_with_mapping_schema(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_table( + model_name, + mock_dataframe, + output_schema={"col1": "STRING", "col2": "INT64"}, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + + # Normalize whitespace for comparison + query = " ".join(query.split()) + + expected_part_1 = "SELECT * FROM AI.GENERATE_TABLE(" + expected_part_2 = f"MODEL `{model_name}`," + expected_part_3 = "(SELECT * FROM my_table)," + expected_part_4 = "STRUCT('col1 STRING, col2 INT64' AS output_schema)" + + assert expected_part_1 in query + assert expected_part_2 in query + assert expected_part_3 in query + assert expected_part_4 in query + + @mock.patch("bigframes.pandas.read_pandas") def test_generate_text_with_pandas_dataframe( read_pandas_mock, mock_dataframe, mock_session From 0f1ac1aaba60d7a4d9baf3513196cb5a43b641d5 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 19 Feb 2026 11:12:40 -0800 Subject: [PATCH 04/29] refactor: Decorator to take docs rather than inheritance (#2467) --- bigframes/_tools/docs.py | 39 +++++ bigframes/core/groupby/dataframe_group_by.py | 6 +- bigframes/core/groupby/series_group_by.py | 4 +- bigframes/core/indexes/base.py | 10 +- bigframes/core/indexes/datetimes.py | 5 +- bigframes/core/indexes/multi.py | 6 +- bigframes/core/window/rolling.py | 6 +- bigframes/dataframe.py | 141 ++++-------------- bigframes/geopandas/geoseries.py | 6 +- bigframes/ml/base.py | 4 +- bigframes/ml/model_selection.py | 6 +- bigframes/operations/lists.py | 6 +- bigframes/operations/plotting.py | 6 +- 
bigframes/operations/strings.py | 6 +- bigframes/operations/structs.py | 9 +- bigframes/series.py | 93 ++++-------- .../pandas/core/computation/align.py | 29 ++-- .../pandas/core/computation/eval.py | 7 +- .../bigframes_vendored/pandas/core/frame.py | 2 +- .../bigframes_vendored/pandas/core/generic.py | 17 +-- .../bigframes_vendored/pandas/core/series.py | 4 + 21 files changed, 163 insertions(+), 249 deletions(-) create mode 100644 bigframes/_tools/docs.py diff --git a/bigframes/_tools/docs.py b/bigframes/_tools/docs.py new file mode 100644 index 00000000000..1b4b329a945 --- /dev/null +++ b/bigframes/_tools/docs.py @@ -0,0 +1,39 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def inherit_docs(source_class): + """ + A class decorator that copies docstrings from source_class to the + decorated class for any methods or attributes that match names. 
+ """ + + def decorator(target_class): + if not target_class.__doc__ and source_class.__doc__: + target_class.__doc__ = source_class.__doc__ + + for name, source_item in vars(source_class).items(): + if name in vars(target_class): + target_item = getattr(target_class, name) + + if hasattr(target_item, "__doc__") and not target_item.__doc__: + if hasattr(source_item, "__doc__") and source_item.__doc__: + try: + target_item.__doc__ = source_item.__doc__ + except AttributeError: + pass + + return target_class + + return decorator diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index 7f9e5d627ab..d61eceb1783 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -24,6 +24,7 @@ import pandas as pd from bigframes import session +from bigframes._tools import docs from bigframes.core import agg_expressions from bigframes.core import expression as ex import bigframes.core.block_transforms as block_ops @@ -44,9 +45,8 @@ @log_adapter.class_logger -class DataFrameGroupBy(vendored_pandas_groupby.DataFrameGroupBy): - __doc__ = vendored_pandas_groupby.GroupBy.__doc__ - +@docs.inherit_docs(vendored_pandas_groupby.DataFrameGroupBy) +class DataFrameGroupBy: def __init__( self, block: blocks.Block, diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py index a8900cf5455..221c5ecef9c 100644 --- a/bigframes/core/groupby/series_group_by.py +++ b/bigframes/core/groupby/series_group_by.py @@ -24,6 +24,7 @@ import pandas from bigframes import session +from bigframes._tools import docs from bigframes.core import expression as ex import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks @@ -43,9 +44,8 @@ @log_adapter.class_logger +@docs.inherit_docs(vendored_pandas_groupby.SeriesGroupBy) class SeriesGroupBy(vendored_pandas_groupby.SeriesGroupBy): - __doc__ = vendored_pandas_groupby.GroupBy.__doc__ - def 
__init__( self, block: blocks.Block, diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 383534fa4df..011639ed9ef 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -27,6 +27,7 @@ import pandas from bigframes import dtypes +from bigframes._tools import docs import bigframes.core.agg_expressions as ex_types import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks @@ -47,8 +48,8 @@ import bigframes.series -class Index(vendored_pandas_index.Index): - __doc__ = vendored_pandas_index.Index.__doc__ +@docs.inherit_docs(vendored_pandas_index.Index) +class Index: _query_job = None _block: blocks.Block _linked_frame: Union[ @@ -777,6 +778,11 @@ def to_list(self, *, allow_large_results: Optional[bool] = None) -> list: def __len__(self): return self.shape[0] + def __bool__(self): + raise ValueError( + "Cannot convert Index into bool. Consider using .empty(), .item(), .any(), or .all() methods." + ) + def item(self): # Docstring is in third_party/bigframes_vendored/pandas/core/indexes/base.py return self.to_series().peek(2).item() diff --git a/bigframes/core/indexes/datetimes.py b/bigframes/core/indexes/datetimes.py index 23ad8b03b4d..ec5174e8a9a 100644 --- a/bigframes/core/indexes/datetimes.py +++ b/bigframes/core/indexes/datetimes.py @@ -20,13 +20,14 @@ datetimes as vendored_pandas_datetime_index, ) +from bigframes._tools import docs from bigframes.core import expression as ex from bigframes.core.indexes.base import Index from bigframes.operations import date_ops -class DatetimeIndex(Index, vendored_pandas_datetime_index.DatetimeIndex): - __doc__ = vendored_pandas_datetime_index.DatetimeIndex.__doc__ +@docs.inherit_docs(vendored_pandas_datetime_index.DatetimeIndex) +class DatetimeIndex(Index): # Must be above 5000 for pandas to delegate to bigframes for binops __pandas_priority__ = 12000 diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index 
cfabd9e70d1..d9f26ea8730 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -19,6 +19,7 @@ import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import pandas +from bigframes._tools import docs from bigframes.core import blocks from bigframes.core import expression as ex from bigframes.core.indexes.base import Index @@ -27,9 +28,8 @@ import bigframes.session -class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): - __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ - +@docs.inherit_docs(vendored_pandas_multindex.MultiIndex) +class MultiIndex(Index): @classmethod def from_tuples( cls, diff --git a/bigframes/core/window/rolling.py b/bigframes/core/window/rolling.py index b7bb62372cc..97af59edf5b 100644 --- a/bigframes/core/window/rolling.py +++ b/bigframes/core/window/rolling.py @@ -22,6 +22,7 @@ import pandas from bigframes import dtypes +from bigframes._tools import docs from bigframes.core import agg_expressions from bigframes.core import expression as ex from bigframes.core import ordering, utils, window_spec @@ -36,9 +37,8 @@ @log_adapter.class_logger -class Window(vendored_pandas_rolling.Window): - __doc__ = vendored_pandas_rolling.Window.__doc__ - +@docs.inherit_docs(vendored_pandas_rolling.Window) +class Window: def __init__( self, block: blocks.Block, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b195ce9902d..2a22fc4487d 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -37,6 +37,7 @@ overload, Sequence, Tuple, + TypeVar, Union, ) import warnings @@ -53,6 +54,7 @@ import pyarrow import tabulate +from bigframes._tools import docs import bigframes.constants import bigframes.core from bigframes.core import agg_expressions @@ -107,6 +109,7 @@ "DataFrame", Sequence[int | float | str | pandas.Timedelta | Callable] ] +U = TypeVar("U") LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] @@ -120,8 +123,8 @@ # 
Inherits from pandas DataFrame so that we can use the same docstrings. @log_adapter.class_logger -class DataFrame(vendored_pandas_frame.DataFrame): - __doc__ = vendored_pandas_frame.DataFrame.__doc__ +@docs.inherit_docs(vendored_pandas_frame.DataFrame) +class DataFrame: # internal flag to disable cache at all _disable_cache_override: bool = False # Must be above 5000 for pandas to delegate to bigframes for binops @@ -343,6 +346,10 @@ def columns(self, labels: pandas.Index): def shape(self) -> Tuple[int, int]: return self._block.shape + @property + def axes(self) -> list: + return [self.index, self.columns] + @property def size(self) -> int: rows, cols = self.shape @@ -387,7 +394,10 @@ def __len__(self): rows, _ = self.shape return rows - __len__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__len__) + def __bool__(self): + raise ValueError( + "Cannot convert dataframe into bool. Consider using .empty(), .any(), or .all() methods." + ) def __iter__(self): return iter(self.columns) @@ -599,45 +609,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]): self._query_job = query_job - @overload - def __getitem__( - self, - key: bigframes.series.Series, - ) -> DataFrame: - ... - - @overload - def __getitem__( - self, - key: slice, - ) -> DataFrame: - ... - - @overload - def __getitem__( - self, - key: List[str], - ) -> DataFrame: - ... - - @overload - def __getitem__( - self, - key: List[blocks.Label], - ) -> DataFrame: - ... - - @overload - def __getitem__(self, key: pandas.Index) -> DataFrame: - ... - - @overload - def __getitem__( - self, - key: blocks.Label, - ) -> bigframes.series.Series: - ... - def __getitem__( self, key: Union[ @@ -670,8 +641,6 @@ def __getitem__( # TODO(tswast): What case is this supposed to be handling? 
return self._getitem_columns([cast(Hashable, key)]) - __getitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__getitem__) - def _getitem_columns(self, key: Sequence[blocks.Label]) -> DataFrame: selected_ids: Tuple[str, ...] = () for label in key: @@ -858,8 +827,6 @@ def __setitem__( df = self._assign_single_item(key, value) self._set_block(df._get_block()) - __setitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__setitem__) - def _apply_binop( self, other: float | int | bigframes.series.Series | DataFrame, @@ -970,53 +937,39 @@ def eq(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: def __eq__(self, other) -> DataFrame: # type: ignore return self.eq(other) - __eq__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__eq__) - def ne(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.ne_op, axis=axis) def __ne__(self, other) -> DataFrame: # type: ignore return self.ne(other) - __ne__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__ne__) - def __invert__(self) -> DataFrame: return self._apply_unary_op(ops.invert_op) - __invert__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__invert__) - def le(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.le_op, axis=axis) def __le__(self, other) -> DataFrame: return self.le(other) - __le__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__le__) - def lt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.lt_op, axis=axis) def __lt__(self, other) -> DataFrame: return self.lt(other) - __lt__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__lt__) - def ge(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.ge_op, axis=axis) def __ge__(self, other) -> DataFrame: return self.ge(other) - __ge__.__doc__ = 
inspect.getdoc(vendored_pandas_frame.DataFrame.__ge__) - def gt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.gt_op, axis=axis) def __gt__(self, other) -> DataFrame: return self.gt(other) - __gt__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__gt__) - def add( self, other: float | int | bigframes.series.Series | DataFrame, @@ -1038,13 +991,9 @@ def radd( def __add__(self, other) -> DataFrame: return self.add(other) - __add__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__add__) - def __radd__(self, other) -> DataFrame: return self.radd(other) - __radd__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__radd__) - def sub( self, other: float | int | bigframes.series.Series | DataFrame, @@ -1053,13 +1002,10 @@ def sub( return self._apply_binop(other, ops.sub_op, axis=axis) subtract = sub - subtract.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.sub) def __sub__(self, other): return self.sub(other) - __sub__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__sub__) - def rsub( self, other: float | int | bigframes.series.Series | DataFrame, @@ -1070,8 +1016,6 @@ def rsub( def __rsub__(self, other): return self.rsub(other) - __rsub__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rsub__) - def mul( self, other: float | int | bigframes.series.Series | DataFrame, @@ -1080,13 +1024,10 @@ def mul( return self._apply_binop(other, ops.mul_op, axis=axis) multiply = mul - multiply.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.mul) def __mul__(self, other): return self.mul(other) - __mul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__mul__) - def rmul( self, other: float | int | bigframes.series.Series | DataFrame, @@ -1097,8 +1038,6 @@ def rmul( def __rmul__(self, other): return self.rmul(other) - __rmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rmul__) - def truediv( self, other: float | int | 
bigframes.series.Series | DataFrame, @@ -1106,14 +1045,11 @@ def truediv( ) -> DataFrame: return self._apply_binop(other, ops.div_op, axis=axis) - truediv.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.truediv) div = divide = truediv def __truediv__(self, other): return self.truediv(other) - __truediv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__truediv__) - def rtruediv( self, other: float | int | bigframes.series.Series | DataFrame, @@ -1122,13 +1058,10 @@ def rtruediv( return self._apply_binop(other, ops.div_op, axis=axis, reverse=True) rdiv = rtruediv - rdiv.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.rtruediv) def __rtruediv__(self, other): return self.rtruediv(other) - __rtruediv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rtruediv__) - def floordiv( self, other: float | int | bigframes.series.Series | DataFrame, @@ -1139,8 +1072,6 @@ def floordiv( def __floordiv__(self, other): return self.floordiv(other) - __floordiv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__floordiv__) - def rfloordiv( self, other: float | int | bigframes.series.Series | DataFrame, @@ -1151,26 +1082,18 @@ def rfloordiv( def __rfloordiv__(self, other): return self.rfloordiv(other) - __rfloordiv__.__doc__ = inspect.getdoc( - vendored_pandas_frame.DataFrame.__rfloordiv__ - ) - def mod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore return self._apply_binop(other, ops.mod_op, axis=axis) def __mod__(self, other): return self.mod(other) - __mod__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__mod__) - def rmod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore return self._apply_binop(other, ops.mod_op, axis=axis, reverse=True) def __rmod__(self, other): return self.rmod(other) - __rmod__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rmod__) - def pow( self, other: int | 
bigframes.series.Series, axis: str | int = "columns" ) -> DataFrame: @@ -1179,8 +1102,6 @@ def pow( def __pow__(self, other): return self.pow(other) - __pow__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__pow__) - def rpow( self, other: int | bigframes.series.Series, axis: str | int = "columns" ) -> DataFrame: @@ -1189,27 +1110,19 @@ def rpow( def __rpow__(self, other): return self.rpow(other) - __rpow__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rpow__) - def __and__(self, other: bool | int | bigframes.series.Series) -> DataFrame: return self._apply_binop(other, ops.and_op) - __and__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__and__) - __rand__ = __and__ def __or__(self, other: bool | int | bigframes.series.Series) -> DataFrame: return self._apply_binop(other, ops.or_op) - __or__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__or__) - __ror__ = __or__ def __xor__(self, other: bool | int | bigframes.series.Series) -> DataFrame: return self._apply_binop(other, ops.xor_op) - __xor__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__xor__) - __rxor__ = __xor__ def __pos__(self) -> DataFrame: @@ -1221,8 +1134,6 @@ def __neg__(self) -> DataFrame: def __abs__(self) -> DataFrame: return self._apply_unary_op(ops.abs_op) - __abs__.__doc__ = abs.__doc__ - def align( self, other: typing.Union[DataFrame, bigframes.series.Series], @@ -3186,7 +3097,6 @@ def prod( return bigframes.series.Series(block) product = prod - product.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.prod) def count(self, *, numeric_only: bool = False) -> bigframes.series.Series: if not numeric_only: @@ -3265,7 +3175,6 @@ def agg(self, func) -> DataFrame | bigframes.series.Series: ) aggregate = agg - aggregate.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.agg) @validations.requires_index @validations.requires_ordering() @@ -3338,7 +3247,6 @@ def kurt(self, *, numeric_only: bool = False): return 
bigframes.series.Series(result_block) kurtosis = kurt - kurtosis.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.kurt) def _pivot( self, @@ -3858,6 +3766,22 @@ def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: self._block, window, self._block.value_columns ) + def pipe( + self, + func: Union[Callable[..., U], tuple[Callable[..., U], str]], + *args, + **kwargs, + ) -> U: + import bigframes_vendored.pandas.core.common as common + + return common.pipe(self, func, *args, **kwargs) + + def get(self, key, default=None): + try: + return self[key] + except (KeyError, ValueError, IndexError): + return default + def groupby( self, by: typing.Union[ @@ -3998,13 +3922,11 @@ def isna(self) -> DataFrame: return self._apply_unary_op(ops.isnull_op) isnull = isna - isnull.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.isna) def notna(self) -> DataFrame: return self._apply_unary_op(ops.notnull_op) notnull = notna - notnull.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.notna) @validations.requires_ordering() def cumsum(self): @@ -4407,8 +4329,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) - __array__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__array__) - def to_parquet( self, path=None, @@ -4946,7 +4866,6 @@ def first_valid_index(self): return applymap = map - applymap.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.map) def _slice( self, @@ -5148,8 +5067,6 @@ def scatter( def __matmul__(self, other) -> DataFrame: return self.dot(other) - __matmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__matmul__) - @property def struct(self): return bigframes.operations.structs.StructFrameAccessor(self) diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py index 660f1939a94..deae960d753 100644 --- a/bigframes/geopandas/geoseries.py +++ 
b/bigframes/geopandas/geoseries.py @@ -19,14 +19,14 @@ import bigframes_vendored.geopandas.geoseries as vendored_geoseries import geopandas.array # type: ignore +from bigframes._tools import docs import bigframes.operations as ops import bigframes.series import bigframes.session -class GeoSeries(vendored_geoseries.GeoSeries, bigframes.series.Series): - __doc__ = vendored_geoseries.GeoSeries.__doc__ - +@docs.inherit_docs(vendored_geoseries.GeoSeries) +class GeoSeries(bigframes.series.Series): def __init__(self, data=None, index=None, **kwargs): super().__init__( data=data, index=index, dtype=geopandas.array.GeometryDtype(), **kwargs diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 3f6ccecaa2b..76a99304738 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -30,13 +30,15 @@ import bigframes_vendored.sklearn.base +from bigframes._tools import docs import bigframes.exceptions as bfe from bigframes.ml import core import bigframes.ml.utils as utils import bigframes.pandas as bpd -class BaseEstimator(bigframes_vendored.sklearn.base.BaseEstimator, abc.ABC): +@docs.inherit_docs(bigframes_vendored.sklearn.base.BaseEstimator) +class BaseEstimator(abc.ABC): """ A BigQuery DataFrames machine learning component follows sklearn API design Ref: https://bit.ly/3NyhKjN diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 3d23fbf5684..7dd13ec4d73 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -27,6 +27,7 @@ import bigframes_vendored.sklearn.model_selection._validation as vendored_model_selection_validation import pandas as pd +from bigframes._tools import docs from bigframes.core.logging import log_adapter from bigframes.ml import utils import bigframes.pandas as bpd @@ -133,9 +134,8 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra @log_adapter.class_logger -class KFold(vendored_model_selection_split.KFold): - __doc__ = 
inspect.getdoc(vendored_model_selection_split.KFold) - +@docs.inherit_docs(vendored_model_selection_split.KFold) +class KFold: def __init__(self, n_splits: int = 5, *, random_state: Union[int, None] = None): if n_splits < 2: raise ValueError(f"n_splits must be at least 2. Got {n_splits}") diff --git a/bigframes/operations/lists.py b/bigframes/operations/lists.py index 9974e686933..e9d560791da 100644 --- a/bigframes/operations/lists.py +++ b/bigframes/operations/lists.py @@ -19,6 +19,7 @@ import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors +from bigframes._tools import docs from bigframes.core.logging import log_adapter import bigframes.operations as ops from bigframes.operations._op_converters import convert_index, convert_slice @@ -26,9 +27,8 @@ @log_adapter.class_logger -class ListAccessor(vendoracessors.ListAccessor): - __doc__ = vendoracessors.ListAccessor.__doc__ - +@docs.inherit_docs(vendoracessors.ListAccessor) +class ListAccessor: def __init__(self, data: series.Series): self._data = data diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py index 21a23a9ab54..c459af9550b 100644 --- a/bigframes/operations/plotting.py +++ b/bigframes/operations/plotting.py @@ -17,14 +17,14 @@ import bigframes_vendored.constants as constants import bigframes_vendored.pandas.plotting._core as vendordt +from bigframes._tools import docs from bigframes.core.logging import log_adapter import bigframes.operations._matplotlib as bfplt @log_adapter.class_logger -class PlotAccessor(vendordt.PlotAccessor): - __doc__ = vendordt.PlotAccessor.__doc__ - +@docs.inherit_docs(vendordt.PlotAccessor) +class PlotAccessor: _common_kinds = ("line", "area", "hist", "bar", "barh", "pie") _dataframe_kinds = ("scatter", "hexbin,") _all_kinds = _common_kinds + _dataframe_kinds diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 922d26a23c1..8b5b57b259e 100644 --- a/bigframes/operations/strings.py +++ 
b/bigframes/operations/strings.py @@ -20,6 +20,7 @@ import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.strings.accessor as vendorstr +from bigframes._tools import docs import bigframes.core.indexes.base as indices from bigframes.core.logging import log_adapter import bigframes.dataframe as df @@ -39,9 +40,8 @@ @log_adapter.class_logger -class StringMethods(vendorstr.StringMethods, Generic[T]): - __doc__ = vendorstr.StringMethods.__doc__ - +@docs.inherit_docs(vendorstr.StringMethods) +class StringMethods(Generic[T]): def __init__(self, data: T): self._data: T = data diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index ec0b5dae526..31aeb345728 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -17,6 +17,7 @@ import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors import pandas as pd +from bigframes._tools import docs from bigframes.core import backports from bigframes.core.logging import log_adapter import bigframes.dataframe @@ -25,9 +26,8 @@ @log_adapter.class_logger -class StructAccessor(vendoracessors.StructAccessor): - __doc__ = vendoracessors.StructAccessor.__doc__ - +@docs.inherit_docs(vendoracessors.StructAccessor) +class StructAccessor: def __init__(self, data: bigframes.series.Series): self._data = data @@ -69,7 +69,8 @@ def dtypes(self) -> pd.Series: @log_adapter.class_logger -class StructFrameAccessor(vendoracessors.StructFrameAccessor): +@docs.inherit_docs(vendoracessors.StructFrameAccessor) +class StructFrameAccessor: __doc__ = vendoracessors.StructAccessor.__doc__ def __init__(self, data: bigframes.dataframe.DataFrame) -> None: diff --git a/bigframes/series.py b/bigframes/series.py index 0c74a0dd19c..299c39637d0 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -18,7 +18,6 @@ import datetime import functools -import inspect import itertools import numbers import textwrap @@ -35,6 +34,7 @@ overload, Sequence, Tuple, + 
TypeVar, Union, ) import warnings @@ -48,6 +48,7 @@ import pyarrow as pa import typing_extensions +from bigframes._tools import docs import bigframes.core from bigframes.core import agg_expressions, groupby import bigframes.core.block_transforms as block_ops @@ -84,6 +85,7 @@ import bigframes.operations.strings as strings +U = TypeVar("U") LevelType = typing.Union[str, int] LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] @@ -97,7 +99,8 @@ @log_adapter.class_logger -class Series(vendored_pandas_series.Series): +@docs.inherit_docs(vendored_pandas_series.Series) +class Series: # Must be above 5000 for pandas to delegate to bigframes for binops __pandas_priority__ = 13000 @@ -358,7 +361,10 @@ def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]): def __len__(self): return self.shape[0] - __len__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__len__) + def __bool__(self): + raise ValueError( + "Cannot convert Series into bool. Consider using .empty(), .item(), .any(), or .all() methods." 
+ ) def __iter__(self) -> typing.Iterator: return itertools.chain.from_iterable( @@ -918,7 +924,6 @@ def ffill(self, *, limit: typing.Optional[int] = None) -> Series: return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill - pad.__doc__ = inspect.getdoc(vendored_pandas_series.Series.ffill) @validations.requires_ordering() def bfill(self, *, limit: typing.Optional[int] = None) -> Series: @@ -1166,45 +1171,33 @@ def isna(self) -> "Series": return self._apply_unary_op(ops.isnull_op) isnull = isna - isnull.__doc__ = inspect.getdoc(vendored_pandas_series.Series.isna) def notna(self) -> "Series": return self._apply_unary_op(ops.notnull_op) notnull = notna - notnull.__doc__ = inspect.getdoc(vendored_pandas_series.Series.notna) def __and__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.and_op) - __and__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__and__) - __rand__ = __and__ def __or__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.or_op) - __or__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__or__) - __ror__ = __or__ def __xor__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.xor_op) - __or__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__xor__) - __rxor__ = __xor__ def __add__(self, other: float | int | pandas.Timedelta | Series) -> Series: return self.add(other) - __add__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__add__) - def __radd__(self, other: float | int | pandas.Timedelta | Series) -> Series: return self.radd(other) - __radd__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__radd__) - def add(self, other: float | int | pandas.Timedelta | Series) -> Series: return self._apply_binary_op(other, ops.add_op) @@ -1214,13 +1207,9 @@ def radd(self, other: float | int | pandas.Timedelta | Series) -> Series: def __sub__(self, other: float | int | Series) -> Series: return self.sub(other) - 
__sub__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__sub__) - def __rsub__(self, other: float | int | Series) -> Series: return self.rsub(other) - __rsub__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rsub__) - def sub(self, other) -> Series: return self._apply_binary_op(other, ops.sub_op) @@ -1228,18 +1217,13 @@ def rsub(self, other) -> Series: return self._apply_binary_op(other, ops.sub_op, reverse=True) subtract = sub - subtract.__doc__ = inspect.getdoc(vendored_pandas_series.Series.sub) def __mul__(self, other: float | int | Series) -> Series: return self.mul(other) - __mul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__mul__) - def __rmul__(self, other: float | int | Series) -> Series: return self.rmul(other) - __rmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmul__) - def mul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.mul_op) @@ -1247,40 +1231,29 @@ def rmul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.mul_op, reverse=True) multiply = mul - multiply.__doc__ = inspect.getdoc(vendored_pandas_series.Series.mul) def __truediv__(self, other: float | int | pandas.Timedelta | Series) -> Series: return self.truediv(other) - __truediv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__truediv__) - def __rtruediv__(self, other: float | int | pandas.Timedelta | Series) -> Series: return self.rtruediv(other) - __rtruediv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rtruediv__) - def truediv(self, other: float | int | pandas.Timedelta | Series) -> Series: return self._apply_binary_op(other, ops.div_op) def rtruediv(self, other: float | int | pandas.Timedelta | Series) -> Series: return self._apply_binary_op(other, ops.div_op, reverse=True) - truediv.__doc__ = inspect.getdoc(vendored_pandas_series.Series.truediv) div = divide = truediv rdiv = rtruediv - rdiv.__doc__ = 
inspect.getdoc(vendored_pandas_series.Series.rtruediv) def __floordiv__(self, other: float | int | pandas.Timedelta | Series) -> Series: return self.floordiv(other) - __floordiv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__floordiv__) - def __rfloordiv__(self, other: float | int | pandas.Timedelta | Series) -> Series: return self.rfloordiv(other) - __rfloordiv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rfloordiv__) - def floordiv(self, other: float | int | pandas.Timedelta | Series) -> Series: return self._apply_binary_op(other, ops.floordiv_op) @@ -1290,13 +1263,9 @@ def rfloordiv(self, other: float | int | pandas.Timedelta | Series) -> Series: def __pow__(self, other: float | int | Series) -> Series: return self.pow(other) - __pow__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__pow__) - def __rpow__(self, other: float | int | Series) -> Series: return self.rpow(other) - __rpow__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rpow__) - def pow(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.pow_op) @@ -1330,13 +1299,9 @@ def ge(self, other) -> Series: def __mod__(self, other) -> Series: # type: ignore return self.mod(other) - __mod__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__mod__) - def __rmod__(self, other) -> Series: # type: ignore return self.rmod(other) - __rmod__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmod__) - def mod(self, other) -> Series: # type: ignore return self._apply_binary_op(other, ops.mod_op) @@ -1359,13 +1324,9 @@ def dot(self, other): def __matmul__(self, other): return self.dot(other) - __matmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__matmul__) - def __rmatmul__(self, other): return self.dot(other) - __rmatmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmatmul__) - def combine_first(self, other: Series) -> Series: result = self._apply_binary_op(other, ops.coalesce_op) result.name = 
self.name @@ -1380,8 +1341,6 @@ def update(self, other: Union[Series, Sequence, Mapping]) -> None: def __abs__(self) -> Series: return self.abs() - __abs__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.abs) - def abs(self) -> Series: return self._apply_unary_op(ops.abs_op) @@ -1456,7 +1415,6 @@ def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: return self._apply_aggregation(agg_ops.lookup_agg_func(func)[0]) aggregate = agg - aggregate.__doc__ = inspect.getdoc(vendored_pandas_series.Series.agg) def describe(self) -> Series: from bigframes.pandas.core.methods import describe @@ -1496,7 +1454,6 @@ def kurt(self): return (numerator / denominator) - adjustment kurtosis = kurt - kurtosis.__doc__ = inspect.getdoc(vendored_pandas_series.Series.kurt) def mode(self) -> Series: block = self._block @@ -1561,7 +1518,6 @@ def prod(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.product_op)) product = prod - product.__doc__ = inspect.getdoc(vendored_pandas_series.Series.prod) def __eq__(self, other: object) -> Series: # type: ignore return self.eq(other) @@ -1572,8 +1528,6 @@ def __ne__(self, other: object) -> Series: # type: ignore def __invert__(self) -> Series: return self._apply_unary_op(ops.invert_op) - __invert__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__invert__) - def __pos__(self) -> Series: return self._apply_unary_op(ops.pos_op) @@ -1751,8 +1705,6 @@ def __getitem__(self, indexer): return Series(block) return self.loc[indexer] - __getitem__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__getitem__) - def __getattr__(self, key: str): # Protect against recursion errors with uninitialized Series objects. # We use "_block" attribute to check whether the instance is initialized. 
@@ -1936,6 +1888,22 @@ def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: self._block, window_spec, self._block.value_columns, is_series=True ) + def pipe( + self, + func: Union[Callable[..., U], tuple[Callable[..., U], str]], + *args, + **kwargs, + ) -> U: + import bigframes_vendored.pandas.core.common as common + + return common.pipe(self, func, *args, **kwargs) + + def get(self, key, default=None): + try: + return self[key] + except (KeyError, ValueError, IndexError): + return default + def groupby( self, by: typing.Union[ @@ -2374,7 +2342,6 @@ def tolist( return self.to_pandas(allow_large_results=allow_large_results).to_list() to_list = tolist - to_list.__doc__ = inspect.getdoc(vendored_pandas_series.Series.tolist) def to_markdown( self, @@ -2405,8 +2372,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) - __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) - def to_pickle(self, path, *, allow_large_results=None, **kwargs) -> None: return self.to_pandas(allow_large_results=allow_large_results).to_pickle( path, **kwargs @@ -2575,8 +2540,6 @@ def hist( ): return self.plot.hist(by=by, bins=bins, **kwargs) - hist.__doc__ = inspect.getdoc(plotting.PlotAccessor.hist) - def line( self, x: typing.Optional[typing.Hashable] = None, @@ -2585,8 +2548,6 @@ def line( ): return self.plot.line(x=x, y=y, **kwargs) - line.__doc__ = inspect.getdoc(plotting.PlotAccessor.line) - def area( self, x: typing.Optional[typing.Hashable] = None, @@ -2596,8 +2557,6 @@ def area( ): return self.plot.area(x=x, y=y, stacked=stacked, **kwargs) - area.__doc__ = inspect.getdoc(plotting.PlotAccessor.area) - def bar( self, x: typing.Optional[typing.Hashable] = None, @@ -2606,8 +2565,6 @@ def bar( ): return self.plot.bar(x=x, y=y, **kwargs) - bar.__doc__ = inspect.getdoc(plotting.PlotAccessor.bar) - def _slice( self, start: 
typing.Optional[int] = None, diff --git a/third_party/bigframes_vendored/pandas/core/computation/align.py b/third_party/bigframes_vendored/pandas/core/computation/align.py index 2608dabe7ac..4280c32f9b6 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/align.py +++ b/third_party/bigframes_vendored/pandas/core/computation/align.py @@ -5,7 +5,7 @@ from __future__ import annotations from functools import partial, wraps -from typing import Callable, TYPE_CHECKING +from typing import Callable, TYPE_CHECKING, Union import warnings import bigframes_vendored.pandas.core.common as com @@ -17,15 +17,18 @@ if TYPE_CHECKING: from collections.abc import Sequence - from bigframes_vendored.pandas.core.generic import NDFrame from bigframes_vendored.pandas.core.indexes.base import Index from pandas._typing import F + from bigframes.pandas import DataFrame, Series + + FrameT = Union[Series, DataFrame] + def _align_core_single_unary_op( term, -) -> tuple[partial | type[NDFrame], dict[str, Index] | None]: - typ: partial | type[NDFrame] +) -> tuple[partial | FrameT, dict[str, Index] | None]: + typ: partial | FrameT axes: dict[str, Index] | None = None if isinstance(term.value, np.ndarray): @@ -38,9 +41,7 @@ def _align_core_single_unary_op( return typ, axes -def _zip_axes_from_type( - typ: type[NDFrame], new_axes: Sequence[Index] -) -> dict[str, Index]: +def _zip_axes_from_type(typ: FrameT, new_axes: Sequence[Index]) -> dict[str, Index]: return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} @@ -207,20 +208,18 @@ def is_series(obj) -> bool: def is_series_or_dataframe(obj) -> bool: - from bigframes_vendored.pandas.core.frame import NDFrame + from bigframes.pandas import DataFrame, Series - return isinstance(obj, NDFrame) + return isinstance(obj, Series | DataFrame) def is_pandas_object(obj) -> bool: - from bigframes_vendored.pandas.core.frame import NDFrame - from bigframes_vendored.pandas.core.indexes.base import Index + from bigframes.pandas import 
DataFrame, Index, Series - return isinstance(obj, NDFrame) or isinstance(obj, Index) + return isinstance(obj, Series | DataFrame | Index) def is_pandas_type(type) -> bool: - from bigframes_vendored.pandas.core.frame import NDFrame - from bigframes_vendored.pandas.core.indexes.base import Index + from bigframes.pandas import DataFrame, Index, Series - return issubclass(type, NDFrame) or issubclass(type, Index) + return issubclass(type, Series | DataFrame | Index) diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index a1809f6cb3a..b16c62d9552 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -12,7 +12,6 @@ from bigframes_vendored.pandas.core.computation.expr import Expr, PARSERS from bigframes_vendored.pandas.core.computation.parsing import tokenize_string from bigframes_vendored.pandas.core.computation.scope import ensure_scope -from bigframes_vendored.pandas.core.generic import NDFrame from bigframes_vendored.pandas.util._validators import validate_bool_kwarg from pandas.io.formats.printing import pprint_thing @@ -317,6 +316,8 @@ def eval( # assign if needed assigner = parsed_expr.assigner + from bigframes.pandas import DataFrame, Series + if env.target is not None and assigner is not None: target_modified = True @@ -324,7 +325,7 @@ def eval( if not inplace and first_expr: try: target = env.target - if isinstance(target, NDFrame): + if isinstance(target, Series | DataFrame): target = target.copy() except AttributeError as err: raise ValueError("Cannot return a copy of the target") from err @@ -338,7 +339,7 @@ def eval( try: with warnings.catch_warnings(record=True): # TODO: Filter the warnings we actually care about here. 
- if inplace and isinstance(target, NDFrame): + if inplace and isinstance(target, Series | DataFrame): target.loc[:, assigner] = ret else: target[ # pyright: ignore[reportGeneralTypeIssues] diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index dc1bcca213e..f04d9989dd4 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -67,7 +67,7 @@ def axes(self) -> list: >>> df.axes[1:] [Index(['col1', 'col2'], dtype='object')] """ - return [self.index, self.columns] + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def values(self) -> np.ndarray: diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 63b9f8199b6..417ccd2d0e5 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -5,7 +5,6 @@ import bigframes_vendored.constants as constants from bigframes_vendored.pandas.core import indexing -import bigframes_vendored.pandas.core.common as common if TYPE_CHECKING: from bigframes_vendored.pandas.pandas._typing import T @@ -395,10 +394,7 @@ def get(self, key, default=None): Any: same type as items contained in object """ - try: - return self[key] - except (KeyError, ValueError, IndexError): - return default + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def add_prefix(self, prefix: str, axis: int | str | None = None): """Prefix labels with string `prefix`. @@ -1227,16 +1223,7 @@ def pipe( bigframes.pandas.DataFrame or bigframes.pandas.Series: Object of same type as caller """ - return common.pipe(self, func, *args, **kwargs) - - def __nonzero__(self): - """Returns the truth value of the object.""" - raise ValueError( - f"The truth value of a {type(self).__name__} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." 
- ) - - __bool__ = __nonzero__ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __getattr__(self, name: str): """ diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 2c0f493d81e..775971ab358 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -32,6 +32,10 @@ class Series(NDFrame): # type: ignore[misc] + """ + One-dimensional ndarray with axis labels (including time series). + """ + @property def dt(self): """ From 630647820e97dcb4a4821ed36da49e965e093add Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:29:55 -0800 Subject: [PATCH 05/29] chore: fix udf text dedent format (#2468) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/functions/function_template.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/bigframes/functions/function_template.py b/bigframes/functions/function_template.py index a3680a7a88c..e48ffda8ed1 100644 --- a/bigframes/functions/function_template.py +++ b/bigframes/functions/function_template.py @@ -331,8 +331,10 @@ def generate_managed_function_code( udf_code = textwrap.dedent(inspect.getsource(get_pd_series)) udf_code = udf_code[udf_code.index("def") :] bigframes_handler_code = textwrap.dedent( - f"""def bigframes_handler(str_arg): - return {udf_name}({get_pd_series.__name__}(str_arg))""" + f""" + def bigframes_handler(str_arg): + return {udf_name}({get_pd_series.__name__}(str_arg)) + """ ) sig = inspect.signature(def_) @@ -352,15 +354,19 @@ def generate_managed_function_code( udf_call_str = ", ".join(udf_call_parts) bigframes_handler_code = textwrap.dedent( - f"""def bigframes_handler({handler_def_str}): - return {udf_name}({udf_call_str})""" + f""" + def bigframes_handler({handler_def_str}): + return {udf_name}({udf_call_str}) + """ ) else: udf_code = "" bigframes_handler_code = textwrap.dedent( - f"""def bigframes_handler(*args): - return {udf_name}(*args)""" + f""" + def bigframes_handler(*args): + return {udf_name}(*args) + """ ) udf_code_block = [] From 1d81b414acbc964502ca624eae72cdb8c14e1576 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 19 Feb 2026 16:26:14 -0800 Subject: [PATCH 06/29] fix: allow IsInOp with same dtypes regardless nullable (#2466) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update Ibis isin_op_impl to compare types by name, allowing comparisons between columns and literals with 
different nullability. - Update SQLGlot IsInOp implementation to use dtypes.can_compare for more robust type compatibility checking. - Improve dtypes.can_compare to gracefully handle type coercion failures. - Migrate TPCH verification script to tests/system/large/test_tpch.py for better integration with the test suite. Fixes 485642936 🦕 --- .../ibis_compiler/scalar_op_registry.py | 2 +- .../sqlglot/expressions/comparison_ops.py | 7 +- bigframes/dtypes.py | 7 +- scripts/tpch_result_verify.py | 128 ------------------ tests/system/large/test_tpch.py | 101 ++++++++++++++ 5 files changed, 108 insertions(+), 137 deletions(-) delete mode 100644 scripts/tpch_result_verify.py create mode 100644 tests/system/large/test_tpch.py diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 519b2c94426..9632e65e4d4 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -962,7 +962,7 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): # to actually cast it, as that could be lossy (eg float -> int) item_inferred_type = ibis_types.literal(item).type() if ( - x.type() == item_inferred_type + x.type().name == item_inferred_type.name or x.type().is_numeric() and item_inferred_type.is_numeric() ): diff --git a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py index 550a6c25be2..f767314be74 100644 --- a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py @@ -33,16 +33,11 @@ @register_unary_op(ops.IsInOp, pass_op=True) def _(expr: TypedExpr, op: ops.IsInOp) -> sge.Expression: values = [] - is_numeric_expr = dtypes.is_numeric(expr.dtype, include_bool=False) for value in op.values: if _is_null(value): continue dtype = dtypes.bigframes_type(type(value)) - if ( - expr.dtype == 
dtype - or is_numeric_expr - and dtypes.is_numeric(dtype, include_bool=False) - ): + if dtypes.can_compare(expr.dtype, dtype): values.append(sge.convert(value)) if op.match_nulls: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 8caddcdb002..a2abe9b817a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -370,8 +370,11 @@ def is_comparable(type_: ExpressionType) -> bool: def can_compare(type1: ExpressionType, type2: ExpressionType) -> bool: - coerced_type = coerce_to_common(type1, type2) - return is_comparable(coerced_type) + try: + coerced_type = coerce_to_common(type1, type2) + return is_comparable(coerced_type) + except TypeError: + return False def get_struct_fields(type_: ExpressionType) -> dict[str, Dtype]: diff --git a/scripts/tpch_result_verify.py b/scripts/tpch_result_verify.py deleted file mode 100644 index 0c932f6eac8..00000000000 --- a/scripts/tpch_result_verify.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os -import re - -from google.cloud import bigquery -import pandas as pd -from tqdm import tqdm - -import bigframes - -project_id = "bigframes-dev-perf" -dataset_id = "tpch_0001g" -dataset = { - "line_item_ds": f"bigframes-dev-perf.{dataset_id}.LINEITEM", - "region_ds": f"bigframes-dev-perf.{dataset_id}.REGION", - "nation_ds": f"bigframes-dev-perf.{dataset_id}.NATION", - "supplier_ds": f"bigframes-dev-perf.{dataset_id}.SUPPLIER", - "part_ds": f"bigframes-dev-perf.{dataset_id}.PART", - "part_supp_ds": f"bigframes-dev-perf.{dataset_id}.PARTSUPP", - "customer_ds": f"bigframes-dev-perf.{dataset_id}.CUSTOMER", - "orders_ds": f"bigframes-dev-perf.{dataset_id}.ORDERS", -} - - -def _execute_query(query): - client = bigquery.Client() - job_config = bigquery.QueryJobConfig(use_query_cache=False) - query_job = client.query(query, job_config=job_config) - query_job.result() - df = query_job.to_dataframe() - df.columns = df.columns.str.upper() - return df - - -def _initialize_session(ordered: bool): - context = bigframes.BigQueryOptions( - location="US", ordering_mode="strict" if ordered else "partial" - ) - session = bigframes.Session(context=context) - return session - - -def _verify_result(bigframes_query, sql_result): - exec_globals = {"_initialize_session": _initialize_session} - exec(bigframes_query, exec_globals) - bigframes_result = exec_globals.get("result") - if isinstance(bigframes_result, pd.DataFrame): - pd.testing.assert_frame_equal( - sql_result.reset_index(drop=True), - bigframes_result.reset_index(drop=True), - check_dtype=False, - ) - else: - assert sql_result.shape == (1, 1) - sql_scalar = sql_result.iloc[0, 0] - assert sql_scalar == bigframes_result - - -def verify(query_num=None): - range_iter = range(1, 23) if query_num is None else [query_num] - for i in tqdm(range_iter, desc="Processing queries"): - if query_num is not None and i != query_num: - continue - - # Execute SQL: - sql_file_path = 
f"third_party/bigframes_vendored/tpch/sql_queries/q{i}.sql" - with open(sql_file_path, "r") as f: - sql_query = f.read() - sql_query = sql_query.format(**dataset) - file_path = f"third_party/bigframes_vendored/tpch/queries/q{i}.py" - if os.path.exists(file_path): - with open(file_path, "r") as file: - file_content = file.read() - - file_content = re.sub( - r"next\((\w+)\.to_pandas_batches\((.*?)\)\)", - r"return \1.to_pandas()", - file_content, - ) - file_content = re.sub(r"_\s*=\s*(\w+)", r"return \1", file_content) - sql_result = _execute_query(sql_query) - - print(f"Checking {file_path} in ordered session") - bigframes_query = ( - file_content - + f"\nresult = q('{project_id}', '{dataset_id}', _initialize_session(ordered=True))" - ) - _verify_result(bigframes_query, sql_result) - - print(f"Checking {file_path} in unordered session") - bigframes_query = ( - file_content - + f"\nresult = q('{project_id}', '{dataset_id}', _initialize_session(ordered=False))" - ) - _verify_result(bigframes_query, sql_result) - - else: - raise FileNotFoundError(f"File {file_path} not found.") - - -if __name__ == "__main__": - """ - Runs verification of TPCH benchmark script outputs to ensure correctness for a specified query or all queries - with 1GB dataset. - - Example: - python scripts/tpch_result_verify.py -q 15 # Verifies TPCH query number 15 - python scripts/tpch_result_verify.py # Verifies all TPCH queries from 1 to 22 - """ - parser = argparse.ArgumentParser() - parser.add_argument("-q", "--query_number", type=int, default=None) - args = parser.parse_args() - - verify(args.query_number) diff --git a/tests/system/large/test_tpch.py b/tests/system/large/test_tpch.py new file mode 100644 index 00000000000..7cb243b0a39 --- /dev/null +++ b/tests/system/large/test_tpch.py @@ -0,0 +1,101 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re + +from google.cloud import bigquery +import pandas as pd +import pytest + +TPCH_PATH = "third_party/bigframes_vendored/tpch" +PROJECT_ID = "bigframes-dev-perf" +DATASET_ID = "tpch_0001g" +DATASET = { + "line_item_ds": f"{PROJECT_ID}.{DATASET_ID}.LINEITEM", + "region_ds": f"{PROJECT_ID}.{DATASET_ID}.REGION", + "nation_ds": f"{PROJECT_ID}.{DATASET_ID}.NATION", + "supplier_ds": f"{PROJECT_ID}.{DATASET_ID}.SUPPLIER", + "part_ds": f"{PROJECT_ID}.{DATASET_ID}.PART", + "part_supp_ds": f"{PROJECT_ID}.{DATASET_ID}.PARTSUPP", + "customer_ds": f"{PROJECT_ID}.{DATASET_ID}.CUSTOMER", + "orders_ds": f"{PROJECT_ID}.{DATASET_ID}.ORDERS", +} + + +def _execute_sql_query(bigquery_client, sql_query): + sql_query = sql_query.format(**DATASET) + + job_config = bigquery.QueryJobConfig(use_query_cache=False) + query_job = bigquery_client.query(sql_query, job_config=job_config) + query_job.result() + df = query_job.to_dataframe() + df.columns = df.columns.str.upper() + return df + + +def _execute_bigframes_script(session, bigframes_script): + bigframes_script = re.sub( + r"next\((\w+)\.to_pandas_batches\((.*?)\)\)", + r"return \1.to_pandas()", + bigframes_script, + ) + bigframes_script = re.sub(r"_\s*=\s*(\w+)", r"return \1", bigframes_script) + + bigframes_script = ( + bigframes_script + + f"\nresult = q('{PROJECT_ID}', '{DATASET_ID}', _initialize_session)" + ) + exec_globals = {"_initialize_session": session} + exec(bigframes_script, exec_globals) + bigframes_result = exec_globals.get("result") + return bigframes_result + + +def 
_verify_result(bigframes_result, sql_result): + if isinstance(bigframes_result, pd.DataFrame): + pd.testing.assert_frame_equal( + sql_result.reset_index(drop=True), + bigframes_result.reset_index(drop=True), + check_dtype=False, + ) + else: + assert sql_result.shape == (1, 1) + sql_scalar = sql_result.iloc[0, 0] + assert sql_scalar == bigframes_result + + +@pytest.mark.parametrize("query_num", range(1, 23)) +@pytest.mark.parametrize("ordered", [True, False]) +def test_tpch_correctness(session, unordered_session, query_num, ordered): + """Runs verification of TPCH benchmark script outputs to ensure correctness.""" + # Execute SQL: + sql_file_path = f"{TPCH_PATH}/sql_queries/q{query_num}.sql" + assert os.path.exists(sql_file_path) + with open(sql_file_path, "r") as f: + sql_query = f.read() + + sql_result = _execute_sql_query(session.bqclient, sql_query) + + # Execute BigFrames: + file_path = f"{TPCH_PATH}/queries/q{query_num}.py" + assert os.path.exists(file_path) + with open(file_path, "r") as file: + bigframes_script = file.read() + + bigframes_result = _execute_bigframes_script( + session if ordered else unordered_session, bigframes_script + ) + + _verify_result(bigframes_result, sql_result) From f70f93a1227add1627d522d7e55a37f42fc3549e Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 20 Feb 2026 10:29:10 -0800 Subject: [PATCH 07/29] feat: add dt.tz_localize() (#2469) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only `None` and `"UTC"` time zones are supported in this version. 
Fixes b/481069646 🦕 --- .../ibis_compiler/scalar_op_registry.py | 9 +++-- .../sqlglot/expressions/datetime_ops.py | 4 +-- bigframes/operations/datetime_ops.py | 2 ++ bigframes/operations/datetimes.py | 17 ++++++++- .../system/small/operations/test_datetimes.py | 36 +++++++++++++++++++ .../test_to_datetime/out.sql | 3 +- .../test_to_timestamp/out.sql | 3 +- .../sqlglot/expressions/test_datetime_ops.py | 5 +-- .../pandas/core/indexes/accessor.py | 30 ++++++++++++++++ 9 files changed, 99 insertions(+), 10 deletions(-) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 9632e65e4d4..4df99b1e528 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -978,7 +978,7 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): - if x.type() == ibis_dtypes.str: + if x.type() in (ibis_dtypes.str, ibis_dtypes.Timestamp("UTC")): # type: ignore return x.try_cast(ibis_dtypes.Timestamp(None)) # type: ignore else: # Numerical inputs. @@ -1001,6 +1001,9 @@ def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp): if op.format else timestamp(x) ) + elif x.type() == ibis_dtypes.Timestamp(None): # type: ignore + + return timestamp(x) else: # Numerical inputs. 
if op.format: @@ -2016,8 +2019,8 @@ def _ibis_num(number: float): @ibis_udf.scalar.builtin -def timestamp(a: str) -> ibis_dtypes.timestamp: # type: ignore - """Convert string to timestamp.""" +def timestamp(a) -> ibis_dtypes.timestamp: # type: ignore + """Convert string or a datetime to timestamp.""" @ibis_udf.scalar.builtin diff --git a/bigframes/core/compile/sqlglot/expressions/datetime_ops.py b/bigframes/core/compile/sqlglot/expressions/datetime_ops.py index 82f2f34edf3..a1c70262d55 100644 --- a/bigframes/core/compile/sqlglot/expressions/datetime_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/datetime_ops.py @@ -371,7 +371,7 @@ def _(expr: TypedExpr, op: ops.ToDatetimeOp) -> sge.Expression: ) return sge.Cast(this=result, to="DATETIME") - if expr.dtype == dtypes.STRING_DTYPE: + if expr.dtype in (dtypes.STRING_DTYPE, dtypes.TIMESTAMP_DTYPE): return sge.TryCast(this=expr.expr, to="DATETIME") value = expr.expr @@ -396,7 +396,7 @@ def _(expr: TypedExpr, op: ops.ToTimestampOp) -> sge.Expression: "PARSE_TIMESTAMP", sge.convert(op.format), expr.expr, sge.convert("UTC") ) - if expr.dtype == dtypes.STRING_DTYPE: + if expr.dtype in (dtypes.STRING_DTYPE, dtypes.DATETIME_DTYPE): return sge.func("TIMESTAMP", expr.expr) value = expr.expr diff --git a/bigframes/operations/datetime_ops.py b/bigframes/operations/datetime_ops.py index 9988e8ed7b9..19541a383c8 100644 --- a/bigframes/operations/datetime_ops.py +++ b/bigframes/operations/datetime_ops.py @@ -73,6 +73,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT dtypes.INT_DTYPE, dtypes.STRING_DTYPE, dtypes.DATE_DTYPE, + dtypes.TIMESTAMP_DTYPE, ): raise TypeError("expected string or numeric input") return pd.ArrowDtype(pa.timestamp("us", tz=None)) @@ -91,6 +92,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT dtypes.INT_DTYPE, dtypes.STRING_DTYPE, dtypes.DATE_DTYPE, + dtypes.DATETIME_DTYPE, ): raise TypeError("expected string or numeric input") return 
pd.ArrowDtype(pa.timestamp("us", tz="UTC")) diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 2eedb96b43e..f66c37bb645 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -15,7 +15,7 @@ from __future__ import annotations import datetime as dt -from typing import Optional +from typing import Literal, Optional import bigframes_vendored.pandas.core.arrays.datetimelike as vendored_pandas_datetimelike import bigframes_vendored.pandas.core.indexes.accessor as vendordt @@ -147,6 +147,21 @@ def tz(self) -> Optional[dt.timezone]: else: raise ValueError(f"Unexpected timezone {tz_string}") + def tz_localize(self, tz: Literal["UTC"] | None) -> series.Series: + if tz == "UTC": + if self._data.dtype == dtypes.TIMESTAMP_DTYPE: + raise ValueError("Already tz-aware.") + + return self._data._apply_unary_op(ops.ToTimestampOp()) + + if tz is None: + if self._data.dtype == dtypes.DATETIME_DTYPE: + return self._data # no-op + + return self._data._apply_unary_op(ops.ToDatetimeOp()) + + raise ValueError(f"Unsupported timezone {tz}") + @property def unit(self) -> str: # Assumption: pyarrow dtype diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 0e023189d56..ad632e1c2ca 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -324,6 +324,42 @@ def test_dt_tz(scalars_dfs, col_name): assert bf_result == pd_result +@pytest.mark.parametrize( + ("col_name", "tz"), + [ + ("datetime_col", None), + ("timestamp_col", None), + ("datetime_col", "UTC"), + ], +) +def test_dt_tz_localize(scalars_dfs, col_name, tz): + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df[col_name] + + bf_result = bf_series.dt.tz_localize(tz) + pd_result = scalars_pandas_df[col_name].dt.tz_localize(tz) + + testing.assert_series_equal( + 
bf_result.to_pandas(), pd_result, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("col_name", "tz"), + [ + ("timestamp_col", "UTC"), + ("datetime_col", "US/Eastern"), + ], +) +def test_dt_tz_localize_invalid_inputs(scalars_dfs, col_name, tz): + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + scalars_df[col_name].dt.tz_localize(tz) + + @pytest.mark.parametrize( ("col_name",), DATETIME_COL_NAMES, diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql index 5cbfa3dbe77..3d0b8213b6e 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_datetime/out.sql @@ -1,5 +1,6 @@ SELECT CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS DATETIME) AS `int64_col`, SAFE_CAST(`string_col` AS DATETIME), - CAST(TIMESTAMP_MICROS(CAST(TRUNC(`float64_col` * 0.001) AS INT64)) AS DATETIME) AS `float64_col` + CAST(TIMESTAMP_MICROS(CAST(TRUNC(`float64_col` * 0.001) AS INT64)) AS DATETIME) AS `float64_col`, + SAFE_CAST(`timestamp_col` AS DATETIME) FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql index eb829c05804..1e8910fad7c 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_datetime_ops/test_to_timestamp/out.sql @@ -4,5 +4,6 @@ SELECT CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 1000000) AS INT64)) AS TIMESTAMP) AS `int64_col_s`, 
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 1000) AS INT64)) AS TIMESTAMP) AS `int64_col_ms`, CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col`) AS INT64)) AS TIMESTAMP) AS `int64_col_us`, - CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS TIMESTAMP) AS `int64_col_ns` + CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS TIMESTAMP) AS `int64_col_ns`, + TIMESTAMP(`datetime_col`) AS `datetime_col` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py index 95156748e96..76966d3c9bb 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py @@ -180,7 +180,7 @@ def test_time(scalar_types_df: bpd.DataFrame, snapshot): def test_to_datetime(scalar_types_df: bpd.DataFrame, snapshot): - col_names = ["int64_col", "string_col", "float64_col"] + col_names = ["int64_col", "string_col", "float64_col", "timestamp_col"] bf_df = scalar_types_df[col_names] ops_map = {col_name: ops.ToDatetimeOp().as_expr(col_name) for col_name in col_names} @@ -189,7 +189,7 @@ def test_to_datetime(scalar_types_df: bpd.DataFrame, snapshot): def test_to_timestamp(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col", "string_col", "float64_col"]] + bf_df = scalar_types_df[["int64_col", "string_col", "float64_col", "datetime_col"]] ops_map = { "int64_col": ops.ToTimestampOp().as_expr("int64_col"), "float64_col": ops.ToTimestampOp().as_expr("float64_col"), @@ -197,6 +197,7 @@ def test_to_timestamp(scalar_types_df: bpd.DataFrame, snapshot): "int64_col_ms": ops.ToTimestampOp(unit="ms").as_expr("int64_col"), "int64_col_us": ops.ToTimestampOp(unit="us").as_expr("int64_col"), "int64_col_ns": ops.ToTimestampOp(unit="ns").as_expr("int64_col"), + "datetime_col": 
ops.ToTimestampOp().as_expr("datetime_col"), } sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index a0388317be8..a3404c222d4 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -1,3 +1,5 @@ +from typing import Literal + from bigframes import constants @@ -499,6 +501,34 @@ def tz(self): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def tz_localize(self, tz: Literal["UTC"] | None): + """Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. + + This method takes a time zone (tz) naive Datetime Array/Index object and makes + this time zone aware. It does not move the time to another time zone. Only "UTC" + timezone is supported. + + This method can also be used to do the inverse - to create a time zone unaware + object from an aware object. To that end, pass tz=None. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> s = bpd.Series([pd.Timestamp(year = 2026, month=1, day=1)]) + >>> s + 0 2026-01-01 00:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.tz_localize('UTC') + 0 2026-01-01 00:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + + Returns: + A BigFrames series with the updated timezone. + """ + + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def unit(self) -> str: """Returns the unit of time precision. 
From a9512498ef39b9d5260cad2ca0513c701a6d3592 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 20 Feb 2026 10:35:02 -0800 Subject: [PATCH 08/29] docs: Skip inherited methods, use autosummary only for big classes (#2470) --- .../autosummary/templates/autosummary/class.rst | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/third_party/sphinx/ext/autosummary/templates/autosummary/class.rst b/third_party/sphinx/ext/autosummary/templates/autosummary/class.rst index 89550cb3864..6651591be64 100644 --- a/third_party/sphinx/ext/autosummary/templates/autosummary/class.rst +++ b/third_party/sphinx/ext/autosummary/templates/autosummary/class.rst @@ -2,11 +2,13 @@ .. currentmodule:: {{ module }} +{% set is_pandas = module.startswith("bigframes.pandas") or module.startswith("bigframes.geopandas") %} +{% set skip_inherited = is_pandas and not module.startswith("bigframes.pandas.typing.api") %} + +{% if is_pandas %} .. autoclass:: {{ objname }} :no-members: - {% block methods %} - {% block attributes %} {% if attributes %} .. rubric:: {{ _('Attributes') }} @@ -14,18 +16,27 @@ .. autosummary:: :toctree: {% for item in attributes %} + {%- if not skip_inherited or not item in inherited_members%} ~{{ name }}.{{ item }} + {%- endif %} {%- endfor %} {% endif %} {% endblock %} + {% block methods %} {% if methods %} .. rubric:: {{ _('Methods') }} .. autosummary:: :toctree: + {% for item in methods %} + {%- if not skip_inherited or not item in inherited_members%} ~{{ name }}.{{ item }} + {%- endif %} {%- endfor %} {% endif %} {% endblock %} +{% else %} +.. 
autoclass:: {{ objname }} +{% endif %} From 61a948451baeb1caa323e721ad88b31c7cd0b3cb Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 20 Feb 2026 12:25:35 -0800 Subject: [PATCH 09/29] docs: Move readme content to new User Guide section (#2464) --- README.rst | 3 +- docs/conf.py | 11 +------ docs/index.rst | 61 ++++++++++++++++++++++++++++++++++++++- docs/user_guide/index.rst | 11 +++++++ 4 files changed, 73 insertions(+), 13 deletions(-) create mode 100644 docs/user_guide/index.rst diff --git a/README.rst b/README.rst index 366062b1d3a..ef9bcd50523 100644 --- a/README.rst +++ b/README.rst @@ -1,8 +1,7 @@ -:orphan: - BigQuery DataFrames (BigFrames) =============================== + |GA| |pypi| |versions| BigQuery DataFrames (also known as BigFrames) provides a Pythonic DataFrame diff --git a/docs/conf.py b/docs/conf.py index 9883467edfa..b4954ac6592 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -118,6 +118,7 @@ "samples/AUTHORING_GUIDE.md", "samples/CONTRIBUTING.md", "samples/snippets/README.rst", + "README.rst", # used for include in overview.rst only ] # The reST default role (used for this markup: `text`) to use for all @@ -163,16 +164,6 @@ "logo": { "text": "BigQuery DataFrames (BigFrames)", }, - "external_links": [ - { - "name": "Getting started", - "url": "https://docs.cloud.google.com/bigquery/docs/dataframes-quickstart", - }, - { - "name": "User guide", - "url": "https://docs.cloud.google.com/bigquery/docs/bigquery-dataframes-introduction", - }, - ], "analytics": { "google_analytics_id": "G-XVSRMCJ37X", }, diff --git a/docs/index.rst b/docs/index.rst index b17ac7cbd9c..00c59a6745e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,63 @@ -.. include:: README.rst +.. BigQuery DataFrames documentation main file + +Welcome to BigQuery DataFrames +============================== + +**BigQuery DataFrames** (``bigframes``) provides a Pythonic interface for data analysis that scales to petabytes. 
It gives you the best of both worlds: the familiar API of **pandas** and **scikit-learn**, powered by the distributed computing engine of **BigQuery**. + +BigQuery DataFrames consists of three main components: + +* **bigframes.pandas**: A pandas-compatible API for data exploration and transformation. +* **bigframes.ml**: A scikit-learn-like interface for BigQuery ML, including integration with Gemini. +* **bigframes.bigquery**: Specialized functions for managing BigQuery resources and deploying custom logic. + +Why BigQuery DataFrames? +------------------------ + +BigFrames allows you to process data where it lives. Instead of downloading massive datasets to your local machine, BigFrames translates your Python code into SQL and executes it across the BigQuery fleet. + +* **Scalability:** Work with datasets that exceed local memory limits without complex refactoring. +* **Collaboration & Extensibility:** Bridge the gap between Python and SQL. Deploy custom Python functions to BigQuery, making your logic accessible to SQL-based teammates and data analysts. +* **Production-Ready Pipelines:** Move seamlessly from interactive notebooks to production. BigFrames simplifies data engineering by integrating with tools like **dbt** and **Airflow**, offering a simpler operational model than Spark. +* **Security & Governance:** Keep your data within the BigQuery perimeter. Benefit from enterprise-grade security, auditing, and data governance throughout your entire Python workflow. +* **Familiarity:** Use ``read_gbq``, ``merge``, ``groupby``, and ``pivot_table`` just like you do in pandas. + +Quickstart +---------- + +Install the library via pip: + +.. code-block:: bash + + pip install --upgrade bigframes + +Load and aggregate a public dataset in just a few lines: + +.. 
code-block:: python + + import bigframes.pandas as bpd + + # Load data from BigQuery + df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") + + # Perform familiar pandas operations at scale + top_names = ( + df.groupby("name") + .agg({"number": "sum"}) + .sort_values("number", ascending=False) + .head(10) + ) + + print(top_names.to_pandas()) + + +User Guide +---------- + +.. toctree:: + :maxdepth: 2 + + user_guide/index API reference ------------- diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst new file mode 100644 index 00000000000..18644829b33 --- /dev/null +++ b/docs/user_guide/index.rst @@ -0,0 +1,11 @@ +User Guide +********** + +.. include:: ../README.rst + +.. toctree:: + :caption: Guides + :maxdepth: 1 + + Getting Started + Cloud Docs User Guides From f1bbba23667f01d3b8e7c51b18fe64641a4b135f Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 23 Feb 2026 07:47:58 -0800 Subject: [PATCH 10/29] docs: add bigframes default connection warning (#2471) fixes b/471256706 --- bigframes/bigquery/_operations/ai.py | 2 +- bigframes/functions/_function_session.py | 2 +- bigframes/operations/blob.py | 2 +- bigframes/session/__init__.py | 12 +++++++++++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 477ca91366f..dd9c4e236b1 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -1018,7 +1018,7 @@ def _convert_series( def _resolve_connection_id(series: series.Series, connection_id: str | None): return clients.get_canonical_bq_connection_id( - connection_id or series._session._bq_connection, + connection_id or series._session.bq_connection, series._session._project, series._session._location, ) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a456f054170..b0fc25219af 100644 --- 
a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -162,7 +162,7 @@ def _resolve_bigquery_connection_id( ) -> str: """Resolves BigQuery connection id.""" if not bigquery_connection: - bigquery_connection = session._bq_connection # type: ignore + bigquery_connection = session.bq_connection # type: ignore bigquery_connection = clients.get_canonical_bq_connection_id( bigquery_connection, diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 9210addaa81..fd8509672dd 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -311,7 +311,7 @@ def _resolve_connection(self, connection: Optional[str] = None) -> str: Raises: ValueError: If the connection cannot be resolved to a valid string. """ - connection = connection or self._data._block.session._bq_connection + connection = connection or self._data._block.session.bq_connection return clients.get_canonical_bq_connection_id( connection, default_project=self._data._block.session._project, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 757bb50a940..23f4178f3dd 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -374,6 +374,16 @@ def _allows_ambiguity(self) -> bool: def _anonymous_dataset(self): return self._anon_dataset_manager.dataset + @property + def bq_connection(self) -> str: + msg = bfe.format_message( + f"""You are using the BigFrames session default connection: {self._bq_connection}, + which can be different from the BigQuery project default connection. + This default connection may change in the future.""" + ) + warnings.warn(msg, category=FutureWarning) + return self._bq_connection + def __hash__(self): # Stable hash needed to use in expression tree return hash(str(self._session_id)) @@ -2253,7 +2263,7 @@ def _create_bq_connection( ) -> str: """Create the connection with the session settings and try to attach iam role to the connection SA. 
If any of project, location or connection isn't specified, use the session defaults. Returns fully-qualified connection name.""" - connection = self._bq_connection if not connection else connection + connection = self.bq_connection if not connection else connection connection = bigframes.clients.get_canonical_bq_connection_id( connection_id=connection, default_project=self._project, From ae5c8b322765aef51eed016bfacaff5a7a917a7b Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 23 Feb 2026 09:58:03 -0800 Subject: [PATCH 11/29] feat: Support pd.col expressions with .loc and getitem (#2473) --- bigframes/core/array_value.py | 7 ++++++- bigframes/core/indexers.py | 16 +++++++++++++++- bigframes/dataframe.py | 7 ++++++- tests/unit/test_col.py | 18 ++++++++++++++++++ 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index ccec1f9b954..b20c6561ea9 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -204,7 +204,12 @@ def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue return self.filter(predicate) def filter(self, predicate: ex.Expression): - return ArrayValue(nodes.FilterNode(child=self.node, predicate=predicate)) + if predicate.is_scalar_expr: + return ArrayValue(nodes.FilterNode(child=self.node, predicate=predicate)) + else: + arr, filter_ids = self.compute_general_expression([predicate]) + arr = arr.filter_by_id(filter_ids[0]) + return arr.drop_columns(filter_ids) def order_by( self, by: Sequence[OrderingExpression], is_total_order: bool = False diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index c60e40880b7..987edf2339c 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -23,6 +23,7 @@ import pandas as pd import bigframes.core.blocks +import bigframes.core.col import bigframes.core.expression as ex import bigframes.core.guid as guid import bigframes.core.indexes as indexes @@ -36,7 
+37,11 @@ if typing.TYPE_CHECKING: LocSingleKey = Union[ - bigframes.series.Series, indexes.Index, slice, bigframes.core.scalar.Scalar + bigframes.series.Series, + indexes.Index, + slice, + bigframes.core.scalar.Scalar, + bigframes.core.col.Expression, ] @@ -309,6 +314,15 @@ def _loc_getitem_series_or_dataframe( raise NotImplementedError( f"loc does not yet support indexing with a slice. {constants.FEEDBACK_LINK}" ) + if isinstance(key, bigframes.core.col.Expression): + label_to_col_ref = { + label: ex.deref(id) + for id, label in series_or_dataframe._block.col_id_to_label.items() + } + resolved_expr = key._value.bind_variables(label_to_col_ref) + result = series_or_dataframe.copy() + result._set_block(series_or_dataframe._block.filter(resolved_expr)) + return result if callable(key): raise NotImplementedError( f"loc does not yet support indexing with a callable. {constants.FEEDBACK_LINK}" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2a22fc4487d..2c734f2943e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -623,13 +623,18 @@ def __getitem__( ): # No return type annotations (like pandas) as type cannot always be determined statically # NOTE: This implements the operations described in # https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html + import bigframes.core.col + import bigframes.pandas - if isinstance(key, bigframes.series.Series): + if isinstance(key, bigframes.pandas.Series): return self._getitem_bool_series(key) if isinstance(key, slice): return self.iloc[key] + if isinstance(key, bigframes.core.col.Expression): + return self.loc[key] + # TODO(tswast): Fix this pylance warning: Class overlaps "Hashable" # unsafely and could produce a match at runtime if isinstance(key, blocks.Label): diff --git a/tests/unit/test_col.py b/tests/unit/test_col.py index e01c25ddd2c..9c9088e037c 100644 --- a/tests/unit/test_col.py +++ b/tests/unit/test_col.py @@ -158,3 +158,21 @@ def 
test_pd_col_binary_bool_operators(scalars_dfs, op): pd_result = scalars_pandas_df.assign(**pd_kwargs) assert_frame_equal(bf_result, pd_result) + + +def test_loc_with_pd_col(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.loc[bpd.col("float64_col") > 4].to_pandas() + pd_result = scalars_pandas_df.loc[pd.col("float64_col") > 4] # type: ignore + + assert_frame_equal(bf_result, pd_result) + + +def test_getitem_with_pd_col(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[bpd.col("float64_col") > 4].to_pandas() + pd_result = scalars_pandas_df[pd.col("float64_col") > 4] # type: ignore + + assert_frame_equal(bf_result, pd_result) From 8a1a82f7a0fd224f2b075c68ab116d1f580d1d82 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 23 Feb 2026 10:40:48 -0800 Subject: [PATCH 12/29] docs: use direct API for image (#2465) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR updates notebooks/multimodal/multimodal_dataframe.ipynb to demonstrate image modifications using custom BigQuery Python UDFs with the opencv library. 
verified at: screen/BfesAowVQWEGsCf Fixes #<478952827> 🦕 --- .../multimodal/multimodal_dataframe.ipynb | 987 ++++++++---------- 1 file changed, 440 insertions(+), 547 deletions(-) diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index a578910b658..89af5767113 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -171,37 +171,7 @@ " return bbq.json_value(get_metadata(series), \"$.size\").astype(\"Int64\")\n", "\n", "def get_updated(series):\n", - " return bpd.to_datetime(bbq.json_value(get_metadata(series), \"$.updated\").astype(\"Int64\"), unit=\"us\", utc=True)\n", - "\n", - "def display_blob(series, n=3):\n", - " import IPython.display as ipy_display\n", - " import pandas as pd\n", - " import requests\n", - " \n", - " # Retrieve access URLs and content types\n", - " runtime_json = bbq.to_json_string(bbq.obj.get_access_url(series, mode=\"R\"))\n", - " read_url = bbq.json_value(runtime_json, \"$.access_urls.read_url\")\n", - " content_type = get_content_type(series)\n", - " \n", - " # Pull to pandas to display\n", - " pdf = bpd.DataFrame({\"read_url\": read_url, \"content_type\": content_type}).head(n).to_pandas()\n", - " \n", - " width = bigframes.options.display.blob_display_width\n", - " height = bigframes.options.display.blob_display_height\n", - " \n", - " for _, row in pdf.iterrows():\n", - " if pd.isna(row[\"read_url\"]):\n", - " ipy_display.display(\"\")\n", - " elif pd.isna(row[\"content_type\"]):\n", - " ipy_display.display(requests.get(row[\"read_url\"]).content)\n", - " elif row[\"content_type\"].casefold().startswith(\"image\"):\n", - " 
ipy_display.display(ipy_display.Image(url=row[\"read_url\"], width=width, height=height))\n", - " elif row[\"content_type\"].casefold().startswith(\"audio\"):\n", - " ipy_display.display(ipy_display.Audio(requests.get(row[\"read_url\"]).content))\n", - " elif row[\"content_type\"].casefold().startswith(\"video\"):\n", - " ipy_display.display(ipy_display.Video(row[\"read_url\"], width=width, height=height))\n", - " else:\n", - " ipy_display.display(requests.get(row[\"read_url\"]).content)" + " return bpd.to_datetime(bbq.json_value(get_metadata(series), \"$.updated\").astype(\"Int64\"), unit=\"us\", utc=True)" ] }, { @@ -229,13 +199,7 @@ "# Create blob columns from wildcard path.\n", "df_image = bpd.from_glob_path(\n", " \"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*\", name=\"image\"\n", - ")\n", - "# Other ways are: from string uri column\n", - "# df = bpd.DataFrame({\"uri\": [\"gs:///\", \"gs:///\"]})\n", - "# df[\"blob_col\"] = df[\"uri\"].str.to_blob()\n", - "\n", - "# From an existing object table\n", - "# df = bpd.read_gbq_object_table(\"\", name=\"blob_col\")" + ")" ] }, { @@ -254,7 +218,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", @@ -289,23 +253,23 @@ " \n", " \n", " 0\n", - " \n", + " \n", " \n", " \n", " 1\n", - " \n", + " \n", " \n", " \n", " 2\n", - " \n", + " \n", " \n", " \n", " 3\n", - " \n", + " \n", " \n", " \n", " 4\n", - " \n", + " \n", " \n", " \n", 
"\n", @@ -314,11 +278,11 @@ ], "text/plain": [ " image\n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5...\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", "\n", "[5 rows x 1 columns]" ] @@ -363,7 +327,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", @@ -402,7 +366,7 @@ " \n", " \n", " 0\n", - " \n", + " \n", " alice\n", " image/png\n", " 1591240\n", @@ -410,7 +374,7 @@ " \n", " \n", " 1\n", - " \n", + " \n", " bob\n", " image/png\n", " 1182951\n", @@ -418,7 +382,7 @@ " \n", " \n", " 2\n", - " \n", + " \n", " bob\n", " image/png\n", " 1520884\n", @@ -426,7 +390,7 @@ " \n", " \n", " 3\n", - " \n", + " \n", " alice\n", " image/png\n", " 1235401\n", @@ -434,7 +398,7 @@ " \n", " \n", " 4\n", - " \n", + " \n", " bob\n", " image/png\n", " 1591923\n", @@ -447,11 +411,11 @@ ], "text/plain": [ " image author content_type \\\n", - "0 
{\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... alice image/png \n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... bob image/png \n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... bob image/png \n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... alice image/png \n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-13T01:5... bob image/png \n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... alice image/png \n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... bob image/png \n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... bob image/png \n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... alice image/png \n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... bob image/png \n", "\n", " size updated \n", "0 1591240 2025-03-20 17:45:04+00:00 \n", @@ -478,294 +442,48 @@ "df_image" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "NUd4Kog_QLRS" - }, - "source": [ - "Then you can filter the rows based on the structured data. And for different content types, you can display them respectively or together." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 75 - }, - "id": "UGuAk9PNDRF3", - "outputId": "73feb33d-4a05-48fb-96e5-3c48c2a456f3" - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# filter images and display, you can also display audio and video types\n", - "display_blob(df_image[df_image[\"author\"] == \"alice\"][\"image\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1IJuakwJTZey" - }, - "source": [ - "### 3. 
Conduct image transformations\n", - "BigFrames Multimodal DataFrame provides image(and other) transformation functions. Such as image_blur, image_resize and image_normalize. The output can be saved to GCS folders or to BQ as bytes." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VWsl5BBPJ6N7", - "outputId": "45d2356e-322b-4982-cfa7-42d034dc4344" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n", - 
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n" - ] - } - ], - "source": [ - "df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n", - " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n", - ")\n", - "df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n", - " (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n", - ")\n", - "df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n", - " alpha=50.0,\n", - " beta=150.0,\n", - " norm_type=\"minmax\",\n", - " dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n", - " engine=\"opencv\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rWCAGC8w64vU", - "outputId": "d7d456f0-8b56-492c-fe1b-967e9664d813" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " 
warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n" - ] - } - ], - "source": [ - "# You can also chain functions together\n", - "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Using `verbose` mode for detailed output\\n\n", - "\\n\n", - "All multimodal functions support a `verbose` parameter, which defaults to `False`.\\n\n", - "\\n\n", - "* When `verbose=False` (the default), the function will only return the main content of the result (e.g., the transformed image, the extracted text).\\n\n", - "* When `verbose=True`, the function returns a `STRUCT` containing two fields:\\n\n", - " * `content`: The main result of the operation.\\n\n", - " * `status`: An informational field. If the operation is successful, this will be empty. If an error occurs during the processing of a specific row, this field will contain the error message, allowing the overall job to complete without failing.\\n\n", - "\\n\n", - "Using `verbose=True` is highly recommended for debugging and for workflows where you need to handle potential failures on a row-by-row basis. Let's see it in action with the `image_blur` function." + "### 3. 
Conduct image transformations" ] }, { - "cell_type": "code", - "execution_count": 10, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
blurred_verbose
0{'status': '', 'content': {'uri': 'gs://bigfra...
1{'status': '', 'content': {'uri': 'gs://bigfra...
2{'status': '', 'content': {'uri': 'gs://bigfra...
3{'status': '', 'content': {'uri': 'gs://bigfra...
4{'status': '', 'content': {'uri': 'gs://bigfra...
\n", - "

5 rows × 1 columns

\n", - "
[5 rows x 1 columns in total]" - ], - "text/plain": [ - " blurred_verbose\n", - "0 {'status': '', 'content': {'uri': 'gs://bigfra...\n", - "1 {'status': '', 'content': {'uri': 'gs://bigfra...\n", - "2 {'status': '', 'content': {'uri': 'gs://bigfra...\n", - "3 {'status': '', 'content': {'uri': 'gs://bigfra...\n", - "4 {'status': '', 'content': {'uri': 'gs://bigfra...\n", - "\n", - "[5 rows x 1 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "df_image[\"blurred_verbose\"] = df_image[\"image\"].blob.image_blur(\n", - " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed_verbose/\", engine=\"opencv\", verbose=True\n", - ")\n", - "df_image[[\"blurred_verbose\"]]" + "This section demonstrates how to perform image transformations like blur, resize, and normalize using custom BigQuery Python UDFs and the `opencv-python` library." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 605 + "height": 487 }, - "id": "6NGK6GYSU44B", - "outputId": "859101c1-2ee4-4f9a-e250-e8947127420a" + "id": "HhCb8jRsLe9B", + "outputId": "03081cf9-3a22-42c9-b38f-649f592fdada" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available 
in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:151: PreviewWarning: udf is in preview.\n", + " return global_session.with_default_session(\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4655: FunctionAxisOnePreviewWarning: DataFrame.apply with parameter axis=1 scenario is in preview.\n", + " warnings.warn(msg, category=bfe.FunctionAxisOnePreviewWarning)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, 
bigframes.exceptions.JSONDtypeWarning)\n" + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" ] }, { @@ -790,148 +508,136 @@ " \n", " \n", " image\n", - " author\n", - " content_type\n", - " size\n", - " updated\n", " blurred\n", - " resized\n", - " normalized\n", - " blur_resized\n", - " blurred_verbose\n", " \n", " \n", " \n", " \n", " 0\n", - " \n", - " alice\n", - " image/png\n", - " 1591240\n", - " 2025-03-20 17:45:04+00:00\n", - " \n", - " \n", - " \n", - " \n", - " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-paw-balm.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", + " \n", + " \n", " \n", " \n", " 1\n", - " \n", - " bob\n", - " image/png\n", - " 1182951\n", - " 2025-03-20 17:45:02+00:00\n", - " \n", - " \n", - " \n", - " \n", - " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-hot-spot-spray.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", + " \n", + " \n", " \n", " \n", " 2\n", - " \n", - " bob\n", - " image/png\n", - " 1520884\n", - " 2025-03-20 17:44:55+00:00\n", - " \n", - " \n", - " \n", - " \n", - " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/fluffy-buns-chinchilla-food-variety-pack.png', 'version': None, 'authorizer': 
'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", + " \n", + " \n", " \n", " \n", " 3\n", - " \n", - " alice\n", - " image/png\n", - " 1235401\n", - " 2025-03-20 17:45:19+00:00\n", - " \n", - " \n", - " \n", - " \n", - " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/purrfect-perch-cat-scratcher.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", + " \n", + " \n", " \n", " \n", " 4\n", - " \n", - " bob\n", - " image/png\n", - " 1591923\n", - " 2025-03-20 17:44:47+00:00\n", - " \n", - " \n", - " \n", - " \n", - " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/chirpy-seed-deluxe-bird-food.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", + " \n", + " \n", " \n", " \n", "\n", - "

5 rows × 10 columns

\n", - "[5 rows x 10 columns in total]" + "

5 rows × 2 columns

\n", + "[5 rows x 2 columns in total]" ], "text/plain": [ - " image author content_type \\\n", - "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", - "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", - "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", - "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", - "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", - "\n", - " size updated \\\n", - "0 1591240 2025-03-20 17:45:04+00:00 \n", - "1 1182951 2025-03-20 17:45:02+00:00 \n", - "2 1520884 2025-03-20 17:44:55+00:00 \n", - "3 1235401 2025-03-20 17:45:19+00:00 \n", - "4 1591923 2025-03-20 17:44:47+00:00 \n", - "\n", - " blurred \\\n", - "0 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", - "1 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", - "2 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", - "3 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", - "4 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", - "\n", - " resized \\\n", - "0 {'uri': 'gs://bigframes_blob_test/image_resize... \n", - "1 {'uri': 'gs://bigframes_blob_test/image_resize... \n", - "2 {'uri': 'gs://bigframes_blob_test/image_resize... \n", - "3 {'uri': 'gs://bigframes_blob_test/image_resize... \n", - "4 {'uri': 'gs://bigframes_blob_test/image_resize... \n", - "\n", - " normalized \\\n", - "0 {'uri': 'gs://bigframes_blob_test/image_normal... \n", - "1 {'uri': 'gs://bigframes_blob_test/image_normal... \n", - "2 {'uri': 'gs://bigframes_blob_test/image_normal... \n", - "3 {'uri': 'gs://bigframes_blob_test/image_normal... \n", - "4 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + " image \\\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... 
\n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", "\n", - " blur_resized \\\n", - "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + " blurred \n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", "\n", - " blurred_verbose \n", - "0 {'status': '', 'content': {'uri': 'gs://bigfra... \n", - "1 {'status': '', 'content': {'uri': 'gs://bigfra... \n", - "2 {'status': '', 'content': {'uri': 'gs://bigfra... \n", - "3 {'status': '', 'content': {'uri': 'gs://bigfra... \n", - "4 {'status': '', 'content': {'uri': 'gs://bigfra... 
\n", - "\n", - "[5 rows x 10 columns]" + "[5 rows x 2 columns]" ] }, - "execution_count": 11, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_image" + "# Construct the canonical connection ID\n", + "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", + "\n", + "@bpd.udf(\n", + " input_types=[str, str, int, int],\n", + " output_type=str,\n", + " dataset=DATASET_ID,\n", + " name=\"image_blur\",\n", + " bigquery_connection=FULL_CONNECTION_ID,\n", + " packages=[\"opencv-python\", \"numpy\", \"requests\"],\n", + ")\n", + "def image_blur(src_rt: str, dst_rt: str, kx: int, ky: int) -> str:\n", + " import json\n", + " import cv2 as cv\n", + " import numpy as np\n", + " import requests\n", + " import base64\n", + "\n", + " src_obj = json.loads(src_rt)\n", + " src_url = src_obj[\"access_urls\"][\"read_url\"]\n", + " \n", + " response = requests.get(src_url, timeout=30)\n", + " response.raise_for_status()\n", + " \n", + " img = cv.imdecode(np.frombuffer(response.content, np.uint8), cv.IMREAD_UNCHANGED)\n", + " if img is None:\n", + " raise ValueError(\"cv.imdecode failed\")\n", + " \n", + " kx, ky = int(kx), int(ky)\n", + " img_blurred = cv.blur(img, ksize=(kx, ky))\n", + " \n", + " success, encoded = cv.imencode(\".jpeg\", img_blurred)\n", + " if not success:\n", + " raise ValueError(\"cv.imencode failed\")\n", + " \n", + " # Handle two output modes\n", + " if dst_rt: # GCS/Series output mode\n", + " dst_obj = json.loads(dst_rt)\n", + " dst_url = dst_obj[\"access_urls\"][\"write_url\"]\n", + " \n", + " requests.put(dst_url, data=encoded.tobytes(), headers={\"Content-Type\": \"image/jpeg\"}, timeout=30).raise_for_status()\n", + " \n", + " uri = dst_obj[\"objectref\"][\"uri\"]\n", + " return uri\n", + " \n", + " else: # BigQuery bytes output mode \n", + " image_bytes = encoded.tobytes()\n", + " return base64.b64encode(image_bytes).decode()\n", + "\n", + "def apply_transformation(series, dst_folder, udf, 
*args, verbose=False):\n", + " import os\n", + " dst_folder = os.path.join(dst_folder, \"\")\n", + " # Fetch metadata to get the URI\n", + " metadata = bbq.obj.fetch_metadata(series)\n", + " current_uri = metadata.struct.field(\"uri\")\n", + " dst_uri = current_uri.str.replace(r\"^.*\\/(.*)$\", rf\"{dst_folder}\\1\", regex=True)\n", + " dst_blob = dst_uri.str.to_blob(connection=FULL_CONNECTION_ID)\n", + " df_transform = bpd.DataFrame({\n", + " \"src_rt\": get_runtime_json_str(series, mode=\"R\"),\n", + " \"dst_rt\": get_runtime_json_str(dst_blob, mode=\"RW\"),\n", + " })\n", + " res = df_transform[[\"src_rt\", \"dst_rt\"]].apply(\n", + " udf, axis=1, args=args\n", + " )\n", + " return res if verbose else res.str.to_blob(connection=FULL_CONNECTION_ID)\n", + "\n", + "# Apply transformations\n", + "df_image[\"blurred\"] = apply_transformation(\n", + " df_image[\"image\"], f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\",\n", + " image_blur, 20, 20\n", + ")\n", + "df_image[[\"image\", \"blurred\"]]" ] }, { @@ -945,7 +651,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": { "id": "mRUGfcaFVW-3" }, @@ -954,7 +660,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:183: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", "default model will be removed in BigFrames 3.0. 
Please supply an\n", "explicit model to avoid this message.\n", " return method(*args, **kwargs)\n" @@ -968,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -982,22 +688,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. 
Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" ] }, { @@ -1028,69 +730,84 @@ " \n", " \n", " 0\n", - " The item is a tin of K9 Guard dog paw balm.\n", - " \n", + " The item is a container of K9 Guard Dog Paw Balm.\n", + " \n", " \n", " \n", " 1\n", " The item is K9 Guard Dog Hot Spot Spray.\n", - " \n", + " \n", + " \n", + " \n", + " 2\n", + " The image contains three bags of food, likely for small animals like rabbits or guinea pigs. They are labeled \"Timoth Hay Lend Variety Plend\", \"Herbal Greeıs Mix Variety Blend\", and \"Berry & Blossom Treat Blend\", all under the brand \"Fluffy Buns.\" The bags are yellow, green, and purple, respectively. Each bag has a pile of its contents beneath it.\n", + " \n", + " \n", + " \n", + " 3\n", + " The item is a cat tree.\\n\n", + " \n", + " \n", + " \n", + " 4\n", + " The item is a bag of bird seed. 
Specifically, it's labeled \"Chirpy Seed\", \"Deluxe Bird Food\".\\n\n", + " \n", " \n", " \n", "\n", - "

2 rows × 2 columns

\n", - "[2 rows x 2 columns in total]" + "

5 rows × 2 columns

\n", + "[5 rows x 2 columns in total]" ], "text/plain": [ - " ml_generate_text_llm_result \\\n", - "0 The item is a tin of K9 Guard dog paw balm. \n", - "1 The item is K9 Guard Dog Hot Spot Spray. \n", + " ml_generate_text_llm_result \\\n", + "0 The item is a container of K9 Guard Dog Paw Balm. \n", + "1 The item is K9 Guard Dog Hot Spot Spray. \n", + "2 The image contains three bags of food, likely ... \n", + "3 The item is a cat tree.\\n \n", + "4 The item is a bag of bird seed. Specifically, ... \n", "\n", " image \n", - "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", - "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", "\n", - "[2 rows x 2 columns]" + "[5 rows x 2 columns]" ] }, - "execution_count": 13, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Ask the same question on the images\n", - "df_image = df_image.head(2)\n", "answer = gemini.predict(df_image, prompt=[\"what item is it?\", df_image[\"image\"]])\n", "answer[[\"ml_generate_text_llm_result\", \"image\"]]" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": { "id": "IG3J3HsKhyBY" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - } - ], + 
"outputs": [], "source": [ "# Ask different questions\n", - "df_image[\"question\"] = [\"what item is it?\", \"what color is the picture?\"]" + "df_image[\"question\"] = [\n", + " \"what item is it?\",\n", + " \"what color is the picture?\",\n", + " \"what is the product name?\",\n", + " \"is it for pets?\",\n", + " \"what is the weight of the product?\",\n", + "]" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1104,22 +821,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future 
release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" ] }, { @@ -1150,32 +863,53 @@ " \n", " \n", " 0\n", - " The item is a tin of K9Guard Dog Paw Balm.\n", - " \n", + " The item is a container of Dog Paw Balm.\n", + " \n", " \n", " \n", " 1\n", - " The bottle is mostly white, with a light blue accents. The background is a light gray. There are also black and green elements on the bottle's label.\n", - " \n", + " The picture contains many colors, including white, black, green, and a bright blue. The product label predominantly features a bright blue hue. 
The background is a solid gray.\n", + " \n", + " \n", + " \n", + " 2\n", + " Here are the product names from the image:\\n\\n* **Timoth Hay Lend Variety Plend** is the product in the yellow bag.\\n* **Herbal Greeıs Mix Variety Blend** is the product in the green bag.\\n* **Berry & Blossom Treat Blend** is the product in the purple bag.\n", + " \n", + " \n", + " \n", + " 3\n", + " Yes, it is for pets. It appears to be a cat tree or scratching post.\\n\n", + " \n", + " \n", + " \n", + " 4\n", + " The image shows that the weight of the product is 15 oz/ 257g.\n", + " \n", " \n", " \n", "\n", - "

2 rows × 2 columns

\n", - "[2 rows x 2 columns in total]" + "

5 rows × 2 columns

\n", + "[5 rows x 2 columns in total]" ], "text/plain": [ " ml_generate_text_llm_result \\\n", - "0 The item is a tin of K9Guard Dog Paw Balm. \n", - "1 The bottle is mostly white, with a light blue ... \n", + "0 The item is a container of Dog Paw Balm. \n", + "1 The picture contains many colors, including wh... \n", + "2 Here are the product names from the image:\\n\\n... \n", + "3 Yes, it is for pets. It appears to be a cat tr... \n", + "4 The image shows that the weight of the product... \n", "\n", " image \n", - "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", - "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", "\n", - "[2 rows x 2 columns]" + "[5 rows x 2 columns]" ] }, - "execution_count": 15, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1187,7 +921,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1201,19 +935,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:183: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", "default model will be removed in BigFrames 3.0. 
Please supply an\n", "explicit model to avoid this message.\n", " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. 
Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" @@ -1250,46 +982,82 @@ " \n", " \n", " 0\n", - " [ 0.00638842 0.01666344 0.00451782 ... -0.02...\n", + " [ 0.00638822 0.01666385 0.00451817 ... -0.02...\n", " \n", " <NA>\n", " <NA>\n", - " {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2...\n", + " {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...\n", " \n", " \n", " 1\n", - " [ 0.00973689 0.02148374 0.00244311 ... 0.00...\n", + " [ 0.00973976 0.02148137 0.0024429 ... 0.00...\n", + " \n", + " <NA>\n", + " <NA>\n", + " {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...\n", + " \n", + " \n", + " 2\n", + " [ 0.01195884 0.02139394 0.05968047 ... -0.01...\n", + " \n", + " <NA>\n", + " <NA>\n", + " {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...\n", + " \n", + " \n", + " 3\n", + " [-0.02621161 0.02797648 0.04416926 ... -0.01...\n", + " \n", + " <NA>\n", + " <NA>\n", + " {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...\n", + " \n", + " \n", + " 4\n", + " [ 0.05918628 0.0125137 0.01907336 ... 0.01...\n", " \n", " <NA>\n", " <NA>\n", - " {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2...\n", + " {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...\n", " \n", " \n", "\n", - "

2 rows × 5 columns

\n", - "[2 rows x 5 columns in total]" + "

5 rows × 5 columns

\n", + "[5 rows x 5 columns in total]" ], "text/plain": [ " ml_generate_embedding_result \\\n", - "0 [ 0.00638842 0.01666344 0.00451782 ... -0.02... \n", - "1 [ 0.00973689 0.02148374 0.00244311 ... 0.00... \n", + "0 [ 0.00638822 0.01666385 0.00451817 ... -0.02... \n", + "1 [ 0.00973976 0.02148137 0.0024429 ... 0.00... \n", + "2 [ 0.01195884 0.02139394 0.05968047 ... -0.01... \n", + "3 [-0.02621161 0.02797648 0.04416926 ... -0.01... \n", + "4 [ 0.05918628 0.0125137 0.01907336 ... 0.01... \n", "\n", " ml_generate_embedding_status ml_generate_embedding_start_sec \\\n", "0 \n", "1 \n", + "2 \n", + "3 \n", + "4 \n", "\n", " ml_generate_embedding_end_sec \\\n", "0 \n", "1 \n", + "2 \n", + "3 \n", + "4 \n", "\n", " content \n", - "0 {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2... \n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", + "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... 
\n", "\n", - "[2 rows x 5 columns]" + "[5 rows x 5 columns]" ] }, - "execution_count": 16, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1314,9 +1082,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:151: PreviewWarning: udf is in preview.\n", + " return global_session.with_default_session(\n" + ] + } + ], "source": [ "# Construct the canonical connection ID\n", "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", @@ -1334,12 +1111,9 @@ " import json\n", " from pypdf import PdfReader\n", " import requests\n", - " from requests import adapters\n", - " session = requests.Session()\n", - " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", - " response = session.get(src_url, timeout=30, stream=True)\n", + " response = requests.get(src_url, timeout=30, stream=True)\n", " response.raise_for_status()\n", " pdf_bytes = response.content\n", " pdf_file = io.BytesIO(pdf_bytes)\n", @@ -1364,12 +1138,9 @@ " import json\n", " from pypdf import PdfReader\n", " import requests\n", - " from requests import adapters\n", - " session = requests.Session()\n", - " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", - " response = session.get(src_url, timeout=30, stream=True)\n", + " response = requests.get(src_url, timeout=30, stream=True)\n", " response.raise_for_status()\n", " pdf_bytes = response.content\n", " pdf_file = io.BytesIO(pdf_bytes)\n", @@ -1395,9 +1166,60 @@ }, { "cell_type": "code", - "execution_count": 
null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extracted_textchunked
0CritterCuisine Pro 5000 - Automatic Pet Feeder...[\"CritterCuisine Pro 5000 - Automatic Pet Feed...
\n", + "

1 rows × 2 columns

\n", + "
[1 rows x 2 columns in total]" + ], + "text/plain": [ + " extracted_text \\\n", + "0 CritterCuisine Pro 5000 - Automatic Pet Feeder... \n", + "\n", + " chunked \n", + "0 [\"CritterCuisine Pro 5000 - Automatic Pet Feed... \n", + "\n", + "[1 rows x 2 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")\n", "\n", @@ -1415,9 +1237,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
0    CritterCuisine Pro 5000 - Automatic Pet Feeder...\n",
+              "0    on a level, stable surface to prevent tipping....\n",
+              "0    included)\\nto maintain the schedule during pow...\n",
+              "0    digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n",
+              "0    paperclip) for 5\\nseconds. This will reset all...\n",
+              "0    unit with a damp cloth. Do not immerse the bas...\n",
+              "0    continues,\\ncontact customer support.\\nE2: Foo...
" + ], + "text/plain": [ + "0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n", + "0 on a level, stable surface to prevent tipping....\n", + "0 included)\\nto maintain the schedule during pow...\n", + "0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n", + "0 paperclip) for 5\\nseconds. This will reset all...\n", + "0 unit with a damp cloth. Do not immerse the bas...\n", + "0 continues,\\ncontact customer support.\\nE2: Foo...\n", + "Name: chunked, dtype: string" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Explode the chunks to see each chunk as a separate row\n", "chunked = df_pdf[\"chunked\"].explode()\n", @@ -1433,7 +1282,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -1443,25 +1292,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" ] + }, + { + "data": { + "text/html": [ + "
0    Now, as all books, not primarily intended as p...
" + ], + "text/plain": [ + "0 Now, as all books, not primarily intended as p...\n", + "Name: transcribed_content, dtype: string" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "# The audio_transcribe function is a convenience wrapper around bigframes.bigquery.ai.generate.\n", "# Here's how to perform the same operation directly:\n", "\n", - "audio_series = df['audio']\n", + "audio_series = df[\"audio\"]\n", "prompt_text = (\n", " \"**Task:** Transcribe the provided audio. **Instructions:** - Your response \"\n", " \"must contain only the verbatim transcription of the audio. - Do not include \"\n", @@ -1486,7 +1349,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1499,7 +1362,7 @@ "Name: transcription_results, dtype: struct[pyarrow]" ] }, - "execution_count": 12, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1536,9 +1399,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:151: PreviewWarning: udf is in preview.\n", + " return global_session.with_default_session(\n" + ] + } + ], "source": [ "# Construct the canonical connection ID\n", "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", @@ -1559,12 +1431,9 @@ " import json\n", " from PIL import ExifTags, Image\n", " import requests\n", - " from requests import adapters\n", - " session = requests.Session()\n", - " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", - " response = session.get(src_url, timeout=30)\n", + " response = requests.get(src_url, 
timeout=30)\n", " bts = response.content\n", " image = Image.open(io.BytesIO(bts))\n", " exif_data = image.getexif()\n", @@ -1578,9 +1447,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/utils.py:228: PreviewWarning: The JSON-related API `parse_json` is in preview. Its behavior may\n", + "change in future versions.\n", + " warnings.warn(bfe.format_message(msg), category=bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
0    {\"ExifOffset\":47,\"Make\":\"MyCamera\"}
" + ], + "text/plain": [ + "0 {\"ExifOffset\":47,\"Make\":\"MyCamera\"}\n", + "Name: blob_col, dtype: extension>[pyarrow]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Create a Multimodal DataFrame from the sample image URIs\n", "exif_image_df = bpd.from_glob_path(\n", @@ -1608,7 +1501,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "venv", + "display_name": "venv (3.13.0)", "language": "python", "name": "python3" }, @@ -1622,7 +1515,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.13.0" } }, "nbformat": 4, From a2f2b65e2651d858e87337e8fe83abd4c2895303 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 23 Feb 2026 10:45:45 -0800 Subject: [PATCH 13/29] chore: deprecate Claude 3.5 in the codebase (#2472) Fixes b/486207478 --- bigframes/ml/llm.py | 2 +- .../remote_function_vertex_claude_model.ipynb | 158 +++++++----------- tests/system/load/test_llm.py | 44 +---- 3 files changed, 72 insertions(+), 132 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 585599c9b6c..68842961e3f 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -877,7 +877,7 @@ class Claude3TextGenerator(base.RetriableRemotePredictor): The model for natural language tasks. Possible values are "claude-3-sonnet", "claude-3-haiku", "claude-3-5-sonnet" and "claude-3-opus". "claude-3-sonnet" (deprecated) is Anthropic's dependable combination of skills and speed. It is engineered to be dependable for scaled AI deployments across a variety of use cases. "claude-3-haiku" is Anthropic's fastest, most compact vision and text model for near-instant responses to simple queries, meant for seamless AI experiences mimicking human interactions. - "claude-3-5-sonnet" is Anthropic's most powerful AI model and maintains the speed and cost of Claude 3 Sonnet, which is a mid-tier model. 
+ "claude-3-5-sonnet" (deprecated) is Anthropic's most powerful AI model and maintains the speed and cost of Claude 3 Sonnet, which is a mid-tier model. "claude-3-opus" (deprecated) is Anthropic's second-most powerful AI model, with strong performance on highly complex tasks. https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude#available-claude-models If no setting is provided, "claude-3-sonnet" will be used by default diff --git a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb index 9792c90205c..017ed596a2f 100644 --- a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb +++ b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb @@ -28,20 +28,6 @@ "" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "# Python 3.13 is not yet a supported runtime for remote functions.\n", - "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", - "if sys.version_info >= (3, 13, 0):\n", - " sys.exit(0)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -155,30 +141,6 @@ "execution_count": 4, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job c4c27713-51c8-4293-8454-5c904df79318 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0b1b71d8-8546-45f2-b403-707161fe4002 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -267,13 +229,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 488a116f-44b2-4ff7-9f95-bd36473dab0f is DONE. 0 Bytes processed. 
Open Job" + "\n", + " Query processed 0 Bytes in a moment of slot time. [Job bigframes-dev:us-east5.9bc70627-6891-44a4-b7d7-8a28e213cdec details]\n", + " " ], "text/plain": [ "" @@ -301,7 +265,7 @@ " \"content\": message,\n", " }\n", " ],\n", - " model=\"claude-3-5-sonnet@20240620\",\n", + " model=\"claude-3-haiku@20240307\",\n", " )\n", " content_text = message.content[0].text if message.content else \"\"\n", " return content_text" @@ -309,16 +273,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'bigframes-dev._b52b272a35b88e236e1f96fbe3f560c83a8fee85.bigframes_session265649_de1176dd4c57f40ba959503af3981682'" + "'bigframes-dev._e9a5162ae4daa9f50fda3f95febaa9781131f3b8.bigframes_sessionc10c73_49262141176cbf70037559ae84e834d3'" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -330,16 +294,16 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'projects/bigframes-dev/locations/us-east5/functions/bigframes-session265649-de1176dd4c57f40ba959503af3981682'" + "'projects/bigframes-dev/locations/us-east5/functions/bigframes-sessionc10c73-49262141176cbf70037559ae84e834d3'" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -351,49 +315,40 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 58b230a8-6536-4bac-ab02-dcf574692dd6 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 46d6a1e9-426a-4615-8eb5-98d34d08ec07 is DONE. 1.3 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job cf8fcbaa-b233-47cd-b4e3-60876b24879f is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 2620a544-d1df-4b30-bec4-4221e79ddf0f is DONE. 1.4 kB processed. Open Job" + "\n", + " Query started with request ID bigframes-dev:us-east5.821579f4-63ea-4072-a3ce-318e43768432.
SQL
SELECT\n",
+       "`bfuid_col_3` AS `bfuid_col_3`,\n",
+       "`bfuid_col_4` AS `bfuid_col_4`,\n",
+       "`bfuid_col_5` AS `bfuid_col_5`\n",
+       "FROM\n",
+       "(SELECT\n",
+       "  `t1`.`bfuid_col_3`,\n",
+       "  `t1`.`bfuid_col_4`,\n",
+       "  `t1`.`bfuid_col_5`,\n",
+       "  `t1`.`bfuid_col_6` AS `bfuid_col_7`\n",
+       "FROM (\n",
+       "  SELECT\n",
+       "    `t0`.`level_0`,\n",
+       "    `t0`.`column_0`,\n",
+       "    `t0`.`bfuid_col_6`,\n",
+       "    `t0`.`level_0` AS `bfuid_col_3`,\n",
+       "    `t0`.`column_0` AS `bfuid_col_4`,\n",
+       "    `bigframes-dev._e9a5162ae4daa9f50fda3f95febaa9781131f3b8.bigframes_sessionc10c73_49262141176cbf70037559ae84e834d3`(`t0`.`column_0`) AS `bfuid_col_5`\n",
+       "  FROM (\n",
+       "    SELECT\n",
+       "      *\n",
+       "    FROM UNNEST(ARRAY<STRUCT<`level_0` INT64, `column_0` STRING, `bfuid_col_6` INT64>>[STRUCT(0, 'What is the capital of France?', 0), STRUCT(1, 'Explain the concept of photosynthesis in simple terms.', 1), STRUCT(2, 'Write a haiku about artificial intelligence.', 2)]) AS `level_0`\n",
+       "  ) AS `t0`\n",
+       ") AS `t1`)\n",
+       "ORDER BY `bfuid_col_7` ASC NULLS LAST\n",
+       "LIMIT 10
\n", + " " ], "text/plain": [ "" @@ -436,13 +391,12 @@ " \n", " 1\n", " Explain the concept of photosynthesis in simpl...\n", - " Photosynthesis is the process plants use to ma...\n", + " Photosynthesis is the process by which plants ...\n", " \n", " \n", " 2\n", " Write a haiku about artificial intelligence.\n", - " Here's a haiku about artificial intelligence:\n", - "...\n", + " Here is a haiku about artificial intelligence:...\n", " \n", " \n", "\n", @@ -457,14 +411,13 @@ "\n", " answers \n", "0 The capital of France is Paris. \n", - "1 Photosynthesis is the process plants use to ma... \n", - "2 Here's a haiku about artificial intelligence:\n", - "... \n", + "1 Photosynthesis is the process by which plants ... \n", + "2 Here is a haiku about artificial intelligence:... \n", "\n", "[3 rows x 2 columns]" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -484,9 +437,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Session sessionc10c73 closed." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "bpd.close_session()" ] @@ -494,7 +460,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv", + "display_name": "venv (3.14.2)", "language": "python", "name": "python3" }, @@ -508,7 +474,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.14.2" } }, "nbformat": 4, diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 25cde92c133..22a6f0bfd2e 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -98,18 +98,10 @@ def test_llm_gemini_w_ground_with_google_search(llm_remote_text_df): # (b/366290533): Claude models are of extremely low capacity. The tests should reside in small tests. 
Moving these here just to protect BQML's shared capacity(as load test only runs once per day.) and make sure we still have minimum coverage. -@pytest.mark.parametrize( - "model_name", - ("claude-3-haiku", "claude-3-5-sonnet"), -) @pytest.mark.flaky(retries=3, delay=120) -def test_claude3_text_generator_create_load( - dataset_id, model_name, session, session_us_east5, bq_connection -): - if model_name in ("claude-3-5-sonnet",): - session = session_us_east5 +def test_claude3_text_generator_create_load(dataset_id, session, bq_connection): claude3_text_generator_model = llm.Claude3TextGenerator( - model_name=model_name, connection_name=bq_connection, session=session + model_name="claude-3-haiku", connection_name=bq_connection, session=session ) assert claude3_text_generator_model is not None assert claude3_text_generator_model._bqml_model is not None @@ -120,21 +112,15 @@ def test_claude3_text_generator_create_load( ) assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name assert reloaded_model.connection_name == bq_connection - assert reloaded_model.model_name == model_name + assert reloaded_model.model_name == "claude-3-haiku" -@pytest.mark.parametrize( - "model_name", - ("claude-3-haiku", "claude-3-5-sonnet"), -) @pytest.mark.flaky(retries=3, delay=120) def test_claude3_text_generator_predict_default_params_success( - llm_text_df, model_name, session, session_us_east5, bq_connection + llm_text_df, session, bq_connection ): - if model_name in ("claude-3-5-sonnet",): - session = session_us_east5 claude3_text_generator_model = llm.Claude3TextGenerator( - model_name=model_name, connection_name=bq_connection, session=session + model_name="claude-3-haiku", connection_name=bq_connection, session=session ) df = claude3_text_generator_model.predict(llm_text_df).to_pandas() utils.check_pandas_df_schema_and_index( @@ -142,18 +128,12 @@ def test_claude3_text_generator_predict_default_params_success( ) -@pytest.mark.parametrize( - "model_name", - 
("claude-3-haiku", "claude-3-5-sonnet"), -) @pytest.mark.flaky(retries=3, delay=120) def test_claude3_text_generator_predict_with_params_success( - llm_text_df, model_name, session, session_us_east5, bq_connection + llm_text_df, session, bq_connection ): - if model_name in ("claude-3-5-sonnet",): - session = session_us_east5 claude3_text_generator_model = llm.Claude3TextGenerator( - model_name=model_name, connection_name=bq_connection, session=session + model_name="claude-3-haiku", connection_name=bq_connection, session=session ) df = claude3_text_generator_model.predict( llm_text_df, max_output_tokens=100, top_k=20, top_p=0.5 @@ -163,20 +143,14 @@ def test_claude3_text_generator_predict_with_params_success( ) -@pytest.mark.parametrize( - "model_name", - ("claude-3-haiku", "claude-3-5-sonnet"), -) @pytest.mark.flaky(retries=3, delay=120) def test_claude3_text_generator_predict_multi_col_success( - llm_text_df, model_name, session, session_us_east5, bq_connection + llm_text_df, session, bq_connection ): - if model_name in ("claude-3-5-sonnet",): - session = session_us_east5 llm_text_df["additional_col"] = 1 claude3_text_generator_model = llm.Claude3TextGenerator( - model_name=model_name, connection_name=bq_connection, session=session + model_name="claude-3-haiku", connection_name=bq_connection, session=session ) df = claude3_text_generator_model.predict(llm_text_df).to_pandas() utils.check_pandas_df_schema_and_index( From 867951bcabcff12e2fce88143b45d929d3237088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 24 Feb 2026 12:27:02 -0600 Subject: [PATCH 14/29] docs: add code sample and docstring for bpd.options.experiments.sql_compiler (#2474) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added documentation for the sql_compiler property. Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/_config/experiment_options.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index 811d6b8bd45..6c51ef6db39 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -72,6 +72,13 @@ def ai_operators(self, value: bool): @property def sql_compiler(self) -> Literal["legacy", "stable", "experimental"]: + """Set to 'experimental' to try out the latest in compilation experiments.. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.experiments.sql_compiler = 'experimental' # doctest: +SKIP + """ return self._sql_compiler @sql_compiler.setter From 9e4da68e15a27a120c5585c62dc664fb4798c00b Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 24 Feb 2026 12:37:03 -0800 Subject: [PATCH 15/29] test: Fix prerelease and pandas 3.0 test compat (#2457) --- .../ibis_compiler/scalar_op_registry.py | 6 +- bigframes/core/compile/polars/lowering.py | 50 ++- .../compile/sqlglot/expressions/common.py | 33 ++ .../sqlglot/expressions/numeric_ops.py | 7 +- bigframes/core/rewrite/timedeltas.py | 8 +- bigframes/core/utils.py | 4 +- bigframes/dataframe.py | 2 +- bigframes/ml/metrics/_metrics.py | 8 +- bigframes/series.py | 2 +- bigframes/session/polars_executor.py | 2 + bigframes/testing/__init__.py | 7 + bigframes/testing/utils.py | 44 ++- noxfile.py | 92 +---- .../large/functions/test_remote_function.py | 42 +- tests/system/small/bigquery/test_array.py | 6 +- tests/system/small/bigquery/test_datetime.py | 19 +- tests/system/small/bigquery/test_geo.py | 25 +- tests/system/small/bigquery/test_sql.py | 30 +- tests/system/small/bigquery/test_struct.py | 5 +- tests/system/small/core/test_reshape.py | 4 +- .../system/small/engines/test_numeric_ops.py | 41 ++ tests/system/small/ml/test_metrics.py | 61 +-- tests/system/small/ml/test_utils.py | 10 +- tests/system/small/operations/test_dates.py | 12 +- .../system/small/operations/test_datetimes.py | 40 +- .../small/operations/test_timedeltas.py | 364 ++++++++++-------- tests/system/small/test_dataframe.py | 354 +++++++++-------- tests/system/small/test_dataframe_io.py | 64 +-- tests/system/small/test_groupby.py | 125 +++--- tests/system/small/test_multiindex.py | 141 +++---- tests/system/small/test_numpy.py | 14 +- tests/system/small/test_pandas.py | 127 +++--- tests/system/small/test_series.py | 335 ++++++++-------- tests/system/small/test_session.py | 123 +++--- 
tests/system/small/test_unordered.py | 6 +- tests/system/small/test_window.py | 49 ++- .../test_div_timedelta/out.sql | 6 +- .../test_mul_timedelta/out.sql | 12 +- tests/unit/core/test_groupby.py | 26 +- tests/unit/test_dataframe_polars.py | 7 +- .../ibis/expr/operations/numeric.py | 2 - 41 files changed, 1304 insertions(+), 1011 deletions(-) create mode 100644 bigframes/core/compile/sqlglot/expressions/common.py diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 4df99b1e528..d89c239cf4d 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -169,6 +169,8 @@ def arctanh_op_impl(x: ibis_types.Value): @scalar_op_compiler.register_unary_op(ops.floor_op) def floor_op_impl(x: ibis_types.Value): x_numeric = typing.cast(ibis_types.NumericValue, x) + if x_numeric.type().is_boolean(): + return x_numeric.cast(ibis_dtypes.Int64()).cast(ibis_dtypes.Float64()) if x_numeric.type().is_integer(): return x_numeric.cast(ibis_dtypes.Float64()) if x_numeric.type().is_floating(): @@ -181,6 +183,8 @@ def floor_op_impl(x: ibis_types.Value): @scalar_op_compiler.register_unary_op(ops.ceil_op) def ceil_op_impl(x: ibis_types.Value): x_numeric = typing.cast(ibis_types.NumericValue, x) + if x_numeric.type().is_boolean(): + return x_numeric.cast(ibis_dtypes.Int64()).cast(ibis_dtypes.Float64()) if x_numeric.type().is_integer(): return x_numeric.cast(ibis_dtypes.Float64()) if x_numeric.type().is_floating(): @@ -1026,7 +1030,7 @@ def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): @scalar_op_compiler.register_unary_op(ops.timedelta_floor_op) def timedelta_floor_op_impl(x: ibis_types.NumericValue): - return x.floor() + return ibis_api.case().when(x > ibis.literal(0), x.floor()).else_(x.ceil()).end() @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) diff --git 
a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py index bf617d6879f..cddf20e05de 100644 --- a/bigframes/core/compile/polars/lowering.py +++ b/bigframes/core/compile/polars/lowering.py @@ -174,12 +174,10 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: divisor.output_type ): # exact same as floordiv impl for timedelta - numeric_result = ops.floordiv_op.as_expr( + numeric_result = ops.div_op.as_expr( ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend), divisor ) - int_result = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(numeric_result) - return ops.AsTypeOp(to_type=dtypes.TIMEDELTA_DTYPE).as_expr(int_result) - + return _numeric_to_timedelta(numeric_result) if ( dividend.output_type == dtypes.BOOL_DTYPE and divisor.output_type == dtypes.BOOL_DTYPE @@ -226,11 +224,10 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: divisor.output_type ): # this is pretty fragile as zero will break it, and must fit back into int - numeric_result = expr.op.as_expr( + numeric_result = ops.div_op.as_expr( ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend), divisor ) - int_result = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(numeric_result) - return ops.AsTypeOp(to_type=dtypes.TIMEDELTA_DTYPE).as_expr(int_result) + return _numeric_to_timedelta(numeric_result) if dividend.output_type == dtypes.BOOL_DTYPE: dividend = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend) @@ -319,6 +316,32 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: return expr +class LowerCeilOp(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.CeilOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, numeric_ops.CeilOp) + arg = expr.children[0] + if arg.output_type in (dtypes.INT_DTYPE, dtypes.BOOL_DTYPE): + return expr.op.as_expr(ops.AsTypeOp(dtypes.FLOAT_DTYPE).as_expr(arg)) + 
return expr + + +class LowerFloorOp(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.FloorOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, numeric_ops.FloorOp) + arg = expr.children[0] + if arg.output_type in (dtypes.INT_DTYPE, dtypes.BOOL_DTYPE): + return expr.op.as_expr(ops.AsTypeOp(dtypes.FLOAT_DTYPE).as_expr(arg)) + return expr + + class LowerIsinOp(op_lowering.OpLoweringRule): @property def op(self) -> type[ops.ScalarOp]: @@ -465,8 +488,21 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): LowerInvertOp(), LowerIsinOp(), LowerLenOp(), + LowerCeilOp(), + LowerFloorOp(), ) def lower_ops_to_polars(root: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: return op_lowering.lower_ops(root, rules=POLARS_LOWERING_RULES) + + +def _numeric_to_timedelta(expr: expression.Expression) -> expression.Expression: + """rounding logic used for emulating timedelta ops""" + rounded_value = ops.where_op.as_expr( + ops.floor_op.as_expr(expr), + ops.gt_op.as_expr(expr, expression.const(0)), + ops.ceil_op.as_expr(expr), + ) + int_value = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rounded_value) + return ops.AsTypeOp(to_type=dtypes.TIMEDELTA_DTYPE).as_expr(int_value) diff --git a/bigframes/core/compile/sqlglot/expressions/common.py b/bigframes/core/compile/sqlglot/expressions/common.py new file mode 100644 index 00000000000..067ca070edf --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/common.py @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import bigframes_vendored.sqlglot.expressions as sge + + +def round_towards_zero(expr: sge.Expression): + """ + Round a float value to to an integer, always rounding towards zero. + + This is used to handle duration/timedelta emulation mostly. + """ + return sge.Cast( + this=sge.If( + this=sge.GT(this=expr, expression=sge.convert(0)), + true=sge.Floor(this=expr), + false=sge.Ceil(this=expr), + ), + to="INT64", + ) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index 2285a3a0bc5..d70ec2ef3f9 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -20,6 +20,7 @@ from bigframes import dtypes from bigframes import operations as ops import bigframes.core.compile.sqlglot.expression_compiler as expression_compiler +from bigframes.core.compile.sqlglot.expressions.common import round_towards_zero import bigframes.core.compile.sqlglot.expressions.constants as constants from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr from bigframes.operations import numeric_ops @@ -467,7 +468,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: result = sge.func("IEEE_DIVIDE", left_expr, right_expr) if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return sge.Cast(this=sge.Floor(this=result), to="INT64") + return round_towards_zero(result) else: return result @@ -510,7 +511,7 @@ def _(left: TypedExpr, right: 
TypedExpr) -> sge.Expression: ) if dtypes.is_numeric(right.dtype) and left.dtype == dtypes.TIMEDELTA_DTYPE: - result = sge.Cast(this=sge.Floor(this=result), to="INT64") + result = round_towards_zero(sge.func("IEEE_DIVIDE", left_expr, right_expr)) return result @@ -578,7 +579,7 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: if (dtypes.is_numeric(left.dtype) and right.dtype == dtypes.TIMEDELTA_DTYPE) or ( left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype) ): - return sge.Cast(this=sge.Floor(this=result), to="INT64") + return round_towards_zero(result) else: return result diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index 7190810f714..7544963732e 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -206,12 +206,12 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: - result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) - if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) + return _TypedExpr.create_op_expr( + ops.timedelta_floor_op, _TypedExpr.create_op_expr(ops.div_op, left, right) + ) - return result + return _TypedExpr.create_op_expr(ops.floordiv_op, left, right) def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index dd37a352a7c..74f5d05196c 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -113,13 +113,13 @@ def get_standardized_ids( """ col_ids = [ UNNAMED_COLUMN_ID - if col_label is None + if pd.isna(col_label) # type: ignore else label_to_identifier(col_label, strict=strict) for col_label in col_labels ] idx_ids = [ UNNAMED_INDEX_ID - if idx_label is None + if pd.isna(idx_label) # type: ignore else label_to_identifier(idx_label, 
strict=strict) for idx_label in idx_labels ] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2c734f2943e..25cedda8f4a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4398,7 +4398,7 @@ def to_excel( **kwargs, ) -> None: return self.to_pandas(allow_large_results=allow_large_results).to_excel( - excel_writer, sheet_name, **kwargs + excel_writer, sheet_name=sheet_name, **kwargs ) def to_latex( diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 8787a68c58b..6a080b8e04f 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -214,7 +214,7 @@ def confusion_matrix( y_true = row["y_true"] y_pred = row["y_pred"] count = row["dummy"] - confusion_matrix[y_pred][y_true] = count + confusion_matrix.at[y_true, y_pred] = count return confusion_matrix @@ -251,7 +251,7 @@ def recall_score( / is_accurate.groupby(y_true_series).count() ).to_pandas() - recall_score = pd.Series(0, index=index) + recall_score = pd.Series(0.0, index=index) for i in recall_score.index: recall_score.loc[i] = recall.loc[i] @@ -321,7 +321,7 @@ def _precision_score_per_label(y_true: bpd.Series, y_pred: bpd.Series) -> pd.Ser is_accurate.groupby(y_pred).sum() / is_accurate.groupby(y_pred).count() ).to_pandas() - precision_score = pd.Series(0, index=index) + precision_score = pd.Series(0.0, index=index) for i in precision.index: precision_score.loc[i] = precision.loc[i] @@ -366,7 +366,7 @@ def f1_score( recall = recall_score(y_true_series, y_pred_series, average=None) precision = precision_score(y_true_series, y_pred_series, average=None) - f1_score = pd.Series(0, index=recall.index) + f1_score = pd.Series(0.0, index=recall.index) for index in recall.index: if precision[index] + recall[index] != 0: f1_score[index] = ( diff --git a/bigframes/series.py b/bigframes/series.py index 299c39637d0..cd564e5c911 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2292,7 +2292,7 @@ def to_excel( self, 
excel_writer, sheet_name="Sheet1", *, allow_large_results=None, **kwargs ) -> None: return self.to_pandas(allow_large_results=allow_large_results).to_excel( - excel_writer, sheet_name, **kwargs + excel_writer, sheet_name=sheet_name, **kwargs ) def to_json( diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 575beff8fc0..a786d1b8a88 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -79,6 +79,8 @@ numeric_ops.SubOp, numeric_ops.MulOp, numeric_ops.DivOp, + numeric_ops.CeilOp, + numeric_ops.FloorOp, numeric_ops.FloorDivOp, numeric_ops.ModOp, generic_ops.AsTypeOp, diff --git a/bigframes/testing/__init__.py b/bigframes/testing/__init__.py index 529c08241d7..9c1fb7c283b 100644 --- a/bigframes/testing/__init__.py +++ b/bigframes/testing/__init__.py @@ -17,3 +17,10 @@ These modules are provided for testing the BigQuery DataFrames package. The interface is not considered stable. """ +from bigframes.testing.utils import ( + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) + +__all__ = ["assert_frame_equal", "assert_series_equal", "assert_index_equal"] diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index 6679f53b2ce..26a944d760a 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -15,7 +15,7 @@ import base64 import decimal import re -from typing import Iterable, Optional, Sequence, Set, Union +from typing import Iterable, Optional, Sequence, Set, TypeVar, Union import geopandas as gpd # type: ignore import google.api_core.operation @@ -29,7 +29,6 @@ from bigframes import operations as ops from bigframes.core import expression as ex -import bigframes.dtypes import bigframes.functions._utils as bff_utils import bigframes.pandas as bpd @@ -69,6 +68,8 @@ "content", ] +SeriesOrIndexT = TypeVar("SeriesOrIndexT", pd.Series, pd.Index) + def pandas_major_version() -> int: match = re.search(r"^v?(\d+)", pd.__version__.strip()) @@ 
-90,19 +91,30 @@ def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwar def _normalize_all_nulls(col: pd.Series) -> pd.Series: - if col.dtype in (bigframes.dtypes.FLOAT_DTYPE, bigframes.dtypes.INT_DTYPE): - col = col.astype("float64") - if pd_types.is_object_dtype(col): - col = col.fillna(float("nan")) + if pd_types.is_float_dtype(col.dtype): + col = col.astype("float64").astype("Float64") return col +def _normalize_index_nulls(idx: pd.Index) -> pd.Index: + if isinstance(idx, pd.MultiIndex): + new_levels = [ + _normalize_index_nulls(idx.get_level_values(i)) for i in range(idx.nlevels) + ] + return pd.MultiIndex.from_arrays(new_levels, names=idx.names) + if idx.hasnans: + if pd_types.is_float_dtype(idx.dtype): + idx = idx.astype("float64").astype("Float64") + return idx + + def assert_frame_equal( left: pd.DataFrame, right: pd.DataFrame, *, ignore_order: bool = False, nulls_are_nan: bool = True, + downcast_object: bool = True, **kwargs, ): if ignore_order: @@ -118,9 +130,17 @@ def assert_frame_equal( left = left.sort_index() right = right.sort_index() + # Pandas sometimes likes to produce object dtype columns + # However, nan/None/Null inconsistency makes comparison futile, convert to typed column + if downcast_object: + left = left.apply(lambda x: x.infer_objects()) + right = right.apply(lambda x: x.infer_objects()) + if nulls_are_nan: left = left.apply(_normalize_all_nulls) right = right.apply(_normalize_all_nulls) + left.index = _normalize_index_nulls(left.index) + right.index = _normalize_index_nulls(right.index) pd.testing.assert_frame_equal(left, right, **kwargs) @@ -151,12 +171,20 @@ def assert_series_equal( right.index = right.index.astype("Int64") if nulls_are_nan: - left = _normalize_all_nulls(left) - right = _normalize_all_nulls(right) + left = _normalize_all_nulls(left.infer_objects()) + right = _normalize_all_nulls(right.infer_objects()) + left.index = _normalize_index_nulls(left.index) + right.index = 
_normalize_index_nulls(right.index) + left.name = pd.NA if pd.isna(left.name) else left.name # type: ignore + right.name = pd.NA if pd.isna(right.name) else right.name # type: ignore pd.testing.assert_series_equal(left, right, **kwargs) +def assert_index_equal(left, right, **kwargs): + pd.testing.assert_index_equal(left, right, **kwargs) + + def _standardize_index(idx): return pd.Index(list(idx), name=idx.name) diff --git a/noxfile.py b/noxfile.py index a8a1a84987e..89e9c7684f2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -20,7 +20,6 @@ import multiprocessing import os import pathlib -import re import shutil import time from typing import Dict, List @@ -499,8 +498,8 @@ def cover(session): "report", "--show-missing", "--include=tests/system/small/*", - # TODO(b/353775058) resume coverage to 100 when the issue is fixed. - "--fail-under=99", + # Some tests only run under old pandas, some only under new pandas version + "--fail-under=98", ) session.run("coverage", "erase") @@ -588,99 +587,36 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( constraints_path = str( CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) - - # Ignore officially released versions of certain packages specified in - # testing/constraints-*.txt and install a more recent, pre-release versions - # directly - already_installed = set() + session.install( + *set(UNIT_TEST_STANDARD_DEPENDENCIES + SYSTEM_TEST_STANDARD_DEPENDENCIES), + "-c", + constraints_path, + "-e", + ".", + ) # PyArrow prerelease packages are published to an alternative PyPI host. 
# https://arrow.apache.org/docs/python/install.html#installing-nightly-packages session.install( + "--no-deps", + "--upgrade", "--extra-index-url", "https://pypi.fury.io/arrow-nightlies/", - "--prefer-binary", - "--pre", - "--upgrade", "pyarrow", - ) - already_installed.add("pyarrow") - - session.install( - "--prefer-binary", - "--pre", - "--upgrade", # We exclude each version individually so that we can continue to test # some prerelease packages. See: # https://github.com/googleapis/python-bigquery-dataframes/pull/268#discussion_r1423205172 # "pandas!=2.1.4, !=2.2.0rc0, !=2.2.0, !=2.2.1", "pandas", - ) - already_installed.add("pandas") - - # Try to avoid a cap on our SQLGlot so that bigframes - # can be integrated with SQLMesh. See: - # https://github.com/googleapis/python-bigquery-dataframes/issues/942 - # If SQLGlot introduces something that breaks us, lets file an issue - # upstream and/or make sure we fix bigframes to work with it. - session.install( - "--upgrade", - "git+https://github.com/tobymao/sqlglot.git#egg=sqlglot", - ) - already_installed.add("sqlglot") - - # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 - session.install("--no-deps", "db-dtypes") - already_installed.add("db-dtypes") - - # Ensure we catch breaking changes in the client libraries early. - session.install( - "--upgrade", + # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 + "db-dtypes", + # Ensure we catch breaking changes in the client libraries early. 
"git+https://github.com/googleapis/python-bigquery.git#egg=google-cloud-bigquery", - ) - already_installed.add("google-cloud-bigquery") - session.install( "--upgrade", "-e", "git+https://github.com/googleapis/google-cloud-python.git#egg=google-cloud-bigquery-storage&subdirectory=packages/google-cloud-bigquery-storage", - ) - already_installed.add("google-cloud-bigquery-storage") - session.install( - "--upgrade", "git+https://github.com/googleapis/python-bigquery-pandas.git#egg=pandas-gbq", ) - already_installed.add("pandas-gbq") - - session.install( - *set(UNIT_TEST_STANDARD_DEPENDENCIES + SYSTEM_TEST_STANDARD_DEPENDENCIES), - "-c", - constraints_path, - ) - - # Because we test minimum dependency versions on the minimum Python - # version, the first version we test with in the unit tests sessions has a - # constraints file containing all dependencies and extras. - with open( - CURRENT_DIRECTORY / "testing" / f"constraints-{DEFAULT_PYTHON_VERSION}.txt", - encoding="utf-8", - ) as constraints_file: - constraints_text = constraints_file.read() - - # Ignore leading whitespace and comment lines. - deps = [ - match.group(1) - for match in re.finditer( - r"^\s*(\S+)(?===\S+)", constraints_text, flags=re.MULTILINE - ) - if match.group(1) not in already_installed - ] - - print(already_installed) - - # We use --no-deps to ensure that pre-release versions aren't overwritten - # by the version ranges in setup.py. - session.install(*deps) - session.install("--no-deps", "-e", ".") # Print out prerelease package versions. 
session.run("python", "-m", "pip", "freeze") diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 2591c0c13a2..f8c4c472a9c 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -1261,14 +1261,14 @@ def test_remote_function_via_session_custom_sa(scalars_dfs): cloud_function_service_account=gcf_service_account, cloud_function_ingress_settings="all", ) - def square_num(x): + def double_num(x): if x is None: return x - return x * x + return x + x # assert that the GCF is created with the intended SA gcf = rf_session.cloudfunctionsclient.get_function( - name=square_num.bigframes_cloud_function + name=double_num.bigframes_cloud_function ) assert gcf.service_config.service_account_email == gcf_service_account @@ -1276,18 +1276,18 @@ def square_num(x): scalars_df, scalars_pandas_df = scalars_dfs bf_int64_col = scalars_df["int64_col"] - bf_result_col = bf_int64_col.apply(square_num) + bf_result_col = bf_int64_col.apply(double_num) bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() pd_int64_col = scalars_pandas_df["int64_col"] - pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x * x) + pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x + x) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_frame_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( - square_num, rf_session.bqclient, rf_session.cloudfunctionsclient + double_num, rf_session.bqclient, rf_session.cloudfunctionsclient ) @@ -1335,14 +1335,14 @@ def test_remote_function_via_session_custom_build_sa( cloud_build_service_account=set_build_service_account, cloud_function_ingress_settings="all", ) - def square_num(x): + def double_num(x): if x is None: return x - return x * x + return x + x # assert that the GCF 
is created with the intended SA gcf = rf_session.cloudfunctionsclient.get_function( - name=square_num.bigframes_cloud_function + name=double_num.bigframes_cloud_function ) assert gcf.build_config.service_account == expected_build_service_account @@ -1350,18 +1350,18 @@ def square_num(x): scalars_df, scalars_pandas_df = scalars_dfs bf_int64_col = scalars_df["int64_col"] - bf_result_col = bf_int64_col.apply(square_num) + bf_result_col = bf_int64_col.apply(double_num) bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() pd_int64_col = scalars_pandas_df["int64_col"] - pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x * x) + pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x + x) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_frame_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( - square_num, rf_session.bqclient, rf_session.cloudfunctionsclient + double_num, rf_session.bqclient, rf_session.cloudfunctionsclient ) @@ -1465,14 +1465,14 @@ def test_remote_function_via_session_vpc(scalars_dfs): try: - def square_num(x): + def double_num(x): if x is None: return x - return x * x + return x + x # TODO(shobs): See if the test vpc can be configured to make this flow # work with the default ingress setting (internal-only) - square_num_remote = rf_session.remote_function( + double_num_remote = rf_session.remote_function( input_types=[int], output_type=int, reuse=False, @@ -1480,13 +1480,13 @@ def square_num(x): cloud_function_vpc_connector=gcf_vpc_connector, cloud_function_vpc_connector_egress_settings="all", cloud_function_ingress_settings="all", - )(square_num) + )(double_num) gcf = rf_session.cloudfunctionsclient.get_function( - name=square_num_remote.bigframes_cloud_function + name=double_num_remote.bigframes_cloud_function ) - # assert that the GCF is created with the intended vpc connector and + # assert 
that the GCF is created with the intended vpc connector and # egress settings. assert gcf.service_config.vpc_connector == gcf_vpc_connector # The value is since we set @@ -1497,18 +1497,18 @@ def square_num(x): scalars_df, scalars_pandas_df = scalars_dfs bf_int64_col = scalars_df["int64_col"] - bf_result_col = bf_int64_col.apply(square_num_remote) + bf_result_col = bf_int64_col.apply(double_num_remote) bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() pd_int64_col = scalars_pandas_df["int64_col"] - pd_result_col = pd_int64_col.apply(square_num) + pd_result_col = pd_int64_col.apply(double_num).astype("Int64") pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_frame_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( - square_num_remote, rf_session.bqclient, rf_session.cloudfunctionsclient + double_num_remote, rf_session.bqclient, rf_session.cloudfunctionsclient ) diff --git a/tests/system/small/bigquery/test_array.py b/tests/system/small/bigquery/test_array.py index 2ceb90e22c8..2c2b2001ebe 100644 --- a/tests/system/small/bigquery/test_array.py +++ b/tests/system/small/bigquery/test_array.py @@ -67,7 +67,11 @@ ) def test_array_length(input_data, expected): series = bpd.Series(input_data) - expected = pd.Series(expected, dtype=bigframes.dtypes.INT_DTYPE) + expected = pd.Series( + expected, + index=pd.Index(range(len(input_data)), dtype="Int64"), + dtype=bigframes.dtypes.INT_DTYPE, + ) pd.testing.assert_series_equal( bbq.array_length(series).to_pandas(), expected, diff --git a/tests/system/small/bigquery/test_datetime.py b/tests/system/small/bigquery/test_datetime.py index dc68e7b892d..789ae47ae2e 100644 --- a/tests/system/small/bigquery/test_datetime.py +++ b/tests/system/small/bigquery/test_datetime.py @@ -19,6 +19,7 @@ import pytest from bigframes import bigquery +import bigframes.testing
_TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC")) @@ -40,7 +41,7 @@ def test_unix_seconds(scalars_dfs): .apply(lambda ts: _to_unix_epoch(ts, "s")) .astype("Int64") ) - pd.testing.assert_series_equal(actual_res, expected_res) + bigframes.testing.assert_series_equal(actual_res, expected_res) def test_unix_seconds_after_type_casting(int_series): @@ -53,7 +54,9 @@ def test_unix_seconds_after_type_casting(int_series): .apply(lambda ts: _to_unix_epoch(ts, "s")) .astype("Int64") ) - pd.testing.assert_series_equal(actual_res, expected_res, check_index_type=False) + bigframes.testing.assert_series_equal( + actual_res, expected_res, check_index_type=False + ) def test_unix_seconds_incorrect_input_type_raise_error(scalars_dfs): @@ -73,7 +76,7 @@ def test_unix_millis(scalars_dfs): .apply(lambda ts: _to_unix_epoch(ts, "ms")) .astype("Int64") ) - pd.testing.assert_series_equal(actual_res, expected_res) + bigframes.testing.assert_series_equal(actual_res, expected_res) def test_unix_millis_after_type_casting(int_series): @@ -86,7 +89,9 @@ def test_unix_millis_after_type_casting(int_series): .apply(lambda ts: _to_unix_epoch(ts, "ms")) .astype("Int64") ) - pd.testing.assert_series_equal(actual_res, expected_res, check_index_type=False) + bigframes.testing.assert_series_equal( + actual_res, expected_res, check_index_type=False + ) def test_unix_millis_incorrect_input_type_raise_error(scalars_dfs): @@ -106,7 +111,7 @@ def test_unix_micros(scalars_dfs): .apply(lambda ts: _to_unix_epoch(ts, "us")) .astype("Int64") ) - pd.testing.assert_series_equal(actual_res, expected_res) + bigframes.testing.assert_series_equal(actual_res, expected_res) def test_unix_micros_after_type_casting(int_series): @@ -119,7 +124,9 @@ def test_unix_micros_after_type_casting(int_series): .apply(lambda ts: _to_unix_epoch(ts, "us")) .astype("Int64") ) - pd.testing.assert_series_equal(actual_res, expected_res, check_index_type=False) + bigframes.testing.assert_series_equal( + actual_res, expected_res, 
check_index_type=False + ) def test_unix_micros_incorrect_input_type_raise_error(scalars_dfs): diff --git a/tests/system/small/bigquery/test_geo.py b/tests/system/small/bigquery/test_geo.py index 28db58c7112..24ecb7f6394 100644 --- a/tests/system/small/bigquery/test_geo.py +++ b/tests/system/small/bigquery/test_geo.py @@ -32,6 +32,7 @@ import bigframes.bigquery as bbq import bigframes.geopandas import bigframes.session +import bigframes.testing def test_geo_st_area(session: bigframes.session.Session): @@ -56,7 +57,7 @@ def test_geo_st_area(session: bigframes.session.Session): geobf_s_result = bbq.st_area(geobf_s).to_pandas().round(-3) assert geobf_s_result.iloc[0] >= 1000 - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, geopd_s_result, check_dtype=False, @@ -109,7 +110,7 @@ def test_st_length_various_geometries(session): # Test default use_spheroid result_default = st_length(geoseries).to_pandas() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( result_default, expected_lengths, rtol=1e-3, @@ -118,7 +119,7 @@ def test_st_length_various_geometries(session): # Test explicit use_spheroid=False result_explicit_false = st_length(geoseries, use_spheroid=False).to_pandas() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( result_explicit_false, expected_lengths, rtol=1e-3, @@ -152,7 +153,7 @@ def test_geo_st_difference_with_geometry_objects(session: bigframes.session.Sess index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, expected, check_index_type=False, @@ -191,7 +192,7 @@ def test_geo_st_difference_with_single_geometry_object( index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, expected, check_index_type=False, @@ -217,7 +218,7 @@ def 
test_geo_st_difference_with_similar_geometry_objects( index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, expected, check_index_type=False, @@ -273,7 +274,7 @@ def test_geo_st_distance_with_geometry_objects(session: bigframes.session.Sessio index=[0, 1, 2, 3], dtype="Float64", ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, expected, check_index_type=False, @@ -320,7 +321,7 @@ def test_geo_st_distance_with_single_geometry_object( ], dtype="Float64", ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, expected, check_index_type=False, @@ -355,7 +356,7 @@ def test_geo_st_intersection_with_geometry_objects(session: bigframes.session.Se index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, expected, check_index_type=False, @@ -394,7 +395,7 @@ def test_geo_st_intersection_with_single_geometry_object( index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, expected, check_index_type=False, @@ -424,7 +425,7 @@ def test_geo_st_intersection_with_similar_geometry_objects( index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( geobf_s_result, expected, check_index_type=False, @@ -465,7 +466,7 @@ def test_geo_st_isclosed(session: bigframes.session.Session): ] expected_series = pd.Series(data=expected_data, dtype="boolean") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, expected_series, # We default to Int64 (nullable) dtype, but pandas defaults to int64 index. 
diff --git a/tests/system/small/bigquery/test_sql.py b/tests/system/small/bigquery/test_sql.py index c519b427faf..fa43c249658 100644 --- a/tests/system/small/bigquery/test_sql.py +++ b/tests/system/small/bigquery/test_sql.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pandas as pd import pytest import bigframes.bigquery as bbq import bigframes.dtypes as dtypes import bigframes.pandas as bpd +import bigframes.testing def test_sql_scalar_for_all_scalar_types(scalars_df_null_index): @@ -59,8 +59,8 @@ def test_sql_scalar_for_bool_series(scalars_df_index): series: bpd.Series = scalars_df_index["bool_col"] result = bbq.sql_scalar("CAST({0} AS INT64)", [series]) expected = series.astype(dtypes.INT_DTYPE) - expected.name = None - pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + expected.name = result.name + bigframes.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) @pytest.mark.parametrize( @@ -83,8 +83,8 @@ def test_sql_scalar_outputs_all_scalar_types(scalars_df_index, column_name): series: bpd.Series = scalars_df_index[column_name] result = bbq.sql_scalar("{0}", [series]) expected = series - expected.name = None - pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + expected.name = result.name + bigframes.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) def test_sql_scalar_for_array_series(repeated_df): @@ -114,14 +114,14 @@ def test_sql_scalar_for_array_series(repeated_df): + repeated_df["numeric_list_col"].list.len() + repeated_df["string_list_col"].list.len() ) - pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + bigframes.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) def test_sql_scalar_outputs_array_series(repeated_df): result = bbq.sql_scalar("{0}", [repeated_df["int_list_col"]]) expected = repeated_df["int_list_col"] - expected.name = None - 
pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + expected.name = result.name + bigframes.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) def test_sql_scalar_for_struct_series(nested_structs_df): @@ -132,14 +132,14 @@ def test_sql_scalar_for_struct_series(nested_structs_df): expected = nested_structs_df["person"].struct.field( "name" ).str.len() + nested_structs_df["person"].struct.field("age") - pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + bigframes.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) def test_sql_scalar_outputs_struct_series(nested_structs_df): result = bbq.sql_scalar("{0}", [nested_structs_df["person"]]) expected = nested_structs_df["person"] - expected.name = None - pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + expected.name = result.name + bigframes.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) def test_sql_scalar_for_json_series(json_df): @@ -150,12 +150,12 @@ def test_sql_scalar_for_json_series(json_df): ], ) expected = bbq.json_value(json_df["json_col"], "$.int_value") - expected.name = None - pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + expected.name = result.name + bigframes.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) def test_sql_scalar_outputs_json_series(json_df): result = bbq.sql_scalar("{0}", [json_df["json_col"]]) expected = json_df["json_col"] - expected.name = None - pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + expected.name = result.name + bigframes.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) diff --git a/tests/system/small/bigquery/test_struct.py b/tests/system/small/bigquery/test_struct.py index 58c822f642f..5e51a5fce04 100644 --- a/tests/system/small/bigquery/test_struct.py +++ b/tests/system/small/bigquery/test_struct.py @@ -12,11 +12,11 @@ # See the License for the 
specific language governing permissions and # limitations under the License. -import pandas as pd import pytest import bigframes.bigquery as bbq import bigframes.series as series +import bigframes.testing @pytest.mark.parametrize( @@ -53,9 +53,10 @@ def test_struct_from_dataframe(columns_arg): srs = series.Series( columns_arg, ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( srs.to_pandas(), bbq.struct(srs.struct.explode()).to_pandas(), check_index_type=False, check_dtype=False, + check_names=False, # None vs nan version dependent ) diff --git a/tests/system/small/core/test_reshape.py b/tests/system/small/core/test_reshape.py index 0850bf50bb8..4d20ce887a7 100644 --- a/tests/system/small/core/test_reshape.py +++ b/tests/system/small/core/test_reshape.py @@ -13,11 +13,11 @@ # limitations under the License. import pandas as pd -import pandas.testing import pytest from bigframes import session from bigframes.core.reshape import merge +import bigframes.testing @pytest.mark.parametrize( @@ -56,7 +56,7 @@ def test_join_with_index( how=how, ) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) diff --git a/tests/system/small/engines/test_numeric_ops.py b/tests/system/small/engines/test_numeric_ops.py index ef0f8d9d0d8..2a14a649bd0 100644 --- a/tests/system/small/engines/test_numeric_ops.py +++ b/tests/system/small/engines/test_numeric_ops.py @@ -53,6 +53,47 @@ def apply_op_pairwise( return new_arr +def apply_op( + array: array_value.ArrayValue, op: ops.UnaryOp, excluded_cols=[] +) -> array_value.ArrayValue: + exprs = [] + labels = [] + for arg in array.column_ids: + if arg in excluded_cols: + continue + try: + _ = op.output_type(array.get_column_type(arg)) + expr = op.as_expr(arg) + exprs.append(expr) + labels.append(f"{arg}_{op.name}") + except TypeError: + continue + assert len(exprs) > 0 + new_arr, ids = array.compute_values(exprs) + new_arr = 
new_arr.rename_columns( + {new_col: label for new_col, label in zip(ids, labels)} + ) + return new_arr + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_ceil( + scalars_array_value: array_value.ArrayValue, + engine, +): + arr = apply_op(scalars_array_value, ops.ceil_op) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_floor( + scalars_array_value: array_value.ArrayValue, + engine, +): + arr = apply_op(scalars_array_value, ops.floor_op) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + @pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_project_add( scalars_array_value: array_value.ArrayValue, diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 040d4d97f64..a0f0f6b48cf 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -20,6 +20,7 @@ import bigframes from bigframes.ml import metrics +import bigframes.testing def test_r2_score_perfect_fit(session): @@ -161,7 +162,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): pd_tpr = tpr.to_pandas() pd_thresholds = thresholds.to_pandas() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( # skip testing the first value, as it is redundant and inconsistent across sklearn versions pd_thresholds[1:], pd.Series( @@ -171,7 +172,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): ), check_index=False, ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_fpr, pd.Series( [0.0, 0.0, 0.0, 0.25, 0.25, 0.5, 0.5, 0.75, 0.75, 0.75, 1.0], @@ -180,7 +181,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): ), check_index_type=False, ) - pd.testing.assert_series_equal( + 
bigframes.testing.assert_series_equal( pd_tpr, pd.Series( [ @@ -261,7 +262,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): pd_tpr = tpr.to_pandas() pd_thresholds = thresholds.to_pandas() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( # skip testing the first value, as it is redundant and inconsistent across sklearn versions pd_thresholds[1:], pd.Series( @@ -271,7 +272,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): ), check_index=False, ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_fpr, pd.Series( [0.0, 0.0, 1.0], @@ -280,7 +281,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): ), check_index_type=False, ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_tpr, pd.Series( [ @@ -353,7 +354,7 @@ def test_roc_curve_binary_classification_prediction_series(session): pd_tpr = tpr.to_pandas() pd_thresholds = thresholds.to_pandas() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( # skip testing the first value, as it is redundant and inconsistent across sklearn versions pd_thresholds[1:], pd.Series( @@ -363,7 +364,7 @@ def test_roc_curve_binary_classification_prediction_series(session): ), check_index=False, ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_fpr, pd.Series( [0.0, 0.0, 0.0, 0.25, 0.25, 0.5, 0.5, 0.75, 0.75, 0.75, 1.0], @@ -372,7 +373,7 @@ def test_roc_curve_binary_classification_prediction_series(session): ), check_index_type=False, ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_tpr, pd.Series( [ @@ -505,7 +506,7 @@ def test_confusion_matrix(session): 2: [0, 1, 2], } ).astype("int64") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( confusion_matrix, expected_pd_df, check_index_type=False ) @@ -523,7 +524,7 @@ def test_confusion_matrix_column_index(session): {1: 
[1, 0, 1, 0], 2: [0, 0, 2, 0], 3: [0, 0, 0, 0], 4: [0, 1, 0, 1]}, index=[1, 2, 3, 4], ).astype("int64") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( confusion_matrix, expected_pd_df, check_index_type=False ) @@ -542,7 +543,7 @@ def test_confusion_matrix_matches_sklearn(session): pd_df[["y_true"]], pd_df[["y_pred"]] ) expected_pd_df = pd.DataFrame(expected_confusion_matrix) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( confusion_matrix, expected_pd_df, check_index_type=False ) @@ -564,7 +565,7 @@ def test_confusion_matrix_str_matches_sklearn(session): expected_confusion_matrix, index=["ant", "bird", "cat"] ) expected_pd_df.columns = pd.Index(["ant", "bird", "cat"]) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( confusion_matrix, expected_pd_df, check_index_type=False ) @@ -585,7 +586,7 @@ def test_confusion_matrix_series(session): 2: [0, 1, 2], } ).astype("int64") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( confusion_matrix, expected_pd_df, check_index_type=False ) @@ -605,7 +606,9 @@ def test_recall_score(session): expected_index = [0, 1, 2] expected_recall = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal(recall, expected_recall, check_index_type=False) + bigframes.testing.assert_series_equal( + recall, expected_recall, check_index_type=False + ) def test_recall_score_matches_sklearn(session): @@ -623,7 +626,9 @@ def test_recall_score_matches_sklearn(session): ) expected_index = [0, 1, 2] expected_recall = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal(recall, expected_recall, check_index_type=False) + bigframes.testing.assert_series_equal( + recall, expected_recall, check_index_type=False + ) def test_recall_score_str_matches_sklearn(session): @@ -641,7 +646,9 @@ def test_recall_score_str_matches_sklearn(session): ) expected_index = ["ant", "bird", "cat"] expected_recall = 
pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal(recall, expected_recall, check_index_type=False) + bigframes.testing.assert_series_equal( + recall, expected_recall, check_index_type=False + ) def test_recall_score_series(session): @@ -657,7 +664,9 @@ def test_recall_score_series(session): expected_index = [0, 1, 2] expected_recall = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal(recall, expected_recall, check_index_type=False) + bigframes.testing.assert_series_equal( + recall, expected_recall, check_index_type=False + ) def test_precision_score(session): @@ -675,7 +684,7 @@ def test_precision_score(session): expected_index = [0, 1, 2] expected_precision = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( precision_score, expected_precision, check_index_type=False ) @@ -698,7 +707,7 @@ def test_precision_score_matches_sklearn(session): ) expected_index = [0, 1, 2] expected_precision = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( precision_score, expected_precision, check_index_type=False ) @@ -720,7 +729,7 @@ def test_precision_score_str_matches_sklearn(session): ) expected_index = ["ant", "bird", "cat"] expected_precision = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( precision_score, expected_precision, check_index_type=False ) @@ -738,7 +747,7 @@ def test_precision_score_series(session): expected_index = [0, 1, 2] expected_precision = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( precision_score, expected_precision, check_index_type=False ) @@ -823,7 +832,7 @@ def test_f1_score(session): expected_index = [0, 1, 2] expected_f1 = pd.Series(expected_values, index=expected_index) - 
pd.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) + bigframes.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) def test_f1_score_matches_sklearn(session): @@ -841,7 +850,7 @@ def test_f1_score_matches_sklearn(session): ) expected_index = [0, 1, 2] expected_f1 = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) + bigframes.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) def test_f1_score_str_matches_sklearn(session): @@ -859,7 +868,7 @@ def test_f1_score_str_matches_sklearn(session): ) expected_index = ["ant", "bird", "cat"] expected_f1 = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) + bigframes.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) def test_f1_score_series(session): @@ -875,7 +884,7 @@ def test_f1_score_series(session): expected_index = [0, 1, 2] expected_f1 = pd.Series(expected_values, index=expected_index) - pd.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) + bigframes.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) def test_mean_squared_error(session: bigframes.Session): diff --git a/tests/system/small/ml/test_utils.py b/tests/system/small/ml/test_utils.py index b3aa4ed59bc..8d757549005 100644 --- a/tests/system/small/ml/test_utils.py +++ b/tests/system/small/ml/test_utils.py @@ -13,10 +13,10 @@ # limitations under the License. 
import pandas as pd -import pandas.testing import pytest import bigframes.ml.utils as utils +import bigframes.testing _DATA_FRAME = pd.DataFrame({"column": [1, 2, 3]}) _SERIES = pd.Series([1, 2, 3], name="column") @@ -31,7 +31,7 @@ def test_convert_to_dataframe(session, data): (actual_result,) = utils.batch_convert_to_dataframe(bf_data) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( actual_result.to_pandas(), _DATA_FRAME, check_index_type=False, @@ -46,7 +46,7 @@ def test_convert_to_dataframe(session, data): def test_convert_pandas_to_dataframe(data, session): (actual_result,) = utils.batch_convert_to_dataframe(data, session=session) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( actual_result.to_pandas(), _DATA_FRAME, check_index_type=False, @@ -63,7 +63,7 @@ def test_convert_to_series(session, data): (actual_result,) = utils.batch_convert_to_series(bf_data) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result.to_pandas(), _SERIES, check_index_type=False, check_dtype=False ) @@ -75,6 +75,6 @@ def test_convert_to_series(session, data): def test_convert_pandas_to_series(data, session): (actual_result,) = utils.batch_convert_to_series(data, session=session) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result.to_pandas(), _SERIES, check_index_type=False, check_dtype=False ) diff --git a/tests/system/small/operations/test_dates.py b/tests/system/small/operations/test_dates.py index 9e8da642090..e9f5f07d282 100644 --- a/tests/system/small/operations/test_dates.py +++ b/tests/system/small/operations/test_dates.py @@ -17,10 +17,10 @@ from packaging import version import pandas as pd -import pandas.testing import pytest from bigframes import dtypes +import bigframes.testing def test_date_diff_between_series(session): @@ -35,7 +35,7 @@ def test_date_diff_between_series(session): actual_result = (bf_df["col_1"] - 
bf_df["col_2"]).to_pandas() expected_result = (pd_df["col_1"] - pd_df["col_2"]).astype(dtypes.TIMEDELTA_DTYPE) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -47,7 +47,7 @@ def test_date_diff_literal_sub_series(scalars_dfs): actual_result = (literal - bf_df["date_col"]).to_pandas() expected_result = (literal - pd_df["date_col"]).astype(dtypes.TIMEDELTA_DTYPE) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -59,7 +59,7 @@ def test_date_diff_series_sub_literal(scalars_dfs): actual_result = (bf_df["date_col"] - literal).to_pandas() expected_result = (pd_df["date_col"] - literal).astype(dtypes.TIMEDELTA_DTYPE) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -70,7 +70,7 @@ def test_date_series_diff_agg(scalars_dfs): actual_result = bf_df["date_col"].diff().to_pandas() expected_result = pd_df["date_col"].diff().astype(dtypes.TIMEDELTA_DTYPE) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -86,6 +86,6 @@ def test_date_can_cast_after_accessor(scalars_dfs): pd.to_datetime(pd_df["date_col"]).dt.isocalendar().week.astype("Int64") ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_dtype=False, check_index_type=False ) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index ad632e1c2ca..9f4b5e57054 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -17,13 +17,12 @@ import numpy from packaging import version -from pandas import testing import pandas as pd import pytest import bigframes.pandas as bpd import bigframes.series -from 
bigframes.testing.utils import assert_series_equal +from bigframes.testing.utils import assert_frame_equal, assert_series_equal DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)] DATE_COLUMNS = [ @@ -304,7 +303,7 @@ def test_dt_isocalendar(session): actual_result = bf_s.dt.isocalendar().to_pandas() expected_result = pd_s.dt.isocalendar() - testing.assert_frame_equal( + assert_frame_equal( actual_result, expected_result, check_dtype=False, check_index_type=False ) @@ -340,9 +339,7 @@ def test_dt_tz_localize(scalars_dfs, col_name, tz): bf_result = bf_series.dt.tz_localize(tz) pd_result = scalars_pandas_df[col_name].dt.tz_localize(tz) - testing.assert_series_equal( - bf_result.to_pandas(), pd_result, check_index_type=False - ) + assert_series_equal(bf_result.to_pandas(), pd_result, check_index_type=False) @pytest.mark.parametrize( @@ -389,7 +386,7 @@ def test_dt_strftime(scalars_df_index, scalars_pandas_df_index, column, date_for pytest.importorskip("pandas", minversion="2.0.0") bf_result = scalars_df_index[column].dt.strftime(date_format).to_pandas() pd_result = scalars_pandas_df_index[column].dt.strftime(date_format) - pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + assert_series_equal(bf_result, pd_result, check_dtype=False) assert bf_result.dtype == "string[pyarrow]" @@ -401,7 +398,7 @@ def test_dt_strftime_date(): expected_result = pd.Series(["08/15/2014", "08/15/2215", "02/29/2016"]) bf_result = bf_series.dt.strftime("%m/%d/%Y").to_pandas() - pd.testing.assert_series_equal( + assert_series_equal( bf_result, expected_result, check_index_type=False, check_dtype=False ) assert bf_result.dtype == "string[pyarrow]" @@ -417,7 +414,7 @@ def test_dt_strftime_time(): ) bf_result = bf_series.dt.strftime("%X").to_pandas() - pd.testing.assert_series_equal( + assert_series_equal( bf_result, expected_result, check_index_type=False, check_dtype=False ) assert bf_result.dtype == "string[pyarrow]" @@ -557,7 +554,7 @@ def 
test_timestamp_diff_two_dataframes(scalars_dfs): actual_result = (bf_df - bf_df).to_pandas() expected_result = pd_df - pd_df - testing.assert_frame_equal(actual_result, expected_result) + assert_frame_equal(actual_result, expected_result) def test_timestamp_diff_two_series_with_different_types_raise_error(scalars_dfs): @@ -579,9 +576,12 @@ def test_timestamp_diff_series_sub_literal(scalars_dfs, column, value): bf_series = bf_df[column] pd_series = pd_df[column] - actual_result = (bf_series - value).to_pandas() + # Pandas doesn't handle nulls properly here so we ffill + # overflows for no good reason + # related? https://github.com/apache/arrow/issues/43031 + actual_result = (bf_series.ffill() - value).to_pandas() - expected_result = pd_series - value + expected_result = pd_series.ffill() - value assert_series_equal(actual_result, expected_result) @@ -597,9 +597,12 @@ def test_timestamp_diff_literal_sub_series(scalars_dfs, column, value): bf_series = bf_df[column] pd_series = pd_df[column] - actual_result = (value - bf_series).to_pandas() + # Pandas doesn't handle nulls properly here so we ffill + # overflows for no good reason + # related? https://github.com/apache/arrow/issues/43031 + actual_result = (value - bf_series.ffill()).to_pandas() - expected_result = value - pd_series + expected_result = value - pd_series.ffill() assert_series_equal(actual_result, expected_result) @@ -611,7 +614,12 @@ def test_timestamp_series_diff_agg(scalars_dfs, column): actual_result = bf_series.diff().to_pandas() - expected_result = pd_series.diff() + # overflows for no good reason + # related? 
https://github.com/apache/arrow/issues/43031 + expected_result = pd_series.ffill().diff() + expected_result = expected_result.mask( + pd_series.isnull() | pd_series.shift(1).isnull() + ) assert_series_equal(actual_result, expected_result) @@ -666,6 +674,6 @@ def test_to_datetime(scalars_dfs, col): ).to_pandas() expected_result = pd.Series(pd.to_datetime(pd_df[col])) - testing.assert_series_equal( + assert_series_equal( actual_result, expected_result, check_dtype=False, check_index_type=False ) diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 18c88db8eb5..0329aece058 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -19,49 +19,65 @@ import numpy as np from packaging import version import pandas as pd -import pandas.testing import pyarrow as pa import pytest from bigframes import dtypes +import bigframes.testing + +# Some methods/features used by this test don't exist in pandas 1.x +pytest.importorskip("pandas", minversion="2.0.0") @pytest.fixture(scope="module") def temporal_dfs(session): pandas_df = pd.DataFrame( { - "datetime_col": [ - pd.Timestamp("2025-02-01 01:00:01"), - pd.Timestamp("2019-01-02 02:00:00"), - pd.Timestamp("1997-01-01 19:00:00"), - ], - "timestamp_col": [ - pd.Timestamp("2023-01-01 01:00:01", tz="UTC"), - pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), - pd.Timestamp("2005-03-05 02:00:00", tz="UTC"), - ], + "datetime_col": pd.Series( + [ + pd.Timestamp("2025-02-01 01:00:01"), + pd.Timestamp("2019-01-02 02:00:00"), + pd.Timestamp("1997-01-01 19:00:00"), + ], + dtype=dtypes.DATETIME_DTYPE, + ), + "timestamp_col": pd.Series( + [ + pd.Timestamp("2023-01-01 01:00:01", tz="UTC"), + pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), + pd.Timestamp("2005-03-05 02:00:00", tz="UTC"), + ], + dtype=dtypes.TIMESTAMP_DTYPE, + ), "date_col": pd.Series( [ datetime.date(2000, 1, 1), datetime.date(2001, 2, 3), datetime.date(2020, 
9, 30), ], - dtype=pd.ArrowDtype(pa.date32()), + dtype=dtypes.DATE_DTYPE, ), - "timedelta_col_1": [ - pd.Timedelta(5, "s"), - pd.Timedelta(-4, "m"), - pd.Timedelta(5, "h"), - ], - "timedelta_col_2": [ - pd.Timedelta(3, "s"), - pd.Timedelta(-4, "m"), - pd.Timedelta(6, "h"), - ], - "float_col": [1.5, 2, -3], - "int_col": [1, 2, -3], - "positive_int_col": [1, 2, 3], - } + "timedelta_col_1": pd.Series( + [ + pd.Timedelta(5, "s"), + pd.Timedelta(-4, "m"), + pd.Timedelta(5, "h"), + ], + dtype=dtypes.TIMEDELTA_DTYPE, + ), + "timedelta_col_2": pd.Series( + [ + pd.Timedelta(3, "s"), + pd.Timedelta(-4, "m"), + pd.Timedelta(6, "h"), + ], + dtype=dtypes.TIMEDELTA_DTYPE, + ), + "float_col": pd.Series([1.5, 2, -3], dtype=dtypes.FLOAT_DTYPE), + "int_col": pd.Series([1, 2, -3], dtype="Int64"), + "positive_int_col": pd.Series([1, 2, 3], dtype="Int64"), + }, + index=pd.Index(range(3), dtype="Int64"), ) bigframes_df = session.read_pandas(pandas_df) @@ -70,89 +86,106 @@ def temporal_dfs(session): def _assert_series_equal(actual: pd.Series, expected: pd.Series): - """Helper function specifically for timedelta testsing. Don't use it outside of this module.""" - if actual.dtype == dtypes.FLOAT_DTYPE: - pandas.testing.assert_series_equal( - actual, expected.astype("Float64"), check_index_type=False - ) - elif actual.dtype == dtypes.INT_DTYPE: - pandas.testing.assert_series_equal( - actual, expected.astype("Int64"), check_index_type=False - ) - else: - pandas.testing.assert_series_equal( - actual.astype("timedelta64[ns]"), - expected.dt.floor("us"), # in BF the precision is microsecond - check_index_type=False, - ) + """Helper function specifically for timedelta testing. 
Don't use it outside of this module.""" + bigframes.testing.assert_series_equal( + actual, + expected, + check_index_type=False, + check_dtype=False, + ) @pytest.mark.parametrize( - ("op", "col_1", "col_2"), + ("op", "col_1", "col_2", "arrow_supported"), [ - (operator.add, "timedelta_col_1", "timedelta_col_2"), - (operator.sub, "timedelta_col_1", "timedelta_col_2"), - (operator.truediv, "timedelta_col_1", "timedelta_col_2"), - (operator.floordiv, "timedelta_col_1", "timedelta_col_2"), - (operator.truediv, "timedelta_col_1", "float_col"), - (operator.floordiv, "timedelta_col_1", "float_col"), - (operator.mul, "timedelta_col_1", "float_col"), - (operator.mul, "float_col", "timedelta_col_1"), - (operator.mod, "timedelta_col_1", "timedelta_col_2"), + (operator.add, "timedelta_col_1", "timedelta_col_2", True), + (operator.sub, "timedelta_col_1", "timedelta_col_2", True), + (operator.truediv, "timedelta_col_1", "timedelta_col_2", True), + (operator.floordiv, "timedelta_col_1", "timedelta_col_2", True), + (operator.truediv, "timedelta_col_1", "float_col", False), + (operator.floordiv, "timedelta_col_1", "float_col", False), + (operator.mul, "timedelta_col_1", "float_col", False), + (operator.mul, "float_col", "timedelta_col_1", False), + (operator.mod, "timedelta_col_1", "timedelta_col_2", False), ], ) -def test_timedelta_binary_ops_between_series(temporal_dfs, op, col_1, col_2): +def test_timedelta_binary_ops_between_series( + temporal_dfs, op, col_1, col_2, arrow_supported +): bf_df, pd_df = temporal_dfs actual_result = op(bf_df[col_1], bf_df[col_2]).to_pandas() - expected_result = op(pd_df[col_1], pd_df[col_2]) + if not arrow_supported: + expected_result = pd_df.apply(lambda x: op(x[col_1], x[col_2]), axis=1) + else: + expected_result = op(pd_df[col_1], pd_df[col_2]) _assert_series_equal(actual_result, expected_result) @pytest.mark.parametrize( - ("op", "col", "literal"), + ("op", "col", "literal", "arrow_supported"), [ - (operator.add, "timedelta_col_1", 
pd.Timedelta(2, "s")), - (operator.sub, "timedelta_col_1", pd.Timedelta(2, "s")), - (operator.truediv, "timedelta_col_1", pd.Timedelta(2, "s")), - (operator.floordiv, "timedelta_col_1", pd.Timedelta(2, "s")), - (operator.truediv, "timedelta_col_1", 3), - (operator.floordiv, "timedelta_col_1", 3), - (operator.mul, "timedelta_col_1", 3), - (operator.mul, "float_col", pd.Timedelta(1, "s")), - (operator.mod, "timedelta_col_1", pd.Timedelta(7, "s")), + (operator.add, "timedelta_col_1", pd.Timedelta(2, "s").as_unit("us"), True), + (operator.sub, "timedelta_col_1", pd.Timedelta(2, "s").as_unit("us"), True), + (operator.truediv, "timedelta_col_1", pd.Timedelta(2, "s").as_unit("us"), True), + ( + operator.floordiv, + "timedelta_col_1", + pd.Timedelta(2, "s").as_unit("us"), + False, + ), + (operator.truediv, "timedelta_col_1", 3, True), + (operator.floordiv, "timedelta_col_1", 3, False), + (operator.mul, "timedelta_col_1", 3, True), + (operator.mul, "float_col", pd.Timedelta(1, "s").as_unit("us"), True), + (operator.mod, "timedelta_col_1", pd.Timedelta(7, "s").as_unit("us"), False), ], ) -def test_timedelta_binary_ops_series_and_literal(temporal_dfs, op, col, literal): +def test_timedelta_binary_ops_series_and_literal( + temporal_dfs, op, col, literal, arrow_supported +): bf_df, pd_df = temporal_dfs actual_result = op(bf_df[col], literal).to_pandas() - expected_result = op(pd_df[col], literal) + if not arrow_supported: + expected_result = pd_df[col].map(lambda x: op(x, literal)) + else: + expected_result = op(pd_df[col], literal) _assert_series_equal(actual_result, expected_result) @pytest.mark.parametrize( - ("op", "col", "literal"), + ("op", "col", "literal", "arrow_supported"), [ - (operator.add, "timedelta_col_1", pd.Timedelta(2, "s")), - (operator.sub, "timedelta_col_1", pd.Timedelta(2, "s")), - (operator.truediv, "timedelta_col_1", pd.Timedelta(2, "s")), - (operator.floordiv, "timedelta_col_1", pd.Timedelta(2, "s")), - (operator.truediv, "float_col", pd.Timedelta(2, 
"s")), - (operator.floordiv, "float_col", pd.Timedelta(2, "s")), - (operator.mul, "timedelta_col_1", 3), - (operator.mul, "float_col", pd.Timedelta(1, "s")), - (operator.mod, "timedelta_col_1", pd.Timedelta(7, "s")), + (operator.add, "timedelta_col_1", pd.Timedelta(2, "s").as_unit("us"), True), + (operator.sub, "timedelta_col_1", pd.Timedelta(2, "s").as_unit("us"), True), + (operator.truediv, "timedelta_col_1", pd.Timedelta(2, "s").as_unit("us"), True), + ( + operator.floordiv, + "timedelta_col_1", + pd.Timedelta(2, "s").as_unit("us"), + True, + ), + (operator.truediv, "float_col", pd.Timedelta(2, "s").as_unit("us"), True), + (operator.floordiv, "float_col", pd.Timedelta(2, "s").as_unit("us"), True), + (operator.mul, "timedelta_col_1", 3, True), + (operator.mul, "float_col", pd.Timedelta(1, "s").as_unit("us"), False), + (operator.mod, "timedelta_col_1", pd.Timedelta(7, "s").as_unit("us"), False), ], ) -def test_timedelta_binary_ops_literal_and_series(temporal_dfs, op, col, literal): +def test_timedelta_binary_ops_literal_and_series( + temporal_dfs, op, col, literal, arrow_supported +): bf_df, pd_df = temporal_dfs actual_result = op(literal, bf_df[col]).to_pandas() - expected_result = op(literal, pd_df[col]) + if not arrow_supported: + expected_result = pd_df[col].map(lambda x: op(literal, x)) + else: + expected_result = op(literal, pd_df[col]) _assert_series_equal(actual_result, expected_result) @@ -176,12 +209,10 @@ def test_timedelta_unary_ops(temporal_dfs, op): def test_timestamp_add__ts_series_plus_td_series(temporal_dfs, column, pd_dtype): bf_df, pd_df = temporal_dfs - actual_result = ( - (bf_df[column] + bf_df["timedelta_col_1"]).to_pandas().astype(pd_dtype) - ) + actual_result = (bf_df[column] + bf_df["timedelta_col_1"]).to_pandas() expected_result = pd_df[column] + pd_df["timedelta_col_1"] - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -199,7 +230,7 @@ def 
test_timestamp_add__ts_series_plus_td_series__explicit_cast(temporal_dfs, co @pytest.mark.parametrize( "literal", [ - pytest.param(pd.Timedelta(1, unit="s"), id="pandas"), + pytest.param(pd.Timedelta(1, unit="s").as_unit("us"), id="pandas"), pytest.param(datetime.timedelta(seconds=1), id="python-datetime"), pytest.param(np.timedelta64(1, "s"), id="numpy"), ], @@ -207,12 +238,10 @@ def test_timestamp_add__ts_series_plus_td_series__explicit_cast(temporal_dfs, co def test_timestamp_add__ts_series_plus_td_literal(temporal_dfs, literal): bf_df, pd_df = temporal_dfs - actual_result = ( - (bf_df["timestamp_col"] + literal).to_pandas().astype("datetime64[ns, UTC]") - ) + actual_result = (bf_df["timestamp_col"] + literal).to_pandas() expected_result = pd_df["timestamp_col"] + literal - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -227,24 +256,22 @@ def test_timestamp_add__ts_series_plus_td_literal(temporal_dfs, literal): def test_timestamp_add__td_series_plus_ts_series(temporal_dfs, column, pd_dtype): bf_df, pd_df = temporal_dfs - actual_result = ( - (bf_df["timedelta_col_1"] + bf_df[column]).to_pandas().astype(pd_dtype) - ) + actual_result = (bf_df["timedelta_col_1"] + bf_df[column]).to_pandas() expected_result = pd_df["timedelta_col_1"] + pd_df[column] - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) def test_timestamp_add__td_literal_plus_ts_series(temporal_dfs): bf_df, pd_df = temporal_dfs - timedelta = pd.Timedelta(1, unit="s") + timedelta = pd.Timedelta(1, unit="s").as_unit("us") - actual_result = (timedelta + bf_df["datetime_col"]).to_pandas().astype(" pd.Timedelta(1, "h"))] - .to_pandas() - .astype(" pd.Timedelta(1, "h")) + ].to_pandas() expected_result = pd_series[(pd_series - timestamp) > pd.Timedelta(1, "h")] - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( 
actual_result, expected_result, check_index_type=False ) @@ -540,29 +561,30 @@ def test_timedelta_filtering(session): def test_timedelta_ordering(session): pd_df = pd.DataFrame( { - "col_1": [ - pd.Timestamp("2025-01-01 01:00:00"), - pd.Timestamp("2025-01-01 02:00:00"), - pd.Timestamp("2025-01-01 03:00:00"), - ], - "col_2": [ - pd.Timestamp("2025-01-01 01:00:02"), - pd.Timestamp("2025-01-01 02:00:01"), - pd.Timestamp("2025-01-01 02:59:59"), - ], + "col_1": pd.Series( + [ + pd.Timestamp("2025-01-01 01:00:00"), + pd.Timestamp("2025-01-01 02:00:00"), + pd.Timestamp("2025-01-01 03:00:00"), + ], + dtype=dtypes.TIMESTAMP_DTYPE, + ), + "col_2": pd.Series( + [ + pd.Timestamp("2025-01-01 01:00:02"), + pd.Timestamp("2025-01-01 02:00:01"), + pd.Timestamp("2025-01-01 02:59:59"), + ], + dtype=dtypes.TIMESTAMP_DTYPE, + ), } ) bf_df = session.read_pandas(pd_df) - actual_result = ( - (bf_df["col_2"] - bf_df["col_1"]) - .sort_values() - .to_pandas() - .astype("timedelta64[ns]") - ) + actual_result = (bf_df["col_2"] - bf_df["col_1"]).sort_values().to_pandas() expected_result = (pd_df["col_2"] - pd_df["col_1"]).sort_values() - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -593,7 +615,7 @@ def test_timedelta_agg__timedelta_result(temporal_dfs, agg_func): actual_result = agg_func(bf_df["timedelta_col_1"]) - expected_result = agg_func(pd_df["timedelta_col_1"]).floor("us") + expected_result = agg_func(pd_df["timedelta_col_1"]) assert actual_result == expected_result @@ -629,6 +651,6 @@ def test_timestamp_diff_after_type_casting(temporal_dfs): expected_result = pd_df["timestamp_col"] - pd_df["positive_int_col"].astype( "datetime64[us, UTC]" ) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False, check_dtype=False ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 
fa82cce6054..c1692b00a68 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -33,6 +33,7 @@ import bigframes.dtypes as dtypes import bigframes.pandas as bpd import bigframes.series as series +import bigframes.testing from bigframes.testing.utils import ( assert_dfs_equivalent, assert_frame_equal, @@ -133,7 +134,7 @@ def test_df_construct_structs(session): ] ).to_frame() bf_series = session.read_pandas(pd_frame) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_series.to_pandas(), pd_frame, check_index_type=False, check_dtype=False ) @@ -143,7 +144,7 @@ def test_df_construct_local_concat_pd(scalars_pandas_df_index, session): bf_df = session.read_pandas(pd_df) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_df.to_pandas(), pd_df, check_index_type=False, check_dtype=False ) @@ -318,7 +319,7 @@ def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): 3, ["bool_col", "int64_too"], keep=keep ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -336,7 +337,7 @@ def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep) pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -355,7 +356,7 @@ def test_get_columns(scalars_dfs): col_names = ["bool_col", "float64_col", "int64_col"] df_subset = scalars_df.get(col_names) df_pandas = df_subset.to_pandas() - pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df[col_names].columns ) @@ -402,7 +403,7 @@ def test_insert(scalars_dfs, loc, column, value, allow_duplicates): bf_df.insert(loc, column, value, allow_duplicates) pd_df.insert(loc, column, value, allow_duplicates) - 
pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index): @@ -596,7 +597,7 @@ def test_drop_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" df_pandas = scalars_df.drop(columns=col_name).to_pandas() - pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.drop(columns=col_name).columns ) @@ -605,7 +606,7 @@ def test_drop_columns(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_names = ["int64_col", "geography_col", "time_col"] df_pandas = scalars_df.drop(columns=col_names).to_pandas() - pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.drop(columns=col_names).columns ) @@ -617,7 +618,7 @@ def test_drop_labels_axis_1(scalars_dfs): pd_result = scalars_pandas_df.drop(labels=labels, axis=1) bf_result = scalars_df.drop(labels=labels, axis=1).to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_drop_with_custom_column_labels(scalars_dfs): @@ -644,7 +645,7 @@ def test_df_memory_usage(scalars_dfs): pd_result = scalars_pandas_df.memory_usage() bf_result = scalars_df.memory_usage() - pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) + bigframes.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) def test_df_info(scalars_dfs): @@ -743,7 +744,7 @@ def test_select_dtypes(scalars_dfs, include, exclude): pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude) bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_drop_index(scalars_dfs): @@ -752,7 +753,7 @@ def 
test_drop_index(scalars_dfs): pd_result = scalars_pandas_df.drop(index=[4, 1, 2]) bf_result = scalars_df.drop(index=[4, 1, 2]).to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_drop_pandas_index(scalars_dfs): @@ -762,7 +763,7 @@ def test_drop_pandas_index(scalars_dfs): pd_result = scalars_pandas_df.drop(index=drop_index) bf_result = scalars_df.drop(index=drop_index).to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_drop_bigframes_index(scalars_dfs): @@ -773,10 +774,12 @@ def test_drop_bigframes_index(scalars_dfs): pd_result = scalars_pandas_df.drop(index=drop_pandas_index) bf_result = scalars_df.drop(index=drop_index).to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_drop_bigframes_index_with_na(scalars_dfs): + if pd.__version__.startswith("3"): + pytest.skip("Pandas 3.0 doesn't doesn't support drop with pd.NA values") scalars_df, scalars_pandas_df = scalars_dfs scalars_df = scalars_df.copy() scalars_pandas_df = scalars_pandas_df.copy() @@ -788,7 +791,7 @@ def test_drop_bigframes_index_with_na(scalars_dfs): pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index) bf_result = scalars_df.drop(index=drop_index).to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_drop_bigframes_multiindex(scalars_dfs): @@ -809,7 +812,7 @@ def test_drop_bigframes_multiindex(scalars_dfs): bf_result = scalars_df.drop(index=drop_index).to_pandas() pd_result = scalars_pandas_df.drop(index=drop_pandas_index) - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_drop_labels_axis_0(scalars_dfs): @@ -818,7 +821,7 @@ def test_drop_labels_axis_0(scalars_dfs): pd_result 
= scalars_pandas_df.drop(labels=[4, 1, 2], axis=0) bf_result = scalars_df.drop(labels=[4, 1, 2], axis=0).to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_drop_index_and_columns(scalars_dfs): @@ -827,14 +830,14 @@ def test_drop_index_and_columns(scalars_dfs): pd_result = scalars_pandas_df.drop(index=[4, 1, 2], columns="int64_col") bf_result = scalars_df.drop(index=[4, 1, 2], columns="int64_col").to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result) + bigframes.testing.assert_frame_equal(pd_result, bf_result) def test_rename(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name_dict = {"bool_col": 1.2345} df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns ) @@ -844,7 +847,7 @@ def test_df_peek(scalars_dfs_maybe_ordered): peek_result = scalars_df.peek(n=3, force=False, allow_large_results=True) - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + bigframes.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) assert len(peek_result) == 3 @@ -853,14 +856,14 @@ def test_df_peek_with_large_results_not_allowed(scalars_dfs_maybe_ordered): peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + bigframes.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) assert len(peek_result) == 3 def test_df_peek_filtered(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3, force=False) - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + bigframes.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) assert len(peek_result) == 3 @@ 
-875,7 +878,7 @@ def test_df_peek_exception(scalars_dfs): def test_df_peek_force_default(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) - pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns ) assert len(peek_result) == 3 @@ -886,7 +889,7 @@ def test_df_peek_reset_index(scalars_dfs): peek_result = ( scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) ) - pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns ) assert len(peek_result) == 3 @@ -987,7 +990,7 @@ def test_df_column_name_with_space(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name_dict = {"bool_col": "bool col"} df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns ) @@ -996,7 +999,7 @@ def test_df_column_name_duplicate(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name_dict = {"int64_too": "int64_col"} df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns ) @@ -1007,7 +1010,7 @@ def test_get_df_column_name_duplicate(scalars_dfs): bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].to_pandas() pd_result = scalars_pandas_df.rename(columns=col_name_dict)["int64_col"] - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + bigframes.testing.assert_index_equal(bf_result.columns, pd_result.columns) @pytest.mark.parametrize( @@ -1124,7 +1127,7 @@ def test_assign_new_column_w_loc(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery 
DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1146,7 +1149,7 @@ def test_assign_new_column_w_setitem(scalars_dfs, scalar): # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Float64") - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_assign_new_column_w_setitem_dataframe(scalars_dfs): @@ -1159,7 +1162,7 @@ def test_assign_new_column_w_setitem_dataframe(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_df["int64_col"] = pd_df["int64_col"].astype("Int64") - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df) def test_assign_new_column_w_setitem_dataframe_error(scalars_dfs): @@ -1185,7 +1188,7 @@ def test_assign_new_column_w_setitem_list(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): @@ -1203,7 +1206,7 @@ def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): @@ -1223,7 +1226,7 @@ def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_assign_new_column_w_setitem_list_error(scalars_dfs): @@ -1265,7 +1268,9 @@ def test_setitem_multicolumn_with_literals(scalars_dfs, key, value): bf_result[key] = value pd_result[key] = value - pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result.to_pandas(), check_dtype=False + ) def test_setitem_multicolumn_with_literals_different_lengths_raise_error(scalars_dfs): @@ -1284,7 +1289,9 @@ def test_setitem_multicolumn_with_dataframes(scalars_dfs): bf_result[["int64_col", "int64_too"]] = bf_result[["int64_too", "int64_col"]] / 2 pd_result[["int64_col", "int64_too"]] = pd_result[["int64_too", "int64_col"]] / 2 - pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result.to_pandas(), check_dtype=False + ) def test_setitem_multicolumn_with_dataframes_series_on_rhs_raise_error(scalars_dfs): @@ -1439,7 +1446,7 @@ def test_assign_different_df_w_loc( # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_assign_different_df_w_setitem( @@ -1458,7 +1465,7 @@ def test_assign_different_df_w_setitem( # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_assign_callable_lambda(scalars_dfs): @@ -1528,7 +1535,7 @@ def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh): bf_result = df_result.to_pandas() # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_df_dropna_range_columns(scalars_dfs): @@ -1576,7 +1583,7 @@ def test_df_fillna(scalars_dfs, col, fill_value): bf_result = scalars_df[col].fillna(fill_value).to_pandas() pd_result = scalars_pandas_df[col].fillna(fill_value) - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_df_replace_scalar_scalar(scalars_dfs): @@ -1585,7 +1592,7 @@ def test_df_replace_scalar_scalar(scalars_dfs): pd_result = scalars_pandas_df.replace(555.555, 3) # pandas has narrower result types as they are determined dynamically - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) def test_df_replace_regex_scalar(scalars_dfs): @@ -1593,7 +1600,7 @@ def test_df_replace_regex_scalar(scalars_dfs): bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas() pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, ) @@ -1605,7 +1612,7 @@ def test_df_replace_list_scalar(scalars_dfs): pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) # pandas has narrower result types as they are determined dynamically - pd.testing.assert_frame_equal( + 
bigframes.testing.assert_frame_equal( pd_result, bf_result, check_dtype=False, @@ -1617,7 +1624,7 @@ def test_df_replace_value_dict(scalars_dfs): bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas() pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200}) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, ) @@ -1834,7 +1841,7 @@ def test_df_cross_merge(scalars_dfs): ), "cross", ) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) @pytest.mark.parametrize( @@ -1987,7 +1994,7 @@ def test_self_merge_self_w_on_args(): bf_result = bf_df1.merge( bf_df2, left_on=["A", "C"], right_on=["B", "C"], how="inner" ).to_pandas() - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) @pytest.mark.parametrize( @@ -2028,7 +2035,7 @@ def test_get_dtypes(scalars_df_default_index): "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), "duration_col": pd.ArrowDtype(pa.duration("us")), } - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( dtypes, pd.Series(dtypes_dict), ) @@ -2044,7 +2051,7 @@ def test_get_dtypes_array_struct_query(session): ) dtypes = df.dtypes - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( dtypes, pd.Series( { @@ -2064,7 +2071,7 @@ def test_get_dtypes_array_struct_query(session): def test_get_dtypes_array_struct_table(nested_df): dtypes = nested_df.dtypes - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( dtypes, pd.Series( { @@ -2602,7 +2609,7 @@ def test_combine( ) # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) 
@pytest.mark.parametrize( @@ -2640,7 +2647,7 @@ def test_df_update(overwrite, filter_func): bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func) pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func) - pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1) + bigframes.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1) def test_df_idxmin(): @@ -2652,7 +2659,7 @@ def test_df_idxmin(): bf_result = bf_df.idxmin().to_pandas() pd_result = pd_df.idxmin() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_dtype=False ) @@ -2666,7 +2673,7 @@ def test_df_idxmax(): bf_result = bf_df.idxmax().to_pandas() pd_result = pd_df.idxmax() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_dtype=False ) @@ -2707,8 +2714,12 @@ def test_df_align(join, axis): assert isinstance(bf_result1, dataframe.DataFrame) and isinstance( bf_result2, dataframe.DataFrame ) - pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) - pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) + bigframes.testing.assert_frame_equal( + bf_result1.to_pandas(), pd_result1, check_dtype=False + ) + bigframes.testing.assert_frame_equal( + bf_result2.to_pandas(), pd_result2, check_dtype=False + ) def test_combine_first( @@ -2733,7 +2744,7 @@ def test_combine_first( pd_result = pd_df_a.combine_first(pd_df_b) # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @pytest.mark.parametrize( @@ -2760,9 +2771,9 @@ def test_df_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. 
# - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + bigframes.testing.assert_index_equal(bf_result.columns, pd_result.columns) # Only check row order in ordered mode. - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, @@ -2804,9 +2815,9 @@ def test_cov_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + bigframes.testing.assert_index_equal(bf_result.columns, pd_result.columns) # Only check row order in ordered mode. - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, @@ -2827,7 +2838,7 @@ def test_df_corrwith_df(scalars_dfs_maybe_ordered): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -2848,7 +2859,7 @@ def test_df_corrwith_df_numeric_only(scalars_dfs): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -2877,7 +2888,7 @@ def test_df_corrwith_series(scalars_dfs_maybe_ordered): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses strign, Pandas uses object. 
- pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -3148,7 +3159,7 @@ def test_binop_df_df_binary_op( pd_result = pd_df_a - pd_df_b # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) # Differnt table will only work for explicit index, since default index orders are arbitrary. @@ -3258,9 +3269,9 @@ def test_join_different_table_with_duplicate_column_name( pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r") # Ensure no inplace changes - pd.testing.assert_index_equal(bf_df_a.columns, pd_df_a.columns) - pd.testing.assert_index_equal(bf_df_b.index.to_pandas(), pd_df_b.index) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_index_equal(bf_df_a.columns, pd_df_a.columns) + bigframes.testing.assert_index_equal(bf_df_b.index.to_pandas(), pd_df_b.index) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) @all_joins @@ -3288,14 +3299,14 @@ def test_join_param_on_with_duplicate_column_name_not_on_col( pd_result = pd_df_a.join( pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.sort_index(), pd_result.sort_index(), check_like=True, check_index_type=False, check_names=False, ) - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + bigframes.testing.assert_index_equal(bf_result.columns, pd_result.columns) @pytest.mark.skipif( @@ -3326,14 +3337,14 @@ def test_join_param_on_with_duplicate_column_name_on_col( pd_result = pd_df_a.join( pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.sort_index(), pd_result.sort_index(), 
check_like=True, check_index_type=False, check_names=False, ) - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + bigframes.testing.assert_index_equal(bf_result.columns, pd_result.columns) @all_joins @@ -3478,7 +3489,7 @@ def test_dataframe_numeric_analytic_op( bf_series = operator(scalars_df_index[columns]) pd_series = operator(scalars_pandas_df_index[columns]) bf_result = bf_series.to_pandas() - pd.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -3503,7 +3514,7 @@ def test_dataframe_general_analytic_op( bf_series = operator(scalars_df_index[col_names]) pd_series = operator(scalars_pandas_df_index[col_names]) bf_result = bf_series.to_pandas() - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_series, bf_result, ) @@ -3521,7 +3532,7 @@ def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods): col_names = ["int64_too", "float64_col", "int64_col"] bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas() pd_result = scalars_pandas_df_index[col_names].diff(periods=periods) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, ) @@ -3540,7 +3551,7 @@ def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() # pandas 3.0 does not automatically ffill anymore pd_result = scalars_pandas_df_index[col_names].ffill().pct_change(periods=periods) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, ) @@ -3554,7 +3565,7 @@ def test_dataframe_agg_single_string(scalars_dfs): pd_result = scalars_pandas_df[numeric_cols].agg("sum") assert bf_result.dtype == "Float64" - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, 
check_index_type=False ) @@ -3574,7 +3585,7 @@ def test_dataframe_agg_int_single_string(scalars_dfs, agg): pd_result = scalars_pandas_df[numeric_cols].agg(agg) assert bf_result.dtype == "Int64" - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) @@ -3629,7 +3640,7 @@ def test_dataframe_agg_int_multi_string(scalars_dfs): # Pandas may produce narrower numeric types # Pandas has object index type - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) @@ -3672,7 +3683,7 @@ def test_df_transpose_repeated_uses_cache(): bf_df = bf_df.transpose() + i pd_df = pd_df.transpose() + i - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_df, bf_df.to_pandas(), check_dtype=False, check_index_type=False ) @@ -3715,7 +3726,7 @@ def test_df_melt_default(scalars_dfs): pd_result = scalars_pandas_df[columns].melt() # Pandas produces int64 index, Bigframes produces Int64 (nullable) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, @@ -3744,7 +3755,7 @@ def test_df_melt_parameterized(scalars_dfs): ) # Pandas produces int64 index, Bigframes produces Int64 (nullable) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, check_dtype=False ) @@ -3797,7 +3808,7 @@ def test_df_pivot(scalars_dfs, values, index, columns): # Pandas produces NaN, where bq dataframes produces pd.NA bf_result = bf_result.fillna(float("nan")) pd_result = pd_result.fillna(float("nan")) - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @pytest.mark.parametrize( @@ -3818,7 +3829,7 @@ def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): ) # Pandas produces NaN, where bq 
dataframes produces pd.NA - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @pytest.mark.parametrize( @@ -3859,7 +3870,7 @@ def test_df_pivot_table( aggfunc=aggfunc, fill_value=fill_value, ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_column_type=False ) @@ -3939,7 +3950,7 @@ def test__dir__with_rename(scalars_dfs): def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index[["int64_col", "int64_col", "int64_too"]].to_pandas() pd_result = scalars_pandas_df_index[["int64_col", "int64_col", "int64_too"]] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -3963,7 +3974,7 @@ def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index) def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index.iloc[start:stop:step] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -3981,7 +3992,7 @@ def test_iloc_slice_after_cache( scalars_df_index.cache() bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index.iloc[start:stop:step] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4014,7 +4025,7 @@ def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): bf_result = scalars_df_index.iloc[index] pd_result = scalars_pandas_df_index.iloc[index] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -4039,14 +4050,14 @@ def test_iloc_tuple_multi_columns(scalars_df_index, scalars_pandas_df_index, ind bf_result = scalars_df_index.iloc[index].to_pandas() pd_result = 
scalars_pandas_df_index.iloc[index] - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_iloc_tuple_multi_columns_single_row(scalars_df_index, scalars_pandas_df_index): index = (2, [2, 1, 3, -4]) bf_result = scalars_df_index.iloc[index] pd_result = scalars_pandas_df_index.iloc[index] - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -4100,7 +4111,7 @@ def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.loc[scalars_df_index.bool_col].to_pandas() pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.bool_col] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4111,7 +4122,7 @@ def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_in bf_result = scalars_df_index.loc[idx_list, ["bool_col", "int64_col"]].to_pandas() pd_result = scalars_pandas_df_index.loc[idx_list, ["bool_col", "int64_col"]] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4120,7 +4131,7 @@ def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_in def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() pd_result = scalars_pandas_df_index.loc[:, "int64_col"] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -4131,7 +4142,7 @@ def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_in pd_result = scalars_pandas_df_index.loc[ :, scalars_pandas_df_index.dtypes == "Int64" ] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4154,7 +4165,7 @@ def test_loc_select_with_column_condition_bf_series( pd_result = scalars_pandas_df_index.loc[ :, 
scalars_pandas_df_index.nunique() > size_half ] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4168,7 +4179,7 @@ def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_ind index = "Hello, World!" bf_result = scalars_df_index.loc[index] pd_result = scalars_pandas_df_index.loc[index] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -4180,7 +4191,7 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index index = -2345 bf_result = scalars_df_index.loc[index] pd_result = scalars_pandas_df_index.loc[index] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -4194,7 +4205,7 @@ def test_at_with_duplicate(scalars_df_index, scalars_pandas_df_index): index = "Hello, World!" bf_result = scalars_df_index.at[index, "int64_too"] pd_result = scalars_pandas_df_index.at[index, "int64_too"] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4219,7 +4230,7 @@ def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): # pandas uses float64 instead pd_df["new_col"] = pd_df["new_col"].astype("Float64") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_df.to_pandas(), pd_df, ) @@ -4243,7 +4254,7 @@ def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): bf_df.loc[bf_df["int64_too"] == 1, col] = value pd_df.loc[pd_df["int64_too"] == 1, col] = value - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_df.to_pandas(), pd_df, ) @@ -4396,7 +4407,7 @@ def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_ # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + 
bigframes.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): @@ -4409,7 +4420,7 @@ def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df pd_result = pd_result.astype("Float64") pd_result.index = pd_result.index.astype("Float64") - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -4435,7 +4446,7 @@ def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op bf_result = bf_series.to_pandas() pd_series.index = pd_series.index.astype(bf_result.index.dtype) - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + bigframes.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): @@ -4447,7 +4458,7 @@ def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_series = pd_series.astype("Float64") # Pandas has object index type - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + bigframes.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) def test_df_skew_too_few_values(scalars_dfs): @@ -4459,7 +4470,7 @@ def test_df_skew_too_few_values(scalars_dfs): # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) @pytest.mark.parametrize( @@ -4492,7 +4503,7 @@ def test_df_kurt_too_few_values(scalars_dfs): # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - 
pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) def test_df_kurt(scalars_dfs): @@ -4504,7 +4515,7 @@ def test_df_kurt(scalars_dfs): # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) @pytest.mark.parametrize( @@ -4588,7 +4599,7 @@ def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis): pd_result = scalars_pandas_df_index.add_prefix("prefix_", axis) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, @@ -4609,7 +4620,7 @@ def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis): pd_result = scalars_pandas_df_index.add_suffix("_suffix", axis) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, @@ -4629,7 +4640,7 @@ def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"]) # Ignore column ordering as pandas order differently depending on version - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.sort_index(axis=1), pd_result.sort_index(axis=1), ) @@ -4640,7 +4651,7 @@ def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.filter(like="64_col") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4651,7 +4662,7 @@ def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, 
pd_result, ) @@ -4683,7 +4694,7 @@ def test_df_rows_filter_like(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.filter(like="ello", axis=0) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4697,7 +4708,7 @@ def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.filter(regex="^[GH].*", axis=0) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4728,7 +4739,7 @@ def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4753,7 +4764,7 @@ def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): # Pandas uses float64 as default for newly created empty column, bf uses Float64 pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4768,7 +4779,7 @@ def test_df_reindex_columns_with_same_order(scalars_df_index, scalars_pandas_df_ bf_result = bf.reindex(columns=columns).to_pandas() pd_result = pd_df.reindex(columns=columns) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4858,7 +4869,7 @@ def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index): pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # Pandas uses float64 as default for newly created empty column, bf uses Float64 pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4869,7 +4880,7 @@ def test_df_values(scalars_df_index, scalars_pandas_df_index): pd_result = 
scalars_pandas_df_index.values # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False ) @@ -4879,7 +4890,7 @@ def test_df_to_numpy(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.to_numpy() # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False ) @@ -4889,7 +4900,7 @@ def test_df___array__(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.__array__() # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False ) @@ -4981,7 +4992,7 @@ def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.loc[index_list].to_pandas() pd_result = scalars_pandas_df_index.loc[index_list] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -4993,7 +5004,7 @@ def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.loc[index_list] pd_result = scalars_pandas_df_index.loc[index_list] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5028,7 +5039,7 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): bf_result = scalars_df_index.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5048,7 +5059,7 @@ def test_iloc_list_partial_ordering( bf_result = 
scalars_df_partial_ordering.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5066,7 +5077,7 @@ def test_iloc_list_multiindex(scalars_dfs): bf_result = scalars_df.iloc[index_list] pd_result = scalars_pandas_df.iloc[index_list] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5087,7 +5098,7 @@ def test_rename_axis(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.rename_axis("newindexname") pd_result = scalars_pandas_df_index.rename_axis("newindexname") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5097,7 +5108,7 @@ def test_rename_axis_nonstring(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.rename_axis((4,)) pd_result = scalars_pandas_df_index.rename_axis((4,)) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5113,7 +5124,7 @@ def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.loc[bf_string_series] pd_result = scalars_pandas_df_index.loc[pd_string_series] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5131,7 +5142,7 @@ def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_multiindex.loc[bf_string_series] pd_result = scalars_pandas_df_multiindex.loc[pd_string_series] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5144,7 +5155,7 @@ def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.loc[bf_index] pd_result = scalars_pandas_df_index.loc[pd_index] - pd.testing.assert_frame_equal( + 
bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5164,7 +5175,7 @@ def test_loc_bf_index_integer_index_renamed_col( bf_result = scalars_df_index.loc[bf_index] pd_result = scalars_pandas_df_index.loc[pd_index] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.to_pandas(), pd_result, ) @@ -5190,7 +5201,7 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub columns = ["bool_col", "int64_too", "int64_col"] bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_df, bf_df, ) @@ -5217,7 +5228,7 @@ def test_df_drop_duplicates_w_json(json_df, keep): pd_df = json_pandas_df.drop_duplicates(keep=keep) pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_df, bf_df, ) @@ -5242,7 +5253,7 @@ def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): columns = ["bool_col", "int64_too", "int64_col"] bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).to_pandas() pd_series = scalars_pandas_df_index[columns].duplicated(subset, keep=keep) - pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) + bigframes.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) def test_df_from_dict_columns_orient(): @@ -5313,9 +5324,12 @@ def test_df_to_latex(scalars_df_index, scalars_pandas_df_index): def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.to_json() + # pandas 3.0 bugged for serializing date col + bf_result = scalars_df_index.drop(columns="date_col").to_json() # default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.to_json(default_handler=str) + pd_result = 
scalars_pandas_df_index.drop(columns="date_col").to_json( + default_handler=str + ) assert bf_result == pd_result @@ -5468,7 +5482,7 @@ def test_df_eval(scalars_dfs, expr): bf_result = scalars_df.eval(expr).to_pandas() pd_result = scalars_pandas_df.eval(expr) - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -5489,7 +5503,7 @@ def test_df_query(scalars_dfs, expr): bf_result = scalars_df.query(expr).to_pandas() pd_result = scalars_pandas_df.query(expr) - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -5514,8 +5528,12 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): subset, normalize=normalize, ascending=ascending, dropna=dropna ) - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + bigframes.testing.assert_series_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + ignore_order=True, # different pandas versions inconsistent for tie-handling ) @@ -5562,7 +5580,7 @@ def test_df_rank_with_nulls( .astype(pd.Float64Dtype()) ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -5659,7 +5677,7 @@ def test_df_dot_inline(session): pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -5676,7 +5694,7 @@ def test_df_dot( for name in pd_result.columns: pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -5693,7 +5711,7 @@ def test_df_dot_operator( for name in pd_result.columns: pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( + 
bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -5716,7 +5734,7 @@ def test_df_dot_series_inline(): pd_result = pd_result.astype(pd.Int64Dtype()) pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -5732,7 +5750,7 @@ def test_df_dot_series( # Pandas result is object instead of Int64 (nullable) dtype. pd_result = pd_result.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -5748,7 +5766,7 @@ def test_df_dot_operator_series( # Pandas result is object instead of Int64 (nullable) dtype. pd_result = pd_result.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -5890,7 +5908,7 @@ def test_dataframe_explode(col_names, ignore_index, session): bf_materialized = bf_result.to_pandas() execs_post = metrics.execution_count - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_materialized, pd_result, check_index_type=False, @@ -5921,7 +5939,7 @@ def test_dataframe_explode_reserve_order(ignore_index, ordered): pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype( pd.Int64Dtype() ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( res if ordered else res.sort_index(), pd_res, check_index_type=False, @@ -5958,6 +5976,11 @@ def test_resample_with_column( ): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.2.0") + # TODO: supply a reason why this isn't compatible with pandas 1.x + if pandas.__version__.startswith("3"): + pytest.skip( + "pandas 3.0 behavior diverges for day offsets: https://github.com/pandas-dev/pandas/pull/61985" + ) bf_result = ( scalars_df_index.resample(rule=rule, on=on, origin=origin)[ ["int64_col", "int64_too"] @@ -5968,7 +5991,9 @@ def test_resample_with_column( pd_result 
= scalars_pandas_df_index.resample(rule=rule, on=on, origin=origin)[ ["int64_col", "int64_too"] ].max() - pd.testing.assert_frame_equal( + # TODO: (b/484364312) + pd_result.index.names = bf_result.index.names + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -6007,6 +6032,10 @@ def test_resample_with_index( label, ): # TODO: supply a reason why this isn't compatible with pandas 1.x + if rule == "100d" and pandas.__version__.startswith("3"): + pytest.skip( + "pandas 3.0 behavior diverges for day offsets: https://github.com/pandas-dev/pandas/pull/61985" + ) pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.set_index(index_col, append=index_append) scalars_pandas_df_index = scalars_pandas_df_index.set_index( @@ -6023,6 +6052,8 @@ def test_resample_with_index( .resample(rule=rule, level=level, closed=closed, origin=origin, label=label) .min() ) + # TODO: (b/484364312) + pd_result.index.names = bf_result.index.names assert_frame_equal(bf_result, pd_result) @@ -6076,7 +6107,9 @@ def test_resample_start_time(rule, origin, data): pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() - pd.testing.assert_frame_equal( + # TODO: (b/484364312) + pd_result.index.names = bf_result.index.names + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -6101,7 +6134,7 @@ def test_df_astype(scalars_dfs, dtype): bf_result = bf_df.astype(dtype).to_pandas() pd_result = pd_df.astype(dtype) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) def test_df_astype_python_types(scalars_dfs): @@ -6115,7 +6148,7 @@ def test_df_astype_python_types(scalars_dfs): {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()} ) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + 
bigframes.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) def test_astype_invalid_type_fail(scalars_dfs): @@ -6135,11 +6168,16 @@ def test_agg_with_dict_lists_strings(scalars_dfs): bf_result = bf_df.agg(agg_funcs).to_pandas() pd_result = pd_df.agg(agg_funcs) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) +@pytest.mark.skipif( + pandas.__version__.startswith("3"), + # See: https://github.com/python/cpython/issues/112282 + reason="pandas 3.0 miscalculates variance", +) def test_agg_with_dict_lists_callables(scalars_dfs): bf_df, pd_df = scalars_dfs agg_funcs = { @@ -6150,7 +6188,7 @@ def test_agg_with_dict_lists_callables(scalars_dfs): bf_result = bf_df.agg(agg_funcs).to_pandas() pd_result = pd_df.agg(agg_funcs) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -6165,7 +6203,7 @@ def test_agg_with_dict_list_and_str(scalars_dfs): bf_result = bf_df.agg(agg_funcs).to_pandas() pd_result = pd_df.agg(agg_funcs) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -6182,7 +6220,7 @@ def test_agg_with_dict_strs(scalars_dfs): pd_result = pd_df.agg(agg_funcs) pd_result.index = pd_result.index.astype("string[pyarrow]") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -6204,7 +6242,7 @@ def test_df_agg_with_builtins(scalars_dfs): .agg({"int64_col": [len, sum, min, max, list], "bool_col": [all, any, max]}) ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 02acb8d8f25..8813afd1a33 --- 
a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -12,34 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. +import typing from typing import Tuple import google.api_core.exceptions +from google.cloud import bigquery import numpy import numpy.testing import pandas as pd -import pandas.testing import pyarrow as pa import pytest -import bigframes.dtypes as dtypes -from bigframes.testing import utils - -try: - import pandas_gbq # type: ignore -except ImportError: # pragma: NO COVER - # TODO(b/332758806): Run system tests without "extras" - pandas_gbq = None - -import typing - -from google.cloud import bigquery - import bigframes import bigframes.dataframe +import bigframes.dtypes as dtypes import bigframes.enums import bigframes.features import bigframes.pandas as bpd +import bigframes.testing +from bigframes.testing import utils + +pandas_gbq = pytest.importorskip("pandas_gbq") def test_sql_executes(scalars_df_default_index, bigquery_client): @@ -69,7 +62,8 @@ def test_sql_executes(scalars_df_default_index, bigquery_client): .sort_values("rowindex") .reset_index(drop=True) ) - pandas.testing.assert_frame_equal(bf_result, bq_result, check_dtype=False) + bq_result["bytes_col"] = bq_result["bytes_col"].astype(dtypes.BYTES_DTYPE) + bigframes.testing.assert_frame_equal(bf_result, bq_result, check_dtype=False) def test_sql_executes_and_includes_named_index( @@ -100,7 +94,8 @@ def test_sql_executes_and_includes_named_index( .set_index("string_col") .sort_values("rowindex") ) - pandas.testing.assert_frame_equal( + bq_result["bytes_col"] = bq_result["bytes_col"].astype(dtypes.BYTES_DTYPE) + bigframes.testing.assert_frame_equal( bf_result, bq_result, check_dtype=False, check_index_type=False ) @@ -133,7 +128,8 @@ def test_sql_executes_and_includes_named_multiindex( .set_index(["string_col", "bool_col"]) .sort_values("rowindex") ) - pandas.testing.assert_frame_equal( + 
bq_result["bytes_col"] = bq_result["bytes_col"].astype(dtypes.BYTES_DTYPE) + bigframes.testing.assert_frame_equal( bf_result, bq_result, check_dtype=False, check_index_type=False ) @@ -363,8 +359,8 @@ def test_to_pandas_batches_w_empty_dataframe(session): { "idx1": [], "idx2": [], - "col1": pandas.Series([], dtype="string[pyarrow]"), - "col2": pandas.Series([], dtype="Int64"), + "col1": pd.Series([], dtype="string[pyarrow]"), + "col2": pd.Series([], dtype="Int64"), }, session=session, ).set_index(["idx1", "idx2"], drop=True) @@ -373,7 +369,7 @@ assert len(results) == 1 assert list(results[0].index.names) == ["idx1", "idx2"] assert list(results[0].columns) == ["col1", "col2"] - pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes) + bigframes.testing.assert_series_equal(results[0].dtypes, empty.dtypes) @@ -514,8 +510,8 @@ def test_to_csv_index( dtype = scalars_df.reset_index().dtypes.to_dict() dtype.pop("geography_col") dtype.pop("rowindex") - # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string + # read_csv will decode into bytes, numeric improperly, convert_pandas_dtypes will encode properly from string dtype.pop("bytes_col") + dtype.pop("numeric_col") gcs_df = pd.read_csv( utils.get_first_file_from_wildcard(path), dtype=dtype, @@ -552,8 +549,9 @@ def test_to_csv_tabs( dtype = scalars_df.reset_index().dtypes.to_dict() dtype.pop("geography_col") dtype.pop("rowindex") - # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string + # read_csv will decode into bytes, numeric improperly, convert_pandas_dtypes will encode properly from string dtype.pop("bytes_col") + dtype.pop("numeric_col") gcs_df = pd.read_csv( utils.get_first_file_from_wildcard(path), sep="\t", @@ -575,7 +573,7 @@ ("index"), [True, False], ) -@pytest.mark.skipif(pandas_gbq is None, reason="required by 
pd.read_gbq") +@pytest.mark.skipif(pandas_gbq is None, reason="required by pandas_gbq.read_gbq") def test_to_gbq_w_index(scalars_dfs, dataset_id, index): """Test the `to_gbq` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs @@ -588,7 +586,7 @@ def test_to_gbq_w_index(scalars_dfs, dataset_id, index): index_col = None df_in.to_gbq(destination_table, if_exists="replace", index=index) - df_out = pd.read_gbq(destination_table, index_col=index_col) + df_out = pandas_gbq.read_gbq(destination_table, index_col=index_col) if index: df_out = df_out.sort_index() @@ -596,7 +594,7 @@ def test_to_gbq_w_index(scalars_dfs, dataset_id, index): df_out = df_out.sort_values("rowindex_2").reset_index(drop=True) utils.convert_pandas_dtypes(df_out, bytes_col=False) - # pd.read_gbq interprets bytes_col as object, reconvert to pyarrow binary + # pandas_gbq.read_gbq interprets bytes_col as object, reconvert to pyarrow binary df_out["bytes_col"] = df_out["bytes_col"].astype(pd.ArrowDtype(pa.binary())) expected = scalars_pandas_df.copy() expected.index.name = index_col @@ -608,7 +606,7 @@ def test_to_gbq_if_exists_is_fail(scalars_dfs, dataset_id): destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_fails" scalars_df.to_gbq(destination_table) - gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + gcs_df = pandas_gbq.read_gbq(destination_table, index_col="rowindex") assert len(gcs_df) == len(scalars_pandas_df) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) @@ -625,20 +623,20 @@ def test_to_gbq_if_exists_is_replace(scalars_dfs, dataset_id): destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_replace" scalars_df.to_gbq(destination_table) - gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + gcs_df = pandas_gbq.read_gbq(destination_table, index_col="rowindex") assert len(gcs_df) == len(scalars_pandas_df) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) # When replacing a table with 
same schema scalars_df.to_gbq(destination_table, if_exists="replace") - gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + gcs_df = pandas_gbq.read_gbq(destination_table, index_col="rowindex") assert len(gcs_df) == len(scalars_pandas_df) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) # When replacing a table with different schema partitial_scalars_df = scalars_df.drop(columns=["string_col"]) partitial_scalars_df.to_gbq(destination_table, if_exists="replace") - gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + gcs_df = pandas_gbq.read_gbq(destination_table, index_col="rowindex") assert len(gcs_df) == len(partitial_scalars_df) pd.testing.assert_index_equal(gcs_df.columns, partitial_scalars_df.columns) @@ -648,20 +646,20 @@ def test_to_gbq_if_exists_is_append(scalars_dfs, dataset_id): destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_append" scalars_df.to_gbq(destination_table) - gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + gcs_df = pandas_gbq.read_gbq(destination_table, index_col="rowindex") assert len(gcs_df) == len(scalars_pandas_df) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) # When appending to a table with same schema scalars_df.to_gbq(destination_table, if_exists="append") - gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + gcs_df = pandas_gbq.read_gbq(destination_table, index_col="rowindex") assert len(gcs_df) == 2 * len(scalars_pandas_df) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) # When appending to a table with different schema partitial_scalars_df = scalars_df.drop(columns=["string_col"]) partitial_scalars_df.to_gbq(destination_table, if_exists="append") - gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + gcs_df = pandas_gbq.read_gbq(destination_table, index_col="rowindex") assert len(gcs_df) == 3 * len(partitial_scalars_df) pd.testing.assert_index_equal(gcs_df.columns, scalars_df.columns) 
@@ -846,6 +844,8 @@ def test_to_gbq_w_None_column_names( """Test the `to_gbq` API with None as a column name.""" destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names" + # pandas 3.0 str datatypes produces nan instead of None, so cast to object + # scalars_df_index.columns = scalars_df_index.columns.astype(object) scalars_df_index = scalars_df_index.rename(columns={"int64_too": None}) scalars_df_index.to_gbq(destination_table, if_exists="replace") diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 1d0e05f5ccf..b6c87091915 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -17,7 +17,7 @@ import pytest import bigframes.pandas as bpd -from bigframes.testing.utils import assert_frame_equal +import bigframes.testing # ================= # DataFrame.groupby @@ -51,14 +51,16 @@ def test_dataframe_groupby_numeric_aggregate( pd_result = operator(scalars_pandas_df_index[col_names].groupby("string_col")) bf_result_computed = bf_result.to_pandas() # Pandas std function produces float64, not matching Float64 from bigframes - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False + ) def test_dataframe_groupby_head(scalars_df_index, scalars_pandas_df_index): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = scalars_df_index[col_names].groupby("bool_col").head(2).to_pandas() pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").head(2) - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) def test_dataframe_groupby_len(scalars_df_index, scalars_pandas_df_index): @@ -99,7 +101,7 @@ def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q scalars_df_index[col_names].groupby("string_col").quantile(q) 
).to_pandas() pd_result = scalars_pandas_df_index[col_names].groupby("string_col").quantile(q) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) @@ -139,7 +141,7 @@ def test_dataframe_groupby_rank( .astype("float64") .astype("Float64") ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) @@ -167,7 +169,9 @@ def test_dataframe_groupby_aggregate( pd_result = operator(scalars_pandas_df_index[col_names].groupby("string_col")) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False + ) def test_dataframe_groupby_corr(scalars_df_index, scalars_pandas_df_index): @@ -175,7 +179,7 @@ def test_dataframe_groupby_corr(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index[col_names].groupby("bool_col").corr().to_pandas() pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").corr() - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) @@ -185,7 +189,7 @@ def test_dataframe_groupby_cov(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index[col_names].groupby("bool_col").cov().to_pandas() pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").cov() - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) @@ -205,7 +209,7 @@ def test_dataframe_groupby_agg_string( pd_result = scalars_pandas_df_index[col_names].groupby("string_col").agg("count") bf_result_computed = bf_result.to_pandas(ordered=ordered) - assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result_computed, check_dtype=False, ignore_order=not 
ordered ) @@ -215,7 +219,9 @@ def test_dataframe_groupby_agg_size_string(scalars_df_index, scalars_pandas_df_i bf_result = scalars_df_index[col_names].groupby("string_col").agg("size") pd_result = scalars_pandas_df_index[col_names].groupby("string_col").agg("size") - pd.testing.assert_series_equal(pd_result, bf_result.to_pandas(), check_dtype=False) + bigframes.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False + ) def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): @@ -233,7 +239,7 @@ def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): # some inconsistency between versions, so normalize to bigframes behavior pd_result = pd_result.rename({"amin": "min"}, axis="columns") bf_result_computed = bf_result_computed.rename({"amin": "min"}, axis="columns") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result_computed, check_dtype=False, check_index_type=False ) @@ -252,7 +258,9 @@ def test_dataframe_groupby_agg_list_w_column_multi_index( pd_result = pd_df.groupby(level=0).agg(["count", np.min, "size"]) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False + ) @pytest.mark.parametrize( @@ -282,7 +290,7 @@ def test_dataframe_groupby_agg_dict_with_list( ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result_computed, check_dtype=False, check_index_type=False ) @@ -301,7 +309,9 @@ def test_dataframe_groupby_agg_dict_no_lists(scalars_df_index, scalars_pandas_df ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False + ) def 
test_dataframe_groupby_agg_named(scalars_df_index, scalars_pandas_df_index): @@ -324,7 +334,9 @@ def test_dataframe_groupby_agg_named(scalars_df_index, scalars_pandas_df_index): ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False + ) def test_dataframe_groupby_agg_kw_tuples(scalars_df_index, scalars_pandas_df_index): @@ -344,7 +356,9 @@ def test_dataframe_groupby_agg_kw_tuples(scalars_df_index, scalars_pandas_df_ind ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False + ) @pytest.mark.parametrize( @@ -389,7 +403,7 @@ def test_dataframe_groupby_multi_sum( # BigQuery DataFrames default indices use nullable Int64 always pd_series.index = pd_series.index.astype("Int64") - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_series, bf_result, ) @@ -428,7 +442,9 @@ def test_dataframe_groupby_analytic( ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False + ) @pytest.mark.parametrize( @@ -449,7 +465,9 @@ def test_dataframe_groupby_cumcount( ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_series_equal( + pd_result, bf_result_computed, check_dtype=False + ) def test_dataframe_groupby_size_as_index_false( @@ -459,7 +477,7 @@ def test_dataframe_groupby_size_as_index_false( bf_result_computed = bf_result.to_pandas() pd_result = scalars_pandas_df_index.groupby("string_col", as_index=False).size() - pd.testing.assert_frame_equal( + 
bigframes.testing.assert_frame_equal( pd_result, bf_result_computed, check_dtype=False, check_index_type=False ) @@ -471,7 +489,9 @@ def test_dataframe_groupby_size_as_index_true( pd_result = scalars_pandas_df_index.groupby("string_col", as_index=True).size() bf_result_computed = bf_result.to_pandas() - pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_series_equal( + pd_result, bf_result_computed, check_dtype=False + ) def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index): @@ -479,21 +499,20 @@ def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index[col_names].groupby("bool_col").skew().to_pandas() pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").skew() - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) +@pytest.mark.skipif( + not pd.__version__.startswith("3"), + reason="groupby.kurt not supported on legacy pandas versions", +) def test_dataframe_groupby_kurt(scalars_df_index, scalars_pandas_df_index): col_names = ["float64_col", "int64_col", "bool_col"] bf_result = scalars_df_index[col_names].groupby("bool_col").kurt().to_pandas() # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 - pd_result = ( - scalars_pandas_df_index[col_names] - .groupby("bool_col") - .apply(pd.Series.kurt) - .drop("bool_col", axis=1) - ) + pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").kurt() - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -509,7 +528,7 @@ def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index, order pd_result = scalars_pandas_df_index[col_names].groupby("string_col").diff(-1) bf_result_computed = 
bf_result.to_pandas(ordered=ordered) - assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered ) @@ -526,7 +545,7 @@ def test_dataframe_groupby_getitem( scalars_pandas_df_index[col_names].groupby("string_col")["int64_col"].min() ) - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) def test_dataframe_groupby_getitem_error( @@ -557,7 +576,7 @@ def test_dataframe_groupby_getitem_list( scalars_pandas_df_index[col_names].groupby("string_col")[col_names].min() ) - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) def test_dataframe_groupby_getitem_list_error( @@ -590,11 +609,15 @@ def test_dataframe_groupby_nonnumeric_with_mean(): bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, check_index_type=False, check_dtype=False ) +@pytest.mark.skipif( + pd.__version__.startswith("3"), + reason="value_counts behavior change b/485962498", +) @pytest.mark.parametrize( ("subset", "normalize", "ascending", "dropna", "as_index"), [ @@ -631,10 +654,10 @@ def test_dataframe_groupby_value_counts( ) if as_index: - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) else: pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -660,7 +683,7 @@ def test_dataframe_groupby_first( .groupby(scalars_pandas_df_index.int64_col % 2) .first(numeric_only=numeric_only, min_count=min_count) ) - pd.testing.assert_frame_equal( + 
bigframes.testing.assert_frame_equal( pd_result, bf_result, ) @@ -684,7 +707,7 @@ def test_dataframe_groupby_last( pd_result = scalars_pandas_df_index.groupby( scalars_pandas_df_index.int64_col % 2 ).last(numeric_only=numeric_only, min_count=min_count) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result, ) @@ -720,7 +743,7 @@ def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index, ag ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result_computed, check_dtype=False, check_names=False ) @@ -738,7 +761,7 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result, bf_result_computed, check_dtype=False, check_names=False ) @@ -793,7 +816,7 @@ def test_series_groupby_rank( .astype("float64") .astype("Float64") ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) @@ -808,7 +831,7 @@ def test_series_groupby_head(scalars_df_index, scalars_pandas_df_index, dropna): pd_result = scalars_pandas_df_index.groupby("bool_col", dropna=dropna)[ "int64_too" ].head(1) - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): @@ -823,7 +846,7 @@ def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): pd.Series.kurt ) - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) def test_series_groupby_size(scalars_df_index, scalars_pandas_df_index): @@ -837,7 +860,9 @@ def test_series_groupby_size(scalars_df_index, 
scalars_pandas_df_index): ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) + bigframes.testing.assert_series_equal( + pd_result, bf_result_computed, check_dtype=False + ) def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): @@ -853,7 +878,7 @@ def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): .skew() ) - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -868,11 +893,15 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): scalars_df_index.groupby("string_col")["int64_col"].quantile(q) ).to_pandas() pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].quantile(q) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) +@pytest.mark.skipif( + pd.__version__.startswith("3"), + reason="Pandas 3 change value_counts behavior", +) @pytest.mark.parametrize( ("normalize", "ascending", "dropna"), [ @@ -905,7 +934,7 @@ def test_series_groupby_value_counts( pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts( normalize=normalize, ascending=ascending, dropna=dropna ) - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -926,7 +955,7 @@ def test_series_groupby_first( pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].first( numeric_only=numeric_only, min_count=min_count ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -950,4 +979,4 @@ def test_series_groupby_last( pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].last( numeric_only=numeric_only, 
min_count=min_count ) - pd.testing.assert_series_equal(pd_result, bf_result) + bigframes.testing.assert_series_equal(pd_result, bf_result) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index a28e02a54fa..ed901f9562e 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -17,7 +17,7 @@ import pytest import bigframes.pandas as bpd -from bigframes.testing.utils import assert_frame_equal +import bigframes.testing # Sample MultiIndex for testing DataFrames where() method. _MULTI_INDEX = pandas.MultiIndex.from_tuples( @@ -58,7 +58,7 @@ def test_multi_index_from_arrays(): names=[" 1index 1", "_1index 2"], ) assert bf_idx.names == pd_idx.names - pandas.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + bigframes.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) def test_read_pandas_multi_index_axes(): @@ -90,7 +90,7 @@ def test_read_pandas_multi_index_axes(): bf_df = bpd.DataFrame(pandas_df) bf_df_computed = bf_df.to_pandas() - pandas.testing.assert_frame_equal(bf_df_computed, pandas_df) + bigframes.testing.assert_frame_equal(bf_df_computed, pandas_df) # Row Multi-index tests @@ -98,7 +98,7 @@ def test_set_multi_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.set_index(["bool_col", "int64_too"]).to_pandas() pd_result = scalars_pandas_df_index.set_index(["bool_col", "int64_too"]) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -127,7 +127,7 @@ def test_df_reset_multi_index(scalars_df_index, scalars_pandas_df_index, level, if pd_result.index.dtype != bf_result.index.dtype: pd_result.index = pd_result.index.astype(bf_result.index.dtype) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -160,9 +160,9 @@ def test_series_reset_multi_index( pd_result.index = 
pd_result.index.astype(pandas.Int64Dtype()) if drop: - pandas.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) else: - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_series_multi_index_idxmin(scalars_df_index, scalars_pandas_df_index): @@ -187,7 +187,7 @@ def test_binop_series_series_matching_multi_indices( bf_result = bf_left["int64_col"] + bf_right["int64_too"] pd_result = pd_left["int64_col"] + pd_right["int64_too"] - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.sort_index().to_pandas(), pd_result.sort_index() ) @@ -203,7 +203,7 @@ def test_binop_df_series_matching_multi_indices( bf_result = bf_left[["int64_col", "int64_too"]].add(bf_right["int64_too"], axis=0) pd_result = pd_left[["int64_col", "int64_too"]].add(pd_right["int64_too"], axis=0) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.sort_index().to_pandas(), pd_result.sort_index() ) @@ -217,7 +217,7 @@ def test_binop_multi_index_mono_index(scalars_df_index, scalars_pandas_df_index) bf_result = bf_left["int64_col"] + bf_right["int64_too"] pd_result = pd_left["int64_col"] + pd_right["int64_too"] - pandas.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_binop_overlapping_multi_indices(scalars_df_index, scalars_pandas_df_index): @@ -229,7 +229,7 @@ def test_binop_overlapping_multi_indices(scalars_df_index, scalars_pandas_df_ind bf_result = bf_left["int64_col"] + bf_right["int64_too"] pd_result = pd_left["int64_col"] + pd_right["int64_too"] - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.sort_index().to_pandas(), pd_result.sort_index() ) @@ -245,7 +245,7 @@ def test_concat_compatible_multi_indices(scalars_df_index, scalars_pandas_df_ind bf_result = 
bpd.concat([bf_left, bf_right]) pd_result = pandas.concat([pd_left, pd_right]) - pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) def test_concat_multi_indices_ignore_index(scalars_df_index, scalars_pandas_df_index): @@ -260,7 +260,7 @@ def test_concat_multi_indices_ignore_index(scalars_df_index, scalars_pandas_df_i # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pandas.Int64Dtype()) - pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -277,7 +277,7 @@ def test_multi_index_loc_multi_row(scalars_df_index, scalars_pandas_df_index, ke ) pd_result = scalars_pandas_df_index.set_index(["int64_too", "string_col"]).loc[key] - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_multi_index_loc_single_row(scalars_df_index, scalars_pandas_df_index): @@ -288,7 +288,7 @@ def test_multi_index_loc_single_row(scalars_df_index, scalars_pandas_df_index): (2, "capitalize, This ") ] - pandas.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_multi_index_getitem_bool(scalars_df_index, scalars_pandas_df_index): @@ -298,7 +298,7 @@ def test_multi_index_getitem_bool(scalars_df_index, scalars_pandas_df_index): bf_result = bf_frame[bf_frame["int64_col"] > 0].to_pandas() pd_result = pd_frame[pd_frame["int64_col"] > 0] - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -318,7 +318,7 @@ def test_df_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, lev bf_result = bf_frame.droplevel(level).to_pandas() pd_result = pd_frame.droplevel(level) - pandas.testing.assert_frame_equal(bf_result, 
pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -338,7 +338,7 @@ def test_series_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, bf_result = bf_frame["string_col"].droplevel(level).to_pandas() pd_result = pd_frame["string_col"].droplevel(level) - pandas.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -357,7 +357,7 @@ def test_multi_index_drop(scalars_df_index, scalars_pandas_df_index, labels, lev bf_result = bf_frame.drop(labels=labels, axis="index", level=level).to_pandas() pd_result = pd_frame.drop(labels=labels, axis="index", level=level) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -382,7 +382,7 @@ def test_df_multi_index_reorder_levels( bf_result = bf_frame.reorder_levels(order).to_pandas() pd_result = pd_frame.reorder_levels(order) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -407,7 +407,7 @@ def test_series_multi_index_reorder_levels( bf_result = bf_frame["string_col"].reorder_levels(order).to_pandas() pd_result = pd_frame["string_col"].reorder_levels(order) - pandas.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_df_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index): @@ -417,7 +417,7 @@ def test_df_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index): bf_result = bf_frame.swaplevel().to_pandas() pd_result = pd_frame.swaplevel() - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_series_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index): @@ -427,7 +427,7 @@ def 
test_series_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index) bf_result = bf_frame["string_col"].swaplevel(0, 2).to_pandas() pd_result = pd_frame["string_col"].swaplevel(0, 2) - pandas.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_multi_index_series_groupby(scalars_df_index, scalars_pandas_df_index): @@ -443,7 +443,7 @@ def test_multi_index_series_groupby(scalars_df_index, scalars_pandas_df_index): pd_frame["float64_col"].groupby([pd_frame.int64_col % 2, "bool_col"]).mean() ) - pandas.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -470,7 +470,7 @@ def test_multi_index_series_groupby_level( .mean() ) - pandas.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_multi_index_dataframe_groupby(scalars_df_index, scalars_pandas_df_index): @@ -485,7 +485,7 @@ def test_multi_index_dataframe_groupby(scalars_df_index, scalars_pandas_df_index numeric_only=True ) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -514,13 +514,14 @@ def test_multi_index_dataframe_groupby_level_aggregate( ) # For as_index=False, pandas will drop index levels used as groupings # In the future, it will include this in the result, bigframes already does this behavior - if not as_index: - for col in index_cols: - if col in bf_result.columns: - bf_result = bf_result.drop(col, axis=1) + if not pandas.__version__.startswith("3"): + if not as_index: + for col in index_cols: + if col in bf_result.columns: + bf_result = bf_result.drop(col, axis=1) # Pandas will have int64 index, while bigquery will have Int64 when resetting - pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_frame_equal(bf_result, 
pd_result, check_index_type=False) @pytest.mark.parametrize( @@ -553,7 +554,7 @@ def test_multi_index_dataframe_groupby_level_analytic( .cumsum(numeric_only=True) ) - pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) all_joins = pytest.mark.parametrize( @@ -583,7 +584,7 @@ def test_multi_index_dataframe_join(scalars_dfs, how): (["bool_col", "rowindex_2"]) )[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_frame_equal(bf_result, pd_result, ignore_order=True) + bigframes.testing.assert_frame_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -604,7 +605,7 @@ def test_multi_index_dataframe_join_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_frame_equal(bf_result, pd_result, ignore_order=True) + bigframes.testing.assert_frame_equal(bf_result, pd_result, ignore_order=True) def test_multi_index_dataframe_where_series_cond_none_other( @@ -632,7 +633,7 @@ def test_multi_index_dataframe_where_series_cond_none_other( bf_result = dataframe_bf.where(series_cond_bf).to_pandas() pd_result = dataframe_pd.where(series_cond_pd) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, @@ -668,7 +669,7 @@ def test_multi_index_dataframe_where_series_cond_dataframe_other( bf_result = dataframe_bf.where(series_cond_bf, dataframe_other_bf).to_pandas() pd_result = dataframe_pd.where(series_cond_pd, dataframe_other_pd) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, @@ -700,7 +701,7 @@ def test_multi_index_dataframe_where_dataframe_cond_constant_other( bf_result = dataframe_bf.where(dataframe_cond_bf, other).to_pandas() pd_result = dataframe_pd.where(dataframe_cond_pd, other) - 
pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, @@ -733,7 +734,7 @@ def test_multi_index_dataframe_where_dataframe_cond_dataframe_other( bf_result = dataframe_bf.where(dataframe_cond_bf, dataframe_other_bf).to_pandas() pd_result = dataframe_pd.where(dataframe_cond_pd, dataframe_other_pd) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, @@ -765,7 +766,7 @@ def test_multi_index_series_groupby_level_aggregate( .mean() ) - pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) @pytest.mark.parametrize( @@ -792,7 +793,7 @@ def test_multi_index_series_groupby_level_analytic( .cumsum() ) - pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) def test_multi_index_series_rename_dict_same_type( @@ -807,7 +808,7 @@ def test_multi_index_series_rename_dict_same_type( "string_col" ].rename({1: 100, 2: 200}) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -825,7 +826,7 @@ def test_multi_index_df_reindex(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.set_index(["rowindex_2", "string_col"]).reindex( index=new_index ) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -843,15 +844,15 @@ def test_column_multi_index_getitem(scalars_df_index, scalars_pandas_df_index): bf_a = bf_df["a"].to_pandas() pd_a = pd_df["a"] - pandas.testing.assert_frame_equal(bf_a, pd_a) + bigframes.testing.assert_frame_equal(bf_a, pd_a) bf_b = bf_df["b"].to_pandas() pd_b = pd_df["b"] - pandas.testing.assert_frame_equal(bf_b, pd_b) + 
bigframes.testing.assert_frame_equal(bf_b, pd_b) bf_fullkey = bf_df[("a", "int64_too")].to_pandas() pd_fullkey = pd_df[("a", "int64_too")] - pandas.testing.assert_series_equal(bf_fullkey, pd_fullkey) + bigframes.testing.assert_series_equal(bf_fullkey, pd_fullkey) def test_column_multi_index_concat(scalars_df_index, scalars_pandas_df_index): @@ -876,7 +877,7 @@ def test_column_multi_index_concat(scalars_df_index, scalars_pandas_df_index): bf_result = bpd.concat([bf_df1, bf_df2, bf_df1]).to_pandas() pd_result = pandas.concat([pd_df1, pd_df2, pd_df1]) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_column_multi_index_drop(scalars_df_index, scalars_pandas_df_index): @@ -889,7 +890,7 @@ def test_column_multi_index_drop(scalars_df_index, scalars_pandas_df_index): bf_a = bf_df.drop(("a", "int64_too"), axis=1).to_pandas() pd_a = pd_df.drop(("a", "int64_too"), axis=1) - pandas.testing.assert_frame_equal(bf_a, pd_a) + bigframes.testing.assert_frame_equal(bf_a, pd_a) @pytest.mark.parametrize( @@ -913,7 +914,7 @@ def test_column_multi_index_assign(scalars_df_index, scalars_pandas_df_index, ke pd_result = pd_df.assign(**kwargs) # Pandas assign results in non-nullable dtype - pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_column_multi_index_rename(scalars_df_index, scalars_pandas_df_index): @@ -927,7 +928,7 @@ def test_column_multi_index_rename(scalars_df_index, scalars_pandas_df_index): bf_result = bf_df.rename(columns={"b": "c"}).to_pandas() pd_result = pd_df.rename(columns={"b": "c"}) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -957,7 +958,7 @@ def test_column_multi_index_reset_index( # Pandas uses int64 instead of Int64 (nullable) dtype. 
pd_result.index = pd_result.index.astype(pandas.Int64Dtype()) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index): @@ -971,7 +972,7 @@ def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index) bf_result = (bf_df[("a", "a")] + 3).to_pandas() pd_result = pd_df[("a", "a")] + 3 - pandas.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_column_multi_index_any(): @@ -988,7 +989,7 @@ def test_column_multi_index_any(): pd_result = pd_df.isna().any() bf_result = bf_df.isna().any().to_pandas() - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result.reset_index(drop=False), pd_result.reset_index(drop=False), check_dtype=False, @@ -1008,7 +1009,7 @@ def test_column_multi_index_agg(scalars_df_index, scalars_pandas_df_index): # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) def test_column_multi_index_prefix_suffix(scalars_df_index, scalars_pandas_df_index): @@ -1022,7 +1023,7 @@ def test_column_multi_index_prefix_suffix(scalars_df_index, scalars_pandas_df_in bf_result = bf_df.add_prefix("prefixed_").add_suffix("_suffixed").to_pandas() pd_result = pd_df.add_prefix("prefixed_").add_suffix("_suffixed") - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_column_multi_index_cumsum(scalars_df_index, scalars_pandas_df_index): @@ -1038,7 +1039,7 @@ def test_column_multi_index_cumsum(scalars_df_index, scalars_pandas_df_index): bf_result = bf_df.cumsum().to_pandas() pd_result = pd_df.cumsum() - 
pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @pytest.mark.parametrize( @@ -1071,7 +1072,7 @@ def test_column_multi_index_stack(level): # Pandas produces NaN, where bq dataframes produces pd.NA # Column ordering seems to depend on pandas version assert isinstance(pd_result, pandas.DataFrame) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -1099,7 +1100,7 @@ def test_column_multi_index_melt(): pd_result = pd_df.melt() # BigFrames uses different string and int types, but values are identical - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_index_type=False, check_dtype=False ) @@ -1121,7 +1122,7 @@ def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): # Pandas produces NaN, where bq dataframes produces pd.NA # Column ordering seems to depend on pandas version - pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index): @@ -1142,7 +1143,7 @@ def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses strign, Pandas uses object. - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -1165,7 +1166,7 @@ def test_cov_w_multi_index(scalars_df_index, scalars_pandas_df_index): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses string, Pandas uses object. 
- pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -1244,7 +1245,7 @@ def test_column_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index) bf_result = bf_df.droplevel(1, axis=1).to_pandas() pd_result = pd_df.droplevel(1, axis=1) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_df_column_multi_index_reindex(scalars_df_index, scalars_pandas_df_index): @@ -1266,7 +1267,7 @@ def test_df_column_multi_index_reindex(scalars_df_index, scalars_pandas_df_index # Pandas uses float64 as default for newly created empty column, bf uses Float64 pd_result[("z", "a")] = pd_result[("z", "a")].astype(pandas.Float64Dtype()) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, ) @@ -1285,7 +1286,7 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i bf_result = bf_df.reorder_levels([-2, -1, 0], axis=1).to_pandas() pd_result = pd_df.reorder_levels([-2, -1, 0], axis=1) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1302,7 +1303,7 @@ def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level): ["team_name", "position"], append=True ).unstack(level=level) - pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @pytest.mark.parametrize( @@ -1319,7 +1320,7 @@ def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level): "number" ].unstack(level=level) - pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_column_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index): @@ -1335,7 +1336,7 @@ def 
test_column_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index) bf_result = bf_df.swaplevel(-3, -1, axis=1).to_pandas() pd_result = pd_df.swaplevel(-3, -1, axis=1) - pandas.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_df_multi_index_dot_not_supported(): @@ -1409,7 +1410,7 @@ def test_explode_w_column_multi_index(): assert isinstance(pd_df, pandas.DataFrame) assert isinstance(pd_df["col0"], pandas.DataFrame) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( df["col0"].explode("col00").to_pandas(), pd_df["col0"].explode("col00"), check_dtype=False, @@ -1427,7 +1428,7 @@ def test_explode_w_multi_index(): df = bpd.DataFrame(data, index=multi_index, columns=columns) pd_df = df.to_pandas() - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( df.explode("col00").to_pandas(), pd_df.explode("col00"), check_dtype=False, @@ -1451,7 +1452,7 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index # Pandas produces pd.NA, where bq dataframes produces NaN pd_result["c"] = pd_result["c"].replace(pandas.NA, np.nan) - pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @pytest.mark.parametrize( @@ -1482,6 +1483,6 @@ def test_multiindex_eq_const(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.set_index(col_name).index == (2, False) pd_result = scalars_pandas_df_index.set_index(col_name).index == (2, False) - pandas.testing.assert_index_equal( + bigframes.testing.assert_index_equal( pandas.Index(pd_result, dtype="boolean"), bf_result.to_pandas() ) diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py index 490f9271142..d04fb81a0a2 100644 --- a/tests/system/small/test_numpy.py +++ b/tests/system/small/test_numpy.py @@ -16,6 +16,8 @@ import pandas as pd import pytest 
+import bigframes.testing + @pytest.mark.parametrize( ("opname",), @@ -45,7 +47,7 @@ def test_series_ufuncs(floats_pd, floats_bf, opname): bf_result = getattr(np, opname)(floats_bf).to_pandas() pd_result = getattr(np, opname)(floats_pd) - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result, nulls_are_nan=True) @pytest.mark.parametrize( @@ -79,7 +81,7 @@ def test_df_ufuncs(scalars_dfs, opname): ): pd_result["int64_col"] = pd_result["int64_col"].astype(pd.Float64Dtype()) - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result, nulls_are_nan=True) @pytest.mark.parametrize( @@ -99,7 +101,7 @@ def test_df_binary_ufuncs(scalars_dfs, opname): bf_result = op(scalars_df[["float64_col", "int64_col"]], 5.1).to_pandas() pd_result = op(scalars_pandas_df[["float64_col", "int64_col"]], 5.1) - pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result, nulls_are_nan=True) # Operations tested here don't work on full dataframe in numpy+pandas @@ -131,7 +133,7 @@ def test_series_binary_ufuncs(scalars_dfs, x, y, opname): bf_result = op(scalars_df[x], scalars_df[y]).to_pandas() pd_result = op(scalars_pandas_df[x], scalars_pandas_df[y]) - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result, nulls_are_nan=True) def test_series_binary_ufuncs_reverse(scalars_dfs): @@ -141,7 +143,7 @@ def test_series_binary_ufuncs_reverse(scalars_dfs): bf_result = np.subtract(5.1, scalars_df["int64_col"]).to_pandas() pd_result = np.subtract(5.1, scalars_pandas_df["int64_col"]) - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result, nulls_are_nan=True) def test_df_binary_ufuncs_reverse(scalars_dfs): @@ -154,4 +156,4 @@ def test_df_binary_ufuncs_reverse(scalars_dfs): scalars_pandas_df[["float64_col", "int64_col"]], ) - 
pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result, nulls_are_nan=True) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a1c0dc9851f..ccef51b1e93 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -21,7 +21,8 @@ import pytz import bigframes.pandas as bpd -from bigframes.testing.utils import assert_frame_equal +import bigframes.testing +from bigframes.testing.utils import assert_frame_equal, assert_series_equal @pytest.mark.parametrize( @@ -64,7 +65,7 @@ def test_concat_series(scalars_dfs): ] ) - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -115,17 +116,25 @@ def test_get_dummies_dataframe_duplicate_labels(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs scalars_renamed_df = scalars_df.rename( - columns={"int64_too": "int64_col", "float64_col": None, "string_col": None} + columns={ + "int64_too": "int64_col", + "float64_col": "dup_col", + "string_col": "dup_col", + } ) scalars_renamed_pandas_df = scalars_pandas_df.rename( - columns={"int64_too": "int64_col", "float64_col": None, "string_col": None} + columns={ + "int64_too": "int64_col", + "float64_col": "dup_col", + "string_col": "dup_col", + } ) bf_result = bpd.get_dummies( - scalars_renamed_df, columns=["int64_col", None], dtype=bool + scalars_renamed_df, columns=["int64_col", "dup_col"], dtype=bool ) pd_result = pd.get_dummies( - scalars_renamed_pandas_df, columns=["int64_col", None], dtype=bool + scalars_renamed_pandas_df, columns=["int64_col", "dup_col"], dtype=bool ) # dtype argument above is needed for pandas v1 only @@ -532,7 +541,9 @@ def _convert_pandas_category(pd_s: pd.Series): f"Input must be a pandas Series with categorical data: {pd_s.dtype}" ) - if pd.api.types.is_object_dtype(pd_s.cat.categories.dtype): + if pd.api.types.is_object_dtype( + 
pd_s.cat.categories.dtype + ) or pd.api.types.is_string_dtype(pd_s.cat.categories.dtype): return pd_s.astype(pd.StringDtype(storage="pyarrow")) if not isinstance(pd_s.cat.categories.dtype, pd.IntervalDtype): @@ -548,9 +559,9 @@ def _convert_pandas_category(pd_s: pd.Series): right_key = "right_inclusive" subtype = pd_s.cat.categories.dtype.subtype # type: ignore - if pd.api.types.is_float_dtype(subtype): + if pd.api.types.is_float_dtype(subtype): # type: ignore interval_dtype = pa.float64() - elif pd.api.types.is_integer_dtype(subtype): + elif pd.api.types.is_integer_dtype(subtype): # type: ignore interval_dtype = pa.int64() else: raise ValueError(f"Unknown category type: {subtype}") @@ -591,7 +602,7 @@ def test_cut_for_array(): bf_result = bpd.cut(sc, x) pd_result = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -610,7 +621,7 @@ def test_cut_by_int_bins(scalars_dfs, labels, right): bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels, right=right) pd_result = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_cut_by_int_bins_w_labels(scalars_dfs): @@ -621,7 +632,7 @@ def test_cut_by_int_bins_w_labels(scalars_dfs): bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels) pd_result = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -664,7 +675,7 @@ def test_cut_by_numeric_breaks(scalars_dfs, breaks, right, labels): ).to_pandas() pd_result_converted = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal(bf_result, pd_result_converted) + bigframes.testing.assert_series_equal(bf_result, 
pd_result_converted) def test_cut_by_numeric_breaks_w_labels(scalars_dfs): @@ -676,7 +687,7 @@ def test_cut_by_numeric_breaks_w_labels(scalars_dfs): bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels) pd_result = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -716,7 +727,7 @@ def test_cut_by_interval_bins(scalars_dfs, bins, right, labels): pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels, right=right) pd_result_converted = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal(bf_result, pd_result_converted) + bigframes.testing.assert_series_equal(bf_result, pd_result_converted) def test_cut_by_interval_bins_w_labels(scalars_dfs): @@ -728,7 +739,7 @@ def test_cut_by_interval_bins_w_labels(scalars_dfs): bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels) pd_result = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -745,7 +756,7 @@ def test_cut_by_edge_cases_bins(scalars_dfs, bins, labels): pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels) pd_result_converted = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal(bf_result, pd_result_converted) + bigframes.testing.assert_series_equal(bf_result, pd_result_converted) def test_cut_empty_array_raises_error(): @@ -774,7 +785,7 @@ def test_qcut(scalars_dfs, q): bf_result = bpd.qcut(scalars_df["float64_col"], q, labels=False, duplicates="drop") pd_result = pd_result.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -816,10 +827,12 @@ def test_to_datetime_iterable(arg, utc, 
unit, format): .to_pandas() .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") ) - pd_result = pd.Series( - pd.to_datetime(arg, utc=utc, unit=unit, format=format) - ).dt.floor("us") - pd.testing.assert_series_equal( + pd_result = ( + pd.Series(pd.to_datetime(arg, utc=utc, unit=unit, format=format)) + .dt.floor("us") + .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") + ) + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) @@ -830,8 +843,10 @@ def test_to_datetime_series(scalars_dfs): bf_result = ( bpd.to_datetime(scalars_df[col], unit="s").to_pandas().astype("datetime64[s]") ) - pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col], unit="s")) - pd.testing.assert_series_equal( + pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col], unit="s")).astype( + "datetime64[s]" + ) + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) @@ -852,8 +867,12 @@ def test_to_datetime_series(scalars_dfs): ) def test_to_datetime_unit_param(arg, unit): bf_result = bpd.to_datetime(arg, unit=unit).to_pandas().astype("datetime64[ns]") - pd_result = pd.Series(pd.to_datetime(arg, unit=unit)).dt.floor("us") - pd.testing.assert_series_equal( + pd_result = ( + pd.Series(pd.to_datetime(arg, unit=unit)) + .dt.floor("us") + .astype("datetime64[ns]") + ) + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) @@ -873,8 +892,12 @@ def test_to_datetime_format_param(arg, utc, format): .to_pandas() .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") ) - pd_result = pd.Series(pd.to_datetime(arg, utc=utc, format=format)).dt.floor("us") - pd.testing.assert_series_equal( + pd_result = ( + pd.Series(pd.to_datetime(arg, utc=utc, format=format)) + .dt.floor("us") + .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") + ) + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, 
check_names=False ) @@ -921,13 +944,18 @@ def test_to_datetime_format_param(arg, utc, format): ], ) def test_to_datetime_string_inputs(arg, utc, output_in_utc, format): + normalized_type = "datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]" + bf_result = ( - bpd.to_datetime(arg, utc=utc, format=format) - .to_pandas() - .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]") + bpd.to_datetime(arg, utc=utc, format=format).to_pandas().astype(normalized_type) ) - pd_result = pd.Series(pd.to_datetime(arg, utc=utc, format=format)).dt.floor("us") - pd.testing.assert_series_equal( + pd_result = ( + pd.Series(pd.to_datetime(arg, utc=utc, format=format)) + .dt.floor("us") + .astype(normalized_type) + ) + + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) @@ -964,13 +992,14 @@ def test_to_datetime_string_inputs(arg, utc, output_in_utc, format): ], ) def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc): - bf_result = ( - bpd.to_datetime(arg, utc=utc) - .to_pandas() - .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]") + normalized_type = "datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]" + + bf_result = bpd.to_datetime(arg, utc=utc).to_pandas().astype(normalized_type) + pd_result = ( + pd.Series(pd.to_datetime(arg, utc=utc)).dt.floor("us").astype(normalized_type) ) - pd_result = pd.Series(pd.to_datetime(arg, utc=utc)).dt.floor("us") - pd.testing.assert_series_equal( + + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) @@ -1019,10 +1048,8 @@ def test_to_timedelta_with_bf_integer_series(session, unit): .astype("timedelta64[ns]") ) - expected_result = pd.to_timedelta(pd_series, unit) - pd.testing.assert_series_equal( - actual_result, expected_result, check_index_type=False - ) + expected_result = pd.to_timedelta(pd_series, unit).astype("timedelta64[ns]") + assert_series_equal(actual_result, expected_result, 
check_index_type=False) def test_to_timedelta_with_bf_float_series_value_rounded_down(session): @@ -1034,8 +1061,10 @@ def test_to_timedelta_with_bf_float_series_value_rounded_down(session): .astype("timedelta64[ns]") ) - expected_result = pd.Series([pd.Timedelta(1, "us"), pd.Timedelta(2, "us")]) - pd.testing.assert_series_equal( + expected_result = pd.Series([pd.Timedelta(1, "us"), pd.Timedelta(2, "us")]).astype( + "timedelta64[ns]" + ) + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -1055,8 +1084,8 @@ def test_to_timedelta_with_list_like_input(session, input): .astype("timedelta64[ns]") ) - expected_result = pd.Series(pd.to_timedelta(input, "s")) - pd.testing.assert_series_equal( + expected_result = pd.Series(pd.to_timedelta(input, "s")).astype("timedelta64[ns]") + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -1086,7 +1115,7 @@ def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]") ) - expected_result = pd.to_timedelta(pd_series, unit="s") - pd.testing.assert_series_equal( + expected_result = pd.to_timedelta(pd_series, unit="s").astype("timedelta64[ns]") + bigframes.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index f5408dc323d..7019bcff109 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -32,6 +32,7 @@ import bigframes.features import bigframes.pandas import bigframes.series as series +import bigframes.testing from bigframes.testing.utils import ( assert_frame_equal, assert_series_equal, @@ -47,7 +48,7 @@ def test_series_construct_copy(scalars_dfs): pd_result = pd.Series( scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" ) - pd.testing.assert_series_equal(bf_result, pd_result) + 
bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_nullable_ints(): @@ -62,7 +63,7 @@ def test_series_construct_nullable_ints(): ) expected = pd.Series([1, 3, pd.NA], dtype=pd.Int64Dtype(), index=expected_index) - pd.testing.assert_series_equal(bf_result, expected) + bigframes.testing.assert_series_equal(bf_result, expected) def test_series_construct_timestamps(): @@ -74,7 +75,7 @@ def test_series_construct_timestamps(): bf_result = series.Series(datetimes).to_pandas() pd_result = pd.Series(datetimes, dtype=pd.ArrowDtype(pa.timestamp("us"))) - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) def test_series_construct_copy_with_index(scalars_dfs): @@ -91,7 +92,7 @@ def test_series_construct_copy_with_index(scalars_dfs): dtype="Float64", index=scalars_pandas_df["int64_too"], ) - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_copy_index(scalars_dfs): @@ -108,7 +109,7 @@ def test_series_construct_copy_index(scalars_dfs): dtype="Float64", index=scalars_pandas_df["int64_too"], ) - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_pandas(scalars_dfs): @@ -120,7 +121,7 @@ def test_series_construct_pandas(scalars_dfs): scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" ) assert bf_result.shape == pd_result.shape - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_series_construct_from_list(): @@ -130,7 +131,7 @@ def test_series_construct_from_list(): # BigQuery DataFrame default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) + 
bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_reindex(): @@ -141,7 +142,7 @@ def test_series_construct_reindex(): # BigQuery DataFrame default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_from_list_w_index(): @@ -155,7 +156,7 @@ def test_series_construct_from_list_w_index(): # BigQuery DataFrame default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_empty(session: bigframes.Session): @@ -176,7 +177,7 @@ def test_series_construct_scalar_no_index(): # BigQuery DataFrame default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_scalar_w_index(): @@ -188,7 +189,7 @@ def test_series_construct_scalar_w_index(): # BigQuery DataFrame default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_nan(): @@ -198,7 +199,7 @@ def test_series_construct_nan(): pd_result.index = pd_result.index.astype("Int64") pd_result = pd_result.astype("Float64") - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_construct_scalar_w_bf_index(): @@ -209,7 +210,7 @@ def test_series_construct_scalar_w_bf_index(): pd_result = pd_result.astype("string[pyarrow]") - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def 
test_series_construct_from_list_escaped_strings(): @@ -225,7 +226,7 @@ def test_series_construct_from_list_escaped_strings(): # BigQuery DataFrame default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_series_construct_geodata(): @@ -240,7 +241,7 @@ def test_series_construct_geodata(): series = bigframes.pandas.Series(pd_series) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_series, series.to_pandas(), check_index_type=False ) @@ -258,7 +259,7 @@ def test_series_construct_w_dtype(dtype): expected = pd.Series(data, dtype=dtype) expected.index = expected.index.astype("Int64") series = bigframes.pandas.Series(data, dtype=dtype) - pd.testing.assert_series_equal(series.to_pandas(), expected) + bigframes.testing.assert_series_equal(series.to_pandas(), expected) def test_series_construct_w_dtype_for_struct(): @@ -275,7 +276,7 @@ def test_series_construct_w_dtype_for_struct(): series = bigframes.pandas.Series(data, dtype=dtype) expected = pd.Series(data, dtype=dtype) expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(series.to_pandas(), expected) + bigframes.testing.assert_series_equal(series.to_pandas(), expected) def test_series_construct_w_dtype_for_array_string(): @@ -293,7 +294,7 @@ def test_series_construct_w_dtype_for_array_string(): else: check_dtype = False - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( series.to_pandas(), expected, check_dtype=check_dtype ) @@ -313,7 +314,7 @@ def test_series_construct_w_dtype_for_array_struct(): else: check_dtype = False - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( series.to_pandas(), expected, check_dtype=check_dtype ) @@ -323,7 +324,7 @@ def test_series_construct_local_unordered_has_sequential_index(unordered_session ["Sun", 
"Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session ) expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) - pd.testing.assert_index_equal(series.index.to_pandas(), expected) + bigframes.testing.assert_index_equal(series.index.to_pandas(), expected) @pytest.mark.parametrize( @@ -385,14 +386,14 @@ def test_series_construct_w_nested_json_dtype(): ), ) - pd.testing.assert_series_equal(s.to_pandas(), s2.to_pandas()) + bigframes.testing.assert_series_equal(s.to_pandas(), s2.to_pandas()) def test_series_keys(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["int64_col"].keys().to_pandas() pd_result = scalars_pandas_df["int64_col"].keys() - pd.testing.assert_index_equal(bf_result, pd_result) + bigframes.testing.assert_index_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -537,7 +538,7 @@ def test_series___getitem__(scalars_dfs, index_col, key): scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) bf_result = scalars_df[col_name][key] pd_result = scalars_pandas_df[col_name][key] - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -550,6 +551,8 @@ def test_series___getitem__(scalars_dfs, index_col, key): ), ) def test_series___getitem___with_int_key(scalars_dfs, key): + if pd.__version__.startswith("3."): + pytest.skip("pandas 3.0 dropped getitem with int key") col_name = "int64_too" index_col = "string_col" scalars_df, scalars_pandas_df = scalars_dfs @@ -589,7 +592,7 @@ def test_series___setitem__(scalars_dfs, index_col, key, value): bf_series[key] = value pd_series[key] = value - pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + bigframes.testing.assert_series_equal(bf_series.to_pandas(), pd_series) @pytest.mark.parametrize( @@ -614,7 +617,7 @@ def test_series___setitem___with_int_key_numeric(scalars_dfs, key, value): bf_series[key] = value 
pd_series[key] = value - pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + bigframes.testing.assert_series_equal(bf_series.to_pandas(), pd_series) def test_series___setitem___with_default_index(scalars_dfs): @@ -711,7 +714,7 @@ def test_series_replace_scalar_scalar(scalars_dfs): ) pd_result = scalars_pandas_df[col_name].replace("Hello, World!", "Howdy, Planet!") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -727,7 +730,7 @@ def test_series_replace_regex_scalar(scalars_dfs): "^H.l", "Howdy, Planet!", regex=True ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -745,7 +748,7 @@ def test_series_replace_list_scalar(scalars_dfs): ["Hello, World!", "T"], "Howdy, Planet!" ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -757,7 +760,7 @@ def test_series_replace_nans_with_pd_na(scalars_dfs): bf_result = scalars_df[col_name].replace({pd.NA: "UNKNOWN"}).to_pandas() pd_result = scalars_pandas_df[col_name].replace({pd.NA: "UNKNOWN"}) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -782,7 +785,7 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): bf_result = scalars_df[col_name].replace(replacement_dict).to_pandas() pd_result = scalars_pandas_df[col_name].replace(replacement_dict) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -838,7 +841,7 @@ def test_series_dropna(scalars_dfs, ignore_index): col_name = "string_col" bf_result = scalars_df[col_name].dropna(ignore_index=ignore_index).to_pandas() pd_result = scalars_pandas_df[col_name].dropna(ignore_index=ignore_index) - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) @pytest.mark.parametrize( @@ -874,7 +877,7 @@ def 
test_series_agg_multi_string(scalars_dfs): # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) @pytest.mark.parametrize( @@ -991,7 +994,7 @@ def test_mode_stat(scalars_df_index, scalars_pandas_df_index, col_name): ## Mode implicitly resets index, and bigframes default indices use nullable Int64 pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -1159,7 +1162,7 @@ def test_mods(scalars_dfs, col_x, col_y, method): else: bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) - pd.testing.assert_series_equal(pd_result, bf_result) + bigframes.testing.assert_series_equal(pd_result, bf_result) # We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this @@ -1224,16 +1227,16 @@ def test_divmods_series(scalars_dfs, col_x, col_y, method): ) # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
if bf_div_result.dtype == pd.Int64Dtype(): - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + bigframes.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) else: - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_div_result, bf_div_result.astype("Float64").to_pandas() ) if bf_mod_result.dtype == pd.Int64Dtype(): - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + bigframes.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) else: - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_mod_result, bf_mod_result.astype("Float64").to_pandas() ) @@ -1265,16 +1268,16 @@ def test_divmods_scalars(scalars_dfs, col_x, other, method): pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. if bf_div_result.dtype == pd.Int64Dtype(): - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + bigframes.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) else: - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_div_result, bf_div_result.astype("Float64").to_pandas() ) if bf_mod_result.dtype == pd.Int64Dtype(): - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + bigframes.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) else: - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_mod_result, bf_mod_result.astype("Float64").to_pandas() ) @@ -1347,7 +1350,9 @@ def test_series_add_different_table_default_index( + scalars_df_2_default_index["float64_col"].to_pandas() ) # TODO(swast): Can remove sort_index() when there's default ordering. 
- pd.testing.assert_series_equal(bf_result.sort_index(), pd_result.sort_index()) + bigframes.testing.assert_series_equal( + bf_result.sort_index(), pd_result.sort_index() + ) def test_series_add_different_table_with_index( @@ -1358,7 +1363,7 @@ def test_series_add_different_table_with_index( # When index values are unique, we can emulate with values from the same # DataFrame. pd_result = scalars_pandas_df["float64_col"] + scalars_pandas_df["int64_col"] - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): @@ -1377,7 +1382,7 @@ def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): # BigQuery DataFrames default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): @@ -1396,7 +1401,7 @@ def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # reset_index should maintain the original ordering. 
- pd.testing.assert_frame_equal(bf_result, pd_result) + bigframes.testing.assert_frame_equal(bf_result, pd_result) def test_series_reset_index_duplicates_error(scalars_df_index): @@ -1415,7 +1420,7 @@ def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index): # BigQuery DataFrames default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -1442,7 +1447,7 @@ def test_reset_index_no_drop(scalars_df_index, scalars_pandas_df_index, name): # BigQuery DataFrames default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) def test_copy(scalars_df_index, scalars_pandas_df_index): @@ -1459,7 +1464,7 @@ def test_copy(scalars_df_index, scalars_pandas_df_index): pd_series.loc[0] = 3.4 assert bf_copy.to_pandas().loc[0] != bf_series.to_pandas().loc[0] - pd.testing.assert_series_equal(bf_copy.to_pandas(), pd_copy) + bigframes.testing.assert_series_equal(bf_copy.to_pandas(), pd_copy) def test_isin_raise_error(scalars_df_index, scalars_pandas_df_index): @@ -1500,7 +1505,7 @@ def test_isin(scalars_dfs, col_name, test_set): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[col_name].isin(test_set).to_pandas() pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -1540,7 +1545,7 @@ def test_isin_bigframes_values(scalars_dfs, col_name, test_set, session): scalars_df[col_name].isin(series.Series(test_set, session=session)).to_pandas() ) pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") - pd.testing.assert_series_equal( + 
bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -1558,7 +1563,7 @@ def test_isin_bigframes_index(scalars_dfs, session): .isin(pd.Index(["Hello, World!", "Hi", "こんにちは"])) .astype("boolean") ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result, ) @@ -1603,7 +1608,7 @@ def test_isin_bigframes_values_as_predicate( pd_predicate = scalars_pandas_df[col_name].isin(test_set) pd_result = scalars_pandas_df[pd_predicate] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( pd_result.reset_index(), bf_result.reset_index(), ) @@ -1704,10 +1709,10 @@ def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): pd_series.loc[2] = "This value isn't in the test data." bf_result = bf_series.to_pandas() pd_result = pd_series - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) # Per Copy-on-Write semantics, other references to the original DataFrame # should remain unchanged. 
- pd.testing.assert_series_equal(bf_original.to_pandas(), pd_original) + bigframes.testing.assert_series_equal(bf_original.to_pandas(), pd_original) def test_at_setitem_row_label_scalar(scalars_dfs): @@ -1718,7 +1723,7 @@ def test_at_setitem_row_label_scalar(scalars_dfs): pd_series.at[1] = 1000 bf_result = bf_series.to_pandas() pd_result = pd_series.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_ne_obj_series(scalars_dfs): @@ -1890,6 +1895,10 @@ def test_series_binop_w_other_types(scalars_dfs, other): bf_result = (scalars_df["int64_col"].head(3) + other).to_pandas() pd_result = scalars_pandas_df["int64_col"].head(3) + other + if isinstance(other, pd.Series): + # pandas 3.0 preserves series name, bigframe, earlier pandas do not + pd_result.index.name = bf_result.index.name + assert_series_equal( bf_result, pd_result, @@ -1998,7 +2007,7 @@ def test_series_quantile(scalars_dfs): pd_result = pd_series.quantile([0.0, 0.4, 0.6, 1.0]) bf_result = bf_series.quantile([0.0, 0.4, 0.6, 1.0]) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False ) @@ -2047,7 +2056,7 @@ def test_cumprod(scalars_dfs): col_name = "float64_col" bf_result = scalars_df[col_name].cumprod() pd_result = scalars_pandas_df[col_name].cumprod() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_result, bf_result.to_pandas(), ) @@ -2148,7 +2157,7 @@ def test_groupby_level_sum(scalars_dfs): bf_series = scalars_df[col_name].groupby(level=0).sum() pd_series = scalars_pandas_df[col_name].groupby(level=0).sum() # TODO(swast): Update groupby to use index based on group by key(s). 
- pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_series.sort_index(), bf_series.to_pandas().sort_index(), ) @@ -2162,7 +2171,7 @@ def test_groupby_level_list_sum(scalars_dfs): bf_series = scalars_df[col_name].groupby(level=["rowindex"]).sum() pd_series = scalars_pandas_df[col_name].groupby(level=["rowindex"]).sum() # TODO(swast): Update groupby to use index based on group by key(s). - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_series.sort_index(), bf_series.to_pandas().sort_index(), ) @@ -2279,7 +2288,7 @@ def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator) scalars_pandas_df_index[col_name].groupby(scalars_pandas_df_index[group_key]) ).astype(bf_series.dtype) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_series, bf_series, ) @@ -2295,7 +2304,7 @@ def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator) def test_drop_label(scalars_df_index, scalars_pandas_df_index, label, col_name): bf_series = scalars_df_index[col_name].drop(label).to_pandas() pd_series = scalars_pandas_df_index[col_name].drop(label) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_series, bf_series, ) @@ -2305,7 +2314,7 @@ def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" bf_series = scalars_df_index[col_name].drop([1, 3]).to_pandas() pd_series = scalars_pandas_df_index[col_name].drop([1, 3]) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( pd_series, bf_series, ) @@ -2329,7 +2338,7 @@ def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_name): bf_series = scalars_df_index[col_name].drop_duplicates(keep=keep).to_pandas() pd_series = scalars_pandas_df_index[col_name].drop_duplicates(keep=keep) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( 
pd_series, bf_series, ) @@ -2366,7 +2375,7 @@ def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): def test_duplicated(scalars_df_index, scalars_pandas_df_index, keep, col_name): bf_series = scalars_df_index[col_name].duplicated(keep=keep).to_pandas() pd_series = scalars_pandas_df_index[col_name].duplicated(keep=keep) - pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) + bigframes.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) def test_shape(scalars_dfs): @@ -2500,7 +2509,7 @@ def test_head_then_scalar_operation(scalars_dfs): bf_result = (scalars_df["float64_col"].head(1) + 4).to_pandas() pd_result = scalars_pandas_df["float64_col"].head(1) + 4 - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2516,7 +2525,7 @@ def test_head_then_series_operation(scalars_dfs): "float64_col" ].head(2) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2527,7 +2536,7 @@ def test_series_peek(scalars_dfs): peek_result = scalars_df["float64_col"].peek(n=3, force=False) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( peek_result, scalars_pandas_df["float64_col"].reindex_like(peek_result), ) @@ -2546,7 +2555,7 @@ def test_series_peek_with_large_results_not_allowed(scalars_dfs): # The metrics won't be fully updated when we call query_and_wait. 
print(session.slot_millis_sum - slot_millis_sum) assert session.slot_millis_sum - slot_millis_sum < 500 - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( peek_result, scalars_pandas_df["float64_col"].reindex_like(peek_result), ) @@ -2560,7 +2569,7 @@ def test_series_peek_multi_index(scalars_dfs): pd_series = scalars_pandas_df.set_index(["string_col", "bool_col"])["float64_col"] pd_series.name = ("2-part", "name") peek_result = bf_series.peek(n=3, force=False) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( peek_result, pd_series.reindex_like(peek_result), ) @@ -2572,7 +2581,7 @@ def test_series_peek_filtered(scalars_dfs): n=3, force=False ) pd_result = scalars_pandas_df[scalars_pandas_df.int64_col > 0]["float64_col"] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( peek_result, pd_result.reindex_like(peek_result), ) @@ -2588,7 +2597,7 @@ def test_series_peek_force(scalars_dfs): peek_result = df_filtered.peek(n=3, force=True) pd_cumsum_df = scalars_pandas_df[["int64_col", "int64_too"]].cumsum() pd_result = pd_cumsum_df[pd_cumsum_df.int64_col > 0]["int64_too"] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( peek_result, pd_result.reindex_like(peek_result), ) @@ -2604,7 +2613,7 @@ def test_series_peek_force_float(scalars_dfs): peek_result = df_filtered.peek(n=3, force=True) pd_cumsum_df = scalars_pandas_df[["int64_col", "float64_col"]].cumsum() pd_result = pd_cumsum_df[pd_cumsum_df.float64_col > 0]["float64_col"] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( peek_result, pd_result.reindex_like(peek_result), ) @@ -2616,7 +2625,7 @@ def test_shift(scalars_df_index, scalars_pandas_df_index): # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA pd_result = scalars_pandas_df_index[col_name].shift().astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( 
bf_result, pd_result, ) @@ -2627,7 +2636,7 @@ def test_series_ffill(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index[col_name].ffill(limit=1).to_pandas() pd_result = scalars_pandas_df_index[col_name].ffill(limit=1) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2638,7 +2647,7 @@ def test_series_bfill(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index[col_name].bfill(limit=2).to_pandas() pd_result = scalars_pandas_df_index[col_name].bfill(limit=2) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2653,7 +2662,7 @@ def test_cumsum_int(scalars_df_index, scalars_pandas_df_index): # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2674,7 +2683,7 @@ def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): .astype(pd.Int64Dtype()) ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2693,7 +2702,7 @@ def test_series_nlargest(scalars_df_index, scalars_pandas_df_index, keep): bf_result = scalars_df_index[col_name].nlargest(4, keep=keep).to_pandas() pd_result = scalars_pandas_df_index[col_name].nlargest(4, keep=keep) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2716,7 +2725,7 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods): .astype(pd.Int64Dtype()) ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2735,7 +2744,7 @@ def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA 
pd_result = scalars_pandas_df_index["int64_col"].ffill().pct_change(periods=periods) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2754,7 +2763,7 @@ def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): bf_result = scalars_df_index[col_name].nsmallest(2, keep=keep).to_pandas() pd_result = scalars_pandas_df_index[col_name].nsmallest(2, keep=keep) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2803,7 +2812,7 @@ def test_series_rank( .astype(pd.Float64Dtype()) ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2815,7 +2824,7 @@ def test_cast_float_to_int(scalars_df_index, scalars_pandas_df_index): # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA pd_result = scalars_pandas_df_index[col_name].astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2827,7 +2836,7 @@ def test_cast_float_to_bool(scalars_df_index, scalars_pandas_df_index): # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA pd_result = scalars_pandas_df_index[col_name].astype(pd.BooleanDtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2845,7 +2854,7 @@ def test_cumsum_nested(scalars_df_index, scalars_pandas_df_index): .astype(pd.Float64Dtype()) ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2874,7 +2883,7 @@ def test_nested_analytic_ops_align(scalars_df_index, scalars_pandas_df_index): + pd_series.expanding().max() ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2890,7 +2899,7 @@ def test_cumsum_int_filtered(scalars_df_index, scalars_pandas_df_index): # cumsum does not behave 
well on nullable ints in pandas, produces object type and never ignores NA pd_result = pd_col[pd_col > -2].cumsum().astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2902,7 +2911,7 @@ def test_cumsum_float(scalars_df_index, scalars_pandas_df_index): # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Float64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2913,7 +2922,7 @@ def test_cummin_int(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index[col_name].cummin().to_pandas() pd_result = scalars_pandas_df_index[col_name].cummin() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2924,7 +2933,7 @@ def test_cummax_int(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index[col_name].cummax().to_pandas() pd_result = scalars_pandas_df_index[col_name].cummax() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2957,7 +2966,7 @@ def test_value_counts(scalars_dfs, kwargs): bf_result = s.value_counts(**kwargs).to_pandas() pd_result = pd_s.value_counts(**kwargs) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -2995,7 +3004,7 @@ def test_value_counts_w_cut(scalars_dfs): pd_result = pd_cut.value_counts() pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result.astype(pd.Int64Dtype()), ) @@ -3006,7 +3015,7 @@ def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index["string_col"].iloc[1:].iloc[1:].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[1:].iloc[1:] - 
pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3035,7 +3044,7 @@ def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3071,7 +3080,7 @@ def test_series_add_prefix(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index["int64_too"].add_prefix("prefix_") # Index will be object type in pandas, string type in bigframes, but same values - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, @@ -3084,7 +3093,7 @@ def test_series_add_suffix(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index["int64_too"].add_suffix("_suffix") # Index will be object type in pandas, string type in bigframes, but same values - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, @@ -3112,7 +3121,7 @@ def test_series_filter_like(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index["float64_col"].filter(like="ello") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3126,7 +3135,7 @@ def test_series_filter_regex(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index["float64_col"].filter(regex="^[GH].*") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3141,7 +3150,7 @@ def test_series_reindex(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. 
pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3168,7 +3177,7 @@ def test_series_reindex_like(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3184,7 +3193,7 @@ def test_where_with_series(scalars_df_index, scalars_pandas_df_index): scalars_pandas_df_index["bool_col"], scalars_pandas_df_index["int64_too"] ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3209,7 +3218,7 @@ def test_where_with_different_indices(scalars_df_index, scalars_pandas_df_index) ) ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3223,7 +3232,7 @@ def test_where_with_default(scalars_df_index, scalars_pandas_df_index): scalars_pandas_df_index["bool_col"] ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3243,7 +3252,7 @@ def _is_positive(x): cond=_is_positive, other=lambda x: x * 10 ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3292,7 +3301,7 @@ def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): upper_pd = scalars_pandas_df_index["int64_too"].iloc[:5] + 1 pd_result = col_pd.clip(lower_pd, upper_pd) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3307,7 +3316,7 @@ def test_clip_filtered_one_sided(scalars_df_index, scalars_pandas_df_index): lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 pd_result = col_pd.clip(lower_pd, None) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3337,7 +3346,7 @@ def 
test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusi ) pd_result = scalars_pandas_df_index["int64_col"].between(left, right, inclusive) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result.astype(pd.BooleanDtype()), ) @@ -3375,7 +3384,7 @@ def test_series_case_when(scalars_dfs_maybe_ordered): bf_result = bf_series.case_when(bf_conditions).to_pandas() pd_result = pd_series.case_when(pd_conditions) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result.astype(pd.Int64Dtype()), ) @@ -3411,7 +3420,7 @@ def test_series_case_when_change_type(scalars_dfs_maybe_ordered): bf_result = bf_series.case_when(bf_conditions).to_pandas() pd_result = pd_series.case_when(pd_conditions) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result.astype("string[pyarrow]"), ) @@ -3440,7 +3449,7 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): scalars_df_index["int64_col"].to_json(path, lines=True, orient="records") gcs_df = pd.read_json(get_first_file_from_wildcard(path), lines=True) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( gcs_df["int64_col"].astype(pd.Int64Dtype()), scalars_pandas_df_index["int64_col"], check_dtype=False, @@ -3453,7 +3462,7 @@ def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index): scalars_df_index["int64_col"].to_csv(path) gcs_df = pd.read_csv(get_first_file_from_wildcard(path)) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( gcs_df["int64_col"].astype(pd.Int64Dtype()), scalars_pandas_df_index["int64_col"], check_dtype=False, @@ -3582,7 +3591,7 @@ def test_series_values(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index["int64_too"].values # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( 
pd.Series(bf_result), pd.Series(pd_result), check_dtype=False ) @@ -3615,7 +3624,7 @@ def test_sort_values(scalars_df_index, scalars_pandas_df_index, ascending, na_po ascending=ascending, na_position=na_position ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3628,7 +3637,7 @@ def test_series_sort_values_inplace(scalars_df_index, scalars_pandas_df_index): bf_result = bf_series.to_pandas() pd_result = scalars_pandas_df_index["int64_col"].sort_values(ascending=False) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3647,7 +3656,7 @@ def test_sort_index(scalars_df_index, scalars_pandas_df_index, ascending): ) pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=ascending) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3659,7 +3668,7 @@ def test_series_sort_index_inplace(scalars_df_index, scalars_pandas_df_index): bf_result = bf_series.to_pandas() pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=False) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3711,7 +3720,7 @@ def _ten_times(x): cond=lambda x: x > 0, other=_ten_times ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -3820,7 +3829,7 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, erro pytest.importorskip("pandas", minversion="2.0.0") bf_result = scalars_df_index[column].astype(to_type, errors=errors).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_astype_python(session): @@ -3831,7 +3840,7 @@ def test_series_astype_python(session): index=pd.Index([0, 1, 2, 3], dtype="Int64"), ) result = 
session.read_pandas(input).astype(float, errors="null").to_pandas() - pd.testing.assert_series_equal(result, exepcted) + bigframes.testing.assert_series_equal(result, exepcted) def test_astype_safe(session): @@ -3842,7 +3851,7 @@ def test_astype_safe(session): index=pd.Index([0, 1, 2, 3], dtype="Int64"), ) result = session.read_pandas(input).astype("Float64", errors="null").to_pandas() - pd.testing.assert_series_equal(result, exepcted) + bigframes.testing.assert_series_equal(result, exepcted) def test_series_astype_w_invalid_error(session): @@ -3863,7 +3872,7 @@ def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index): .apply(lambda x: None if pd.isna(x) else math.trunc(x)) .astype(to_type) ) - pd.testing.assert_series_equal(bf_result, pd_result) + bigframes.testing.assert_series_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -3881,7 +3890,7 @@ def test_date_time_astype_int( pytest.importorskip("pandas", minversion="2.0.0") bf_result = scalars_df_index[column].astype(to_type).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) - pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) assert bf_result.dtype == "Int64" @@ -3892,7 +3901,7 @@ def test_string_astype_int(session): pd_result = pd_series.astype("Int64") bf_result = bf_series.astype("Int64").to_pandas() - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) def test_string_astype_float(session): @@ -3905,7 +3914,7 @@ def test_string_astype_float(session): pd_result = pd_series.astype("Float64") bf_result = bf_series.astype("Float64").to_pandas() - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) def test_string_astype_date(session): @@ -3925,7 
+3934,7 @@ def test_string_astype_date(session): pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas() - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) def test_string_astype_datetime(session): @@ -3938,7 +3947,7 @@ def test_string_astype_datetime(session): pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us"))) bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas() - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) def test_string_astype_timestamp(session): @@ -3957,7 +3966,7 @@ def test_string_astype_timestamp(session): pd.ArrowDtype(pa.timestamp("us", tz="UTC")) ).to_pandas() - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + bigframes.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) def test_timestamp_astype_string(session): @@ -3979,7 +3988,7 @@ def test_timestamp_astype_string(session): ) bf_result = bf_series.astype(pa.string()).to_pandas() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, expected_result, check_index_type=False, check_dtype=False ) assert bf_result.dtype == "string[pyarrow]" @@ -3995,7 +4004,7 @@ def test_float_astype_json(errors, session): expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) expected_result.index = expected_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), expected_result) def test_float_astype_json_str(session): @@ -4007,7 +4016,7 @@ def test_float_astype_json_str(session): expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) expected_result.index = 
expected_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), expected_result) @pytest.mark.parametrize("errors", ["raise", "null"]) @@ -4024,7 +4033,7 @@ def test_string_astype_json(errors, session): assert bf_result.dtype == dtypes.JSON_DTYPE pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE) - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_string_astype_json_in_safe_mode(session): @@ -4035,7 +4044,7 @@ def test_string_astype_json_in_safe_mode(session): expected = pd.Series([None], dtype=dtypes.JSON_DTYPE) expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), expected) def test_string_astype_json_raise_error(session): @@ -4073,7 +4082,7 @@ def test_json_astype_others(data, to_type, errors, session): load_data = [json.loads(item) if item is not None else None for item in data] expected = pd.Series(load_data, dtype=to_type) expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), expected) @pytest.mark.parametrize( @@ -4107,7 +4116,7 @@ def test_json_astype_others_in_safe_mode(data, to_type, session): expected = pd.Series([None, None], dtype=to_type) expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + bigframes.testing.assert_series_equal(bf_result.to_pandas(), expected) @pytest.mark.parametrize( @@ -4130,7 +4139,7 @@ def test_loc_bool_series_explicit_index(scalars_df_index, scalars_pandas_df_inde bf_result = scalars_df_index.string_col.loc[scalars_df_index.bool_col].to_pandas() pd_result = 
scalars_pandas_df_index.string_col.loc[scalars_pandas_df_index.bool_col] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, ) @@ -4191,7 +4200,7 @@ def test_rename(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.string_col.rename("newname") pd_result = scalars_pandas_df_index.string_col.rename("newname") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4201,7 +4210,7 @@ def test_rename_nonstring(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.string_col.rename((4, 2)) pd_result = scalars_pandas_df_index.string_col.rename((4, 2)) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4213,7 +4222,7 @@ def test_rename_dict_same_type(scalars_df_index, scalars_pandas_df_index): pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4223,7 +4232,7 @@ def test_rename_axis(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.string_col.rename_axis("newindexname") pd_result = scalars_pandas_df_index.string_col.rename_axis("newindexname") - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4240,7 +4249,7 @@ def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.string_col.loc[index_list] pd_result = scalars_pandas_df_index.string_col.loc[index_list] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4252,7 +4261,7 @@ def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.bool_col.loc[index_list] pd_result = scalars_pandas_df_index.bool_col.loc[index_list] - pd.testing.assert_series_equal( + 
bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4268,7 +4277,7 @@ def test_loc_list_multiindex(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_multiindex.int64_too.loc[index_list] pd_result = scalars_pandas_df_multiindex.int64_too.loc[index_list] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4280,7 +4289,7 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.string_col.iloc[index_list] pd_result = scalars_pandas_df_index.string_col.iloc[index_list] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4294,7 +4303,7 @@ def test_iloc_list_nameless(scalars_df_index, scalars_pandas_df_index): pd_series = scalars_pandas_df_index.string_col.rename(None) pd_result = pd_series.iloc[index_list] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4309,7 +4318,7 @@ def test_loc_list_nameless(scalars_df_index, scalars_pandas_df_index): pd_series = scalars_pandas_df_index.string_col.rename(None) pd_result = pd_series.loc[index_list] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4325,7 +4334,7 @@ def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.date_col.loc[bf_string_series] pd_result = scalars_pandas_df_index.date_col.loc[pd_string_series] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4343,7 +4352,7 @@ def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_multiindex.int64_too.loc[bf_string_series] pd_result = scalars_pandas_df_multiindex.int64_too.loc[pd_string_series] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( 
bf_result.to_pandas(), pd_result, ) @@ -4356,7 +4365,7 @@ def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.date_col.loc[bf_index] pd_result = scalars_pandas_df_index.date_col.loc[pd_index] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4370,7 +4379,7 @@ def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_ind index = "Hello, World!" bf_result = scalars_df_index.date_col.loc[index] pd_result = scalars_pandas_df_index.date_col.loc[index] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4455,7 +4464,7 @@ def test_map_dict_input(scalars_dfs): pd_result = pd_result.astype("Int64") # pandas type differences bf_result = scalars_df.string_col.map(local_map) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4474,7 +4483,7 @@ def test_map_series_input(scalars_dfs): pd_result = scalars_pandas_df.int64_too.map(pd_map_series) bf_result = scalars_df.int64_too.map(bf_map_series) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result.to_pandas(), pd_result, ) @@ -4561,7 +4570,7 @@ def test_apply_lambda(scalars_dfs, col, lambda_): bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() pd_col = scalars_pandas_df[col] - if pd.__version__[:3] in ("2.2", "2.3"): + if pd.__version__[:3] in ("2.2", "2.3", "3.0"): pd_result = pd_col.apply(lambda_, by_row=False) else: pd_result = pd_col.apply(lambda_) @@ -4654,7 +4663,7 @@ def foo(x): pd_col = scalars_pandas_df["int64_col"] - if pd.__version__[:3] in ("2.2", "2.3"): + if pd.__version__[:3] in ("2.2", "2.3", "3.0"): pd_result = pd_col.apply(foo, by_row=False) else: pd_result = pd_col.apply(foo) @@ -4735,7 +4744,7 @@ def foo(x: int, y: int, df): def test_series_explode(data): s = bigframes.pandas.Series(data) pd_s = s.to_pandas() 
- pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( s.explode().to_pandas(), pd_s.explode(), check_index_type=False, @@ -4781,7 +4790,7 @@ def test_series_explode_w_index(index, ignore_index): s = bigframes.pandas.Series(data, index=index) pd_s = pd.Series(data, index=index) # TODO(b/340885567): fix type error - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( s.explode(ignore_index=ignore_index).to_pandas(), # type: ignore pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), # type: ignore check_index_type=False, @@ -4806,7 +4815,7 @@ def test_series_explode_reserve_order(ignore_index, ordered): # TODO(b/340885567): fix type error pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) # type: ignore pd_res.index = pd_res.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( res if ordered else res.sort_index(), pd_res, ) @@ -4828,7 +4837,7 @@ def test_series_construct_empty_array(): dtype=pd.ArrowDtype(pa.list_(pa.float64())), index=pd.Index([0], dtype=pd.Int64Dtype()), ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( expected, s.to_pandas(), ) @@ -4845,7 +4854,7 @@ def test_series_construct_empty_array(): ) def test_series_explode_null(data): s = bigframes.pandas.Series(data) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( s.explode().to_pandas(), s.to_pandas().explode(), check_dtype=False, @@ -4870,7 +4879,9 @@ def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, ] bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() - pd.testing.assert_series_equal(bf_result, pd_result) + # TODO: (b/484364312) + pd_result.index.names = bf_result.index.names + bigframes.testing.assert_series_equal(bf_result, pd_result) def test_series_struct_get_field_by_attribute( @@ 
-4882,13 +4893,13 @@ def test_series_struct_get_field_by_attribute( bf_series = nested_structs_df["person"] df_series = nested_structs_pandas_df["person"] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_series.address.city.to_pandas(), df_series.struct.field("address").struct.field("city"), check_dtype=False, check_index=False, ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_series.address.country.to_pandas(), df_series.struct.field("address").struct.field("country"), check_dtype=False, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 0501df3f8c9..9d37f23f187 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -37,6 +37,7 @@ import bigframes.dtypes import bigframes.ml.linear_model import bigframes.session.execution_spec +import bigframes.testing from bigframes.testing import utils all_write_engines = pytest.mark.parametrize( @@ -326,7 +327,7 @@ def test_read_gbq_w_anonymous_query_results_table(session: bigframes.Session): df = session.read_gbq(destination, index_col="name") result = df.to_pandas() expected.index = expected.index.astype(result.index.dtype) - pd.testing.assert_frame_equal(result, expected, check_dtype=False) + bigframes.testing.assert_frame_equal(result, expected, check_dtype=False) def test_read_gbq_w_primary_keys_table( @@ -349,7 +350,7 @@ def test_read_gbq_w_primary_keys_table( # Verify that the DataFrame is already sorted by primary keys. sorted_result = result.sort_values(primary_keys) - pd.testing.assert_frame_equal(result, sorted_result) + bigframes.testing.assert_frame_equal(result, sorted_result) # Verify that we're working from a snapshot rather than a copy of the table. assert "FOR SYSTEM_TIME AS OF" in df.sql @@ -388,7 +389,7 @@ def test_read_gbq_w_primary_keys_table_and_filters( # Verify that the DataFrame is already sorted by primary keys. 
sorted_result = result.sort_values(primary_keys) - pd.testing.assert_frame_equal(result, sorted_result) + bigframes.testing.assert_frame_equal(result, sorted_result) @pytest.mark.parametrize( @@ -533,7 +534,9 @@ def test_read_gbq_w_ambigous_name( .to_pandas() ) pd_df = pd.DataFrame({"x": [2, 1], "ambiguous_name": [20, 10]}) - pd.testing.assert_frame_equal(df, pd_df, check_dtype=False, check_index_type=False) + bigframes.testing.assert_frame_equal( + df, pd_df, check_dtype=False, check_index_type=False + ) def test_read_gbq_table_clustered_with_filter(session: bigframes.Session): @@ -768,8 +771,8 @@ def test_read_gbq_w_json_and_compare_w_pandas_json(session): dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), ) pd_df.index = pd_df.index.astype("Int64") - pd.testing.assert_series_equal(df.dtypes, pd_df.dtypes) - pd.testing.assert_series_equal(df["json_col"].to_pandas(), pd_df["json_col"]) + bigframes.testing.assert_series_equal(df.dtypes, pd_df.dtypes) + bigframes.testing.assert_series_equal(df["json_col"].to_pandas(), pd_df["json_col"]) def test_read_gbq_w_json_in_struct(session): @@ -867,7 +870,7 @@ def test_read_pandas(session, scalars_dfs): result = df.to_pandas() expected = scalars_pandas_df - pd.testing.assert_frame_equal(result, expected) + bigframes.testing.assert_frame_equal(result, expected) def test_read_pandas_series(session): @@ -876,7 +879,7 @@ def test_read_pandas_series(session): pd_series = pd.Series([3, 1, 4, 1, 5], dtype=pd.Int64Dtype(), index=idx) bf_series = session.read_pandas(pd_series) - pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + bigframes.testing.assert_series_equal(bf_series.to_pandas(), pd_series) def test_read_pandas_index(session): @@ -884,7 +887,7 @@ def test_read_pandas_index(session): pd_idx: pd.Index = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) bf_idx = session.read_pandas(pd_idx) - pd.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + bigframes.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) 
def test_read_pandas_w_unsupported_mixed_dtype(session): @@ -914,7 +917,7 @@ def test_read_pandas_col_label_w_space(session: bigframes.Session): ) result = session.read_pandas(expected).to_pandas() - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( result, expected, check_index_type=False, check_dtype=False ) @@ -922,7 +925,7 @@ def test_read_pandas_col_label_w_space(session: bigframes.Session): def test_read_pandas_multi_index(session, scalars_pandas_df_multi_index): df = session.read_pandas(scalars_pandas_df_multi_index) result = df.to_pandas() - pd.testing.assert_frame_equal(result, scalars_pandas_df_multi_index) + bigframes.testing.assert_frame_equal(result, scalars_pandas_df_multi_index) def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default_index): @@ -930,7 +933,7 @@ def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default pandas_df["rowid"] = np.arange(pandas_df.shape[0]) df_roundtrip = session.read_pandas(pandas_df).to_pandas() - pd.testing.assert_frame_equal(df_roundtrip, pandas_df, check_dtype=False) + bigframes.testing.assert_frame_equal(df_roundtrip, pandas_df, check_dtype=False) def test_read_pandas_tokyo( @@ -969,12 +972,14 @@ def test_read_pandas_timedelta_dataframes(session, write_engine): expected_result = pandas_df.astype(bigframes.dtypes.TIMEDELTA_DTYPE) expected_result.index = expected_result.index.astype(bigframes.dtypes.INT_DTYPE) - pd.testing.assert_frame_equal(actual_result, expected_result) + bigframes.testing.assert_frame_equal(actual_result, expected_result) @all_write_engines def test_read_pandas_timedelta_series(session, write_engine): - expected_series = pd.Series(pd.to_timedelta([1, 2, 3], unit="d")) + expected_series = pd.Series(pd.to_timedelta([1, 2, 3], unit="d")).astype( + "timedelta64[ns]" + ) actual_result = ( session.read_pandas(expected_series, write_engine=write_engine) @@ -982,15 +987,15 @@ def test_read_pandas_timedelta_series(session, write_engine): 
.astype("timedelta64[ns]") ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_series, check_index_type=False ) @all_write_engines def test_read_pandas_timedelta_index(session, write_engine): - expected_index = pd.to_timedelta( - [1, 2, 3], unit="d" + expected_index = pd.to_timedelta([1, 2, 3], unit="d").astype( + "timedelta64[ns]" ) # to_timedelta returns an index actual_result = ( @@ -999,7 +1004,7 @@ def test_read_pandas_timedelta_index(session, write_engine): .astype("timedelta64[ns]") ) - pd.testing.assert_index_equal(actual_result, expected_index) + bigframes.testing.assert_index_equal(actual_result, expected_index) @all_write_engines @@ -1018,7 +1023,9 @@ def test_read_pandas_json_dataframes(session, write_engine): expected_df, write_engine=write_engine ).to_pandas() - pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False) + bigframes.testing.assert_frame_equal( + actual_result, expected_df, check_index_type=False + ) @all_write_engines @@ -1034,7 +1041,7 @@ def test_read_pandas_json_series(session, write_engine): actual_result = session.read_pandas( expected_series, write_engine=write_engine ).to_pandas() - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_series, check_index_type=False ) @@ -1062,7 +1069,7 @@ def test_read_pandas_json_index(session, write_engine): actual_result = session.read_pandas( expected_index, write_engine=write_engine ).to_pandas() - pd.testing.assert_index_equal(actual_result, expected_index) + bigframes.testing.assert_index_equal(actual_result, expected_index) @pytest.mark.parametrize( @@ -1121,7 +1128,7 @@ def test_read_pandas_w_nested_json(session, write_engine): .to_pandas() .reset_index(drop=True) ) - pd.testing.assert_series_equal(bq_s, pd_s) + bigframes.testing.assert_series_equal(bq_s, pd_s) @pytest.mark.parametrize( @@ -1203,7 +1210,7 @@ def test_read_pandas_w_nested_json_index(session, write_engine): 
), ) bq_idx = session.read_pandas(pd_idx, write_engine=write_engine).to_pandas() - pd.testing.assert_index_equal(bq_idx, pd_idx) + bigframes.testing.assert_index_equal(bq_idx, pd_idx) @all_write_engines @@ -1217,13 +1224,13 @@ def test_read_csv_for_gcs_file_w_write_engine(session, df_and_gcs_csv, write_eng write_engine=write_engine, dtype=scalars_df.dtypes.to_dict(), ) - pd.testing.assert_frame_equal(pd_df.to_pandas(), scalars_df.to_pandas()) + bigframes.testing.assert_frame_equal(pd_df.to_pandas(), scalars_df.to_pandas()) if write_engine in ("default", "bigquery_load"): bf_df = session.read_csv( path, engine="bigquery", index_col="rowindex", write_engine=write_engine ) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) @pytest.mark.parametrize( @@ -1251,8 +1258,8 @@ def test_read_csv_for_local_file_w_sep(session, df_and_local_csv, sep): pd_df = session.read_csv( buffer, index_col="rowindex", sep=sep, dtype=scalars_df.dtypes.to_dict() ) - pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) @pytest.mark.parametrize( @@ -1284,7 +1291,7 @@ def test_read_csv_for_index_col_w_false(session, df_and_local_csv, index_col): # (b/280889935) or guarantee row ordering. 
bf_df = bf_df.set_index("rowindex").sort_index() pd_df = pd_df.set_index("rowindex") - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) @pytest.mark.parametrize( @@ -1307,7 +1314,7 @@ def test_read_csv_for_index_col(session, df_and_gcs_csv, index_col): ) assert bf_df.shape == pd_df.shape - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) @pytest.mark.parametrize( @@ -1360,7 +1367,7 @@ def test_read_csv_for_gcs_wildcard_path(session, df_and_gcs_csv): assert bf_df.shape == pd_df.shape assert bf_df.columns.tolist() == pd_df.columns.tolist() - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_for_names(session, df_and_gcs_csv_for_two_columns): @@ -1379,7 +1386,7 @@ def test_read_csv_for_names(session, df_and_gcs_csv_for_two_columns): # (b/280889935) or guarantee row ordering. bf_df = bf_df.set_index(names[0]).sort_index() pd_df = pd_df.set_index(names[0]) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_for_names_more_than_columns_can_raise_error( @@ -1408,7 +1415,7 @@ def test_read_csv_for_names_less_than_columns(session, df_and_gcs_csv_for_two_co # Pandas's index name is None, while BigFrames's index name is "rowindex". 
pd_df.index.name = "rowindex" - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_for_names_less_than_columns_raise_error_when_index_col_set( @@ -1446,7 +1453,7 @@ def test_read_csv_for_names_and_index_col( assert bf_df.shape == pd_df.shape assert bf_df.columns.tolist() == pd_df.columns.tolist() - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_df.to_pandas(), pd_df.to_pandas(), check_index_type=False ) @@ -1478,7 +1485,7 @@ def test_read_csv_for_names_and_usecols( # (b/280889935) or guarantee row ordering. bf_df = bf_df.set_index(names[0]).sort_index() pd_df = pd_df.set_index(names[0]) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_for_names_and_invalid_usecols( @@ -1525,7 +1532,7 @@ def test_read_csv_for_names_and_usecols_and_indexcol( assert bf_df.shape == pd_df.shape assert bf_df.columns.tolist() == pd_df.columns.tolist() - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_for_names_less_than_columns_and_same_usecols( @@ -1548,7 +1555,7 @@ def test_read_csv_for_names_less_than_columns_and_same_usecols( # (b/280889935) or guarantee row ordering. bf_df = bf_df.set_index(names[0]).sort_index() pd_df = pd_df.set_index(names[0]) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_for_names_less_than_columns_and_mismatched_usecols( @@ -1593,7 +1600,7 @@ def test_read_csv_for_dtype(session, df_and_gcs_csv_for_two_columns): # (b/280889935) or guarantee row ordering. 
bf_df = bf_df.set_index("rowindex").sort_index() pd_df = pd_df.set_index("rowindex") - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_for_dtype_w_names(session, df_and_gcs_csv_for_two_columns): @@ -1613,7 +1620,7 @@ def test_read_csv_for_dtype_w_names(session, df_and_gcs_csv_for_two_columns): # (b/280889935) or guarantee row ordering. bf_df = bf_df.set_index("a").sort_index() pd_df = pd_df.set_index("a") - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) @pytest.mark.parametrize( @@ -1680,8 +1687,8 @@ def test_read_csv_for_gcs_file_w_header(session, df_and_gcs_csv, header): # (b/280889935) or guarantee row ordering. bf_df = bf_df.set_index("rowindex").sort_index() pd_df = pd_df.set_index("rowindex") - pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_w_usecols(session, df_and_local_csv): @@ -1709,7 +1716,7 @@ def test_read_csv_w_usecols(session, df_and_local_csv): # (b/280889935) or guarantee row ordering. 
bf_df = bf_df.set_index("rowindex").sort_index() pd_df = pd_df.set_index("rowindex") - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_w_usecols_and_indexcol(session, df_and_local_csv): @@ -1735,7 +1742,7 @@ def test_read_csv_w_usecols_and_indexcol(session, df_and_local_csv): assert bf_df.shape == pd_df.shape assert bf_df.columns.tolist() == pd_df.columns.tolist() - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_w_indexcol_not_in_usecols(session, df_and_local_csv): @@ -1790,10 +1797,10 @@ def test_read_csv_local_w_encoding(session, penguins_pandas_df_default_index): bf_df = session.read_csv( path, engine="bigquery", index_col="rowindex", encoding="ISO-8859-1" ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_df.to_pandas(), penguins_pandas_df_default_index ) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + bigframes.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_pickle_local(session, penguins_pandas_df_default_index, tmp_path): @@ -1802,7 +1809,9 @@ def test_read_pickle_local(session, penguins_pandas_df_default_index, tmp_path): penguins_pandas_df_default_index.to_pickle(path) df = session.read_pickle(path) - pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas()) + bigframes.testing.assert_frame_equal( + penguins_pandas_df_default_index, df.to_pandas() + ) def test_read_pickle_buffer(session, penguins_pandas_df_default_index): @@ -1811,7 +1820,9 @@ def test_read_pickle_buffer(session, penguins_pandas_df_default_index): buffer.seek(0) df = session.read_pickle(buffer) - pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas()) + bigframes.testing.assert_frame_equal( + penguins_pandas_df_default_index, 
df.to_pandas() + ) def test_read_pickle_series_buffer(session): @@ -1830,7 +1841,9 @@ def test_read_pickle_gcs(session, penguins_pandas_df_default_index, gcs_folder): penguins_pandas_df_default_index.to_pickle(path) df = session.read_pickle(path) - pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas()) + bigframes.testing.assert_frame_equal( + penguins_pandas_df_default_index, df.to_pandas() + ) @pytest.mark.parametrize( @@ -1903,7 +1916,7 @@ def test_read_parquet_gcs( assert df_out.size != 0 pd_df_in = df_in.to_pandas() pd_df_out = df_out.to_pandas() - pd.testing.assert_frame_equal(pd_df_in, pd_df_out) + bigframes.testing.assert_frame_equal(pd_df_in, pd_df_out) @pytest.mark.parametrize( @@ -1953,7 +1966,7 @@ def test_read_parquet_gcs_compressed( assert df_out.size != 0 pd_df_in = df_in.to_pandas() pd_df_out = df_out.to_pandas() - pd.testing.assert_frame_equal(pd_df_in, pd_df_out) + bigframes.testing.assert_frame_equal(pd_df_in, pd_df_out) @pytest.mark.parametrize( @@ -1998,7 +2011,7 @@ def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder): df = session.read_json(read_path, lines=True, orient="records", engine="bigquery") # The auto detects of BigQuery load job does not preserve any ordering of columns for json. 
- pd.testing.assert_index_equal( + bigframes.testing.assert_index_equal( df.columns.sort_values(), scalars_df.columns.sort_values() ) @@ -2023,7 +2036,7 @@ def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder): ] ) assert df.shape[0] == scalars_df.shape[0] - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( df.dtypes.sort_index(), scalars_df.dtypes.sort_index() ) @@ -2049,7 +2062,7 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): orient="records", ) - pd.testing.assert_index_equal(df.columns, scalars_df.columns) + bigframes.testing.assert_index_equal(df.columns, scalars_df.columns) # The auto detects of BigQuery load job have restrictions to detect the bytes, # numeric and geometry types, so they're skipped here. @@ -2063,7 +2076,7 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): scalars_df = scalars_df.drop(columns=["date_col", "datetime_col", "time_col"]) assert df.shape[0] == scalars_df.shape[0] - pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) + bigframes.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) @pytest.mark.parametrize( @@ -2211,7 +2224,7 @@ def _assert_query_dry_run_stats_are_valid(result: pd.Series): ] ) - pd.testing.assert_index_equal(result.index, expected_index) + bigframes.testing.assert_index_equal(result.index, expected_index) assert result["columnCount"] + result["indexLevel"] > 0 @@ -2231,5 +2244,5 @@ def _assert_table_dry_run_stats_are_valid(result: pd.Series): ] ) - pd.testing.assert_index_equal(result.index, expected_index) + bigframes.testing.assert_index_equal(result.index, expected_index) assert result["columnCount"] == len(result["columnDtypes"]) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index c7ff0ca1dd4..c8db041fec2 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -261,7 +261,9 @@ def test_resample_with_index(unordered_session, 
rule, origin, data): assert isinstance(bf_result.index, bpd.DatetimeIndex) assert isinstance(pd_result.index, pd.DatetimeIndex) - pd.testing.assert_frame_equal( + # TODO: (b/484364312) + pd_result.index.name = bf_result.index.name + assert_frame_equal( bf_result.to_pandas(), pd_result, check_index_type=False, @@ -290,4 +292,4 @@ def test_unordered_df_pivot( # Pandas produces NaN, where bq dataframes produces pd.NA bf_result = bf_result.fillna(float("nan")) pd_result = pd_result.fillna(float("nan")) - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/tests/system/small/test_window.py b/tests/system/small/test_window.py index 29ab581f76f..843ac2a5812 100644 --- a/tests/system/small/test_window.py +++ b/tests/system/small/test_window.py @@ -19,6 +19,7 @@ import pytest from bigframes import dtypes +import bigframes.testing @pytest.fixture(scope="module") @@ -61,7 +62,9 @@ def test_dataframe_rolling_closed_param(rows_rolling_dfs, closed): actual_result = bf_df.rolling(window=3, closed=closed).sum().to_pandas() expected_result = pd_df.rolling(window=3, closed=closed).sum() - pd.testing.assert_frame_equal(actual_result, expected_result, check_dtype=False) + bigframes.testing.assert_frame_equal( + actual_result, expected_result, check_dtype=False + ) @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) @@ -80,7 +83,7 @@ def test_dataframe_groupby_rolling_closed_param(rows_rolling_dfs, closed): expected_result = ( pd_df.groupby(pd_df["int64_too"] % 2).rolling(window=3, closed=closed).sum() ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( actual_result[check_columns], expected_result, check_dtype=False ) @@ -91,7 +94,9 @@ def test_dataframe_rolling_on(rows_rolling_dfs): actual_result = bf_df.rolling(window=3, on="int64_too").sum().to_pandas() expected_result = pd_df.rolling(window=3, on="int64_too").sum() - 
pd.testing.assert_frame_equal(actual_result, expected_result, check_dtype=False) + bigframes.testing.assert_frame_equal( + actual_result, expected_result, check_dtype=False + ) def test_dataframe_rolling_on_invalid_column_raise_error(rows_rolling_dfs): @@ -116,7 +121,7 @@ def test_dataframe_groupby_rolling_on(rows_rolling_dfs): expected_result = ( pd_df.groupby(pd_df["int64_too"] % 2).rolling(window=3, on="float64_col").sum() ) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( actual_result[check_columns], expected_result, check_dtype=False ) @@ -135,7 +140,9 @@ def test_series_rolling_closed_param(rows_rolling_series, closed): actual_result = bf_series.rolling(window=3, closed=closed).sum().to_pandas() expected_result = df_series.rolling(window=3, closed=closed).sum() - pd.testing.assert_series_equal(actual_result, expected_result, check_dtype=False) + bigframes.testing.assert_series_equal( + actual_result, expected_result, check_dtype=False + ) @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) @@ -152,7 +159,9 @@ def test_series_groupby_rolling_closed_param(rows_rolling_series, closed): expected_result = ( df_series.groupby(df_series % 2).rolling(window=3, closed=closed).sum() ) - pd.testing.assert_series_equal(actual_result, expected_result, check_dtype=False) + bigframes.testing.assert_series_equal( + actual_result, expected_result, check_dtype=False + ) @pytest.mark.parametrize( @@ -186,7 +195,9 @@ def test_series_window_agg_ops(rows_rolling_series, windowing, agg_op): actual_result = agg_op(windowing(bf_series)).to_pandas() expected_result = agg_op(windowing(pd_series)) - pd.testing.assert_series_equal(expected_result, actual_result, check_dtype=False) + bigframes.testing.assert_series_equal( + expected_result, actual_result, check_dtype=False + ) @pytest.mark.parametrize( @@ -225,7 +236,7 @@ def test_dataframe_window_agg_ops(scalars_dfs, windowing, agg_op): bf_result = agg_op(windowing(bf_df)).to_pandas() 
pd_result = agg_op(windowing(pd_df)) - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -258,6 +269,10 @@ def test_dataframe_window_agg_ops(scalars_dfs, windowing, agg_op): ], ) def test_dataframe_window_agg_func(scalars_dfs, windowing, func): + if pd.__version__.startswith("3"): + pytest.skip( + "pandas 3.0 bugged for this case 'Length of values (8) does not match length of index (9)'" + ) bf_df, pd_df = scalars_dfs target_columns = ["int64_too", "float64_col", "bool_col", "int64_col"] index_column = "bool_col" @@ -268,7 +283,7 @@ def test_dataframe_window_agg_func(scalars_dfs, windowing, func): pd_result = windowing(pd_df).agg(func) - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) def test_series_window_agg_single_func(scalars_dfs): @@ -281,7 +296,7 @@ def test_series_window_agg_single_func(scalars_dfs): pd_result = pd_series.expanding().agg("sum") - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) def test_series_window_agg_multi_func(scalars_dfs): @@ -294,7 +309,7 @@ def test_series_window_agg_multi_func(scalars_dfs): pd_result = pd_series.expanding().agg(["sum", np.mean]) - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + bigframes.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) @@ -320,7 +335,7 @@ def test_series_range_rolling(range_rolling_dfs, window, closed, ascending): .rolling(window=window, closed=closed) .min() ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_dtype=False, check_index=False ) @@ -341,7 +356,7 @@ def 
test_series_groupby_range_rolling(range_rolling_dfs): expected_result = ( pd_series.sort_index().groupby(pd_series % 2 == 0).rolling(window="3s").min() ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_dtype=False, check_index=False ) @@ -372,7 +387,7 @@ def test_dataframe_range_rolling(range_rolling_dfs, window, closed, ascending): # Need to cast Pandas index type. Otherwise it uses DatetimeIndex that # does not exist in BigFrame expected_result.index = expected_result.index.astype(dtypes.TIMESTAMP_DTYPE) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( actual_result, expected_result, check_dtype=False, @@ -389,7 +404,7 @@ def test_dataframe_range_rolling_on(range_rolling_dfs): # Need to specify the column order because Pandas (seemingly) # re-arranges columns alphabetically cols = ["ts_col", "int_col", "float_col"] - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( actual_result[cols], expected_result[cols], check_dtype=False, @@ -413,7 +428,7 @@ def test_dataframe_groupby_range_rolling(range_rolling_dfs): pd_df.sort_values(on).groupby("int_col").rolling(window="3s", on=on).min() ) expected_result.index = expected_result.index.set_names("index", level=1) - pd.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( actual_result, expected_result, check_dtype=False, @@ -440,7 +455,7 @@ def test_range_rolling_order_info_lookup(range_rolling_dfs): .rolling(window="3s") .count() ) - pd.testing.assert_series_equal( + bigframes.testing.assert_series_equal( actual_result, expected_result, check_dtype=False, check_index=False ) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_timedelta/out.sql index 52bec0921ab..a733ed81278 100644 --- 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_timedelta/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_div_timedelta/out.sql @@ -2,5 +2,9 @@ SELECT `rowindex`, `timestamp_col`, `int64_col`, - CAST(FLOOR(IEEE_DIVIDE(86400000000, `int64_col`)) AS INT64) AS `timedelta_div_numeric` + CAST(IF( + IEEE_DIVIDE(86400000000, `int64_col`) > 0, + FLOOR(IEEE_DIVIDE(86400000000, `int64_col`)), + CEIL(IEEE_DIVIDE(86400000000, `int64_col`)) + ) AS INT64) AS `timedelta_div_numeric` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_timedelta/out.sql index 46a9640df36..8285d1e7d4a 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_timedelta/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_timedelta/out.sql @@ -3,6 +3,14 @@ SELECT `timestamp_col`, `int64_col`, `duration_col`, - CAST(FLOOR(`duration_col` * `int64_col`) AS INT64) AS `timedelta_mul_numeric`, - CAST(FLOOR(`int64_col` * `duration_col`) AS INT64) AS `numeric_mul_timedelta` + CAST(IF( + `duration_col` * `int64_col` > 0, + FLOOR(`duration_col` * `int64_col`), + CEIL(`duration_col` * `int64_col`) + ) AS INT64) AS `timedelta_mul_numeric`, + CAST(IF( + `int64_col` * `duration_col` > 0, + FLOOR(`int64_col` * `duration_col`), + CEIL(`int64_col` * `duration_col`) + ) AS INT64) AS `numeric_mul_timedelta` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/tests/unit/core/test_groupby.py b/tests/unit/core/test_groupby.py index 4bef581b2f7..faee007c3d8 100644 --- a/tests/unit/core/test_groupby.py +++ b/tests/unit/core/test_groupby.py @@ -18,7 +18,7 @@ import bigframes.core.utils as utils 
import bigframes.pandas as bpd -from bigframes.testing.utils import assert_series_equal +import bigframes.testing pytest.importorskip("polars") pytest.importorskip("pandas", minversion="2.0.0") @@ -33,7 +33,7 @@ def test_groupby_df_iter_by_key_singular(polars_session): bf_result = bf_group_df.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -47,7 +47,7 @@ def test_groupby_df_iter_by_key_list(polars_session): bf_result = bf_group_df.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -69,7 +69,7 @@ def test_groupby_df_iter_by_key_list_multiple(polars_session): bf_result = bf_group_df.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -85,7 +85,7 @@ def test_groupby_df_iter_by_level_singular(polars_session): bf_result = bf_group_df.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -109,7 +109,7 @@ def test_groupby_df_iter_by_level_list_one_item(polars_session): assert bf_key == tuple(pd_key) else: assert bf_key == (pd_key,) - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -131,7 +131,7 @@ def test_groupby_df_iter_by_level_list_multiple(polars_session): bf_result = bf_group_df.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_frame_equal( + bigframes.testing.assert_frame_equal( bf_result, pd_result, 
check_dtype=False, check_index_type=False ) @@ -149,7 +149,7 @@ def test_groupby_series_iter_by_level_singular(polars_session): bf_result = bf_group_series.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -175,7 +175,7 @@ def test_groupby_series_iter_by_level_list_one_item(polars_session): assert bf_key == tuple(pd_key) else: assert bf_key == (pd_key,) - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -199,7 +199,7 @@ def test_groupby_series_iter_by_level_list_multiple(polars_session): bf_result = bf_group_df.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -218,7 +218,7 @@ def test_groupby_series_iter_by_series(polars_session): bf_result = bf_group_series.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -237,7 +237,7 @@ def test_groupby_series_iter_by_series_list_one_item(polars_session): bf_result = bf_group_series.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -259,6 +259,6 @@ def test_groupby_series_iter_by_series_list_multiple(polars_session): bf_result = bf_group_series.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - assert_series_equal( + bigframes.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py 
index 263fc82e3e5..dd2c83bd3bb 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -4116,9 +4116,12 @@ def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.to_json() + # pandas 3.0 bugged for serializing date col + bf_result = scalars_df_index.drop(columns="date_col").to_json() # default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.to_json(default_handler=str) + pd_result = scalars_pandas_df_index.drop(columns="date_col").to_json( + default_handler=str + ) assert bf_result == pd_result diff --git a/third_party/bigframes_vendored/ibis/expr/operations/numeric.py b/third_party/bigframes_vendored/ibis/expr/operations/numeric.py index 384323c5965..d8eecff532b 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/numeric.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/numeric.py @@ -158,8 +158,6 @@ class Round(Value): def dtype(self): if self.arg.dtype.is_decimal(): return self.arg.dtype - elif self.digits is None: - return dt.int64 else: return dt.double From c25a6d0151380dde74368a35e13deb7a930b494f Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Tue, 24 Feb 2026 17:43:24 -0500 Subject: [PATCH 16/29] feat: add support for Python 3.14 (#2232) Co-authored-by: Lingqing Gan Co-authored-by: Shuowei Li Co-authored-by: Tim Swena Co-authored-by: Trevor Bergeron --- .github/workflows/lint.yml | 2 +- .github/workflows/unittest.yml | 4 +- .pre-commit-config.yaml | 2 +- CONTRIBUTING.rst | 10 ++-- GEMINI.md | 4 +- bigframes/_config/auth.py | 6 +-- bigframes/core/block_transforms.py | 1 - bigframes/core/blocks.py | 4 +- .../ibis_compiler/scalar_op_registry.py | 8 ---- bigframes/core/expression_factoring.py | 6 +-- bigframes/core/indexes/base.py | 1 - bigframes/core/indexes/datetimes.py | 1 - bigframes/core/window/rolling.py | 1 - 
bigframes/ml/cluster.py | 1 - bigframes/ml/decomposition.py | 1 - bigframes/ml/ensemble.py | 3 -- bigframes/ml/imported.py | 9 ++-- bigframes/ml/impute.py | 1 - bigframes/ml/model_selection.py | 1 - bigframes/ml/preprocessing.py | 1 - bigframes/ml/sql.py | 15 ++++-- bigframes/operations/blob.py | 3 +- bigframes/session/__init__.py | 4 +- bigframes/session/bq_caching_executor.py | 1 - bigframes/session/loader.py | 1 - .../getting_started_bq_dataframes.ipynb | 14 ------ notebooks/location/regionalized.ipynb | 14 ------ .../remote_functions/remote_function.ipynb | 15 ------ .../remote_function_usecases.ipynb | 14 ------ noxfile.py | 47 ++++++------------- samples/polars/noxfile.py | 2 +- samples/snippets/noxfile.py | 2 +- setup.py | 1 + .../large/functions/test_managed_function.py | 10 ---- .../large/functions/test_remote_function.py | 6 --- tests/system/load/test_llm.py | 1 - tests/system/small/ml/test_metrics.py | 1 - tests/system/small/ml/test_model_selection.py | 8 +++- tests/system/small/test_anywidget.py | 2 - tests/system/small/test_dataframe.py | 3 -- tests/system/small/test_dataframe_io.py | 2 - tests/system/small/test_index_io.py | 2 - tests/system/small/test_null_index.py | 1 - tests/system/small/test_pandas.py | 8 ++-- tests/system/small/test_pandas_options.py | 1 - tests/system/small/test_series.py | 1 - tests/system/small/test_series_io.py | 1 - tests/system/small/test_session.py | 2 - .../sqlglot/expressions/test_string_ops.py | 1 - .../compile/sqlglot/test_compile_concat.py | 1 - .../functions/test_remote_function_utils.py | 2 - tests/unit/test_dataframe_polars.py | 2 - tests/unit/test_series_polars.py | 1 - .../sklearn/preprocessing/_encoder.py | 3 +- 54 files changed, 70 insertions(+), 189 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7914b72651e..b848262c3aa 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 
with: - python-version: "3.10" + python-version: "3.14" - name: Install nox run: | python -m pip install --upgrade setuptools pip wheel diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 2455f7abc4c..f3e3dae6c5e 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python: ['3.10', '3.11', '3.12', '3.13'] + python: ['3.10', '3.11', '3.12', '3.13', '3.14'] steps: - name: Checkout uses: actions/checkout@v4 @@ -48,7 +48,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: "3.14" - name: Install coverage run: | python -m pip install --upgrade setuptools pip wheel diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 096bdeb2a78..2dc978f0032 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,7 +32,7 @@ repos: - id: isort name: isort (python) - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 23.7.0 hooks: - id: black - repo: https://github.com/pycqa/flake8 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 7ac410bbf7a..97442e8dc5a 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. - The feature must work fully on the following CPython versions: - 3.10, 3.11, 3.12 and 3.13 on both UNIX and Windows. + 3.10, 3.11, 3.12, 3.13 and 3.14 on both UNIX and Windows. - The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -72,7 +72,7 @@ We use `nox `__ to instrument our tests. - To run a single unit test:: - $ nox -s unit-3.13 -- -k + $ nox -s unit-3.14 -- -k .. note:: @@ -143,12 +143,12 @@ Running System Tests $ nox -s system # Run a single system test - $ nox -s system-3.13 -- -k + $ nox -s system-3.14 -- -k .. note:: - System tests are only configured to run under Python 3.10, 3.11, 3.12 and 3.13. 
+ System tests are only configured to run under Python 3.10, 3.12 and 3.14. For expediency, we do not run them in older versions of Python 3. This alone will not run the tests. You'll need to change some local @@ -262,11 +262,13 @@ We support: - `Python 3.11`_ - `Python 3.12`_ - `Python 3.13`_ +- `Python 3.14`_ .. _Python 3.10: https://docs.python.org/3.10/ .. _Python 3.11: https://docs.python.org/3.11/ .. _Python 3.12: https://docs.python.org/3.12/ .. _Python 3.13: https://docs.python.org/3.13/ +.. _Python 3.14: https://docs.python.org/3.14/ Supported versions can be found in our ``noxfile.py`` `config`_. diff --git a/GEMINI.md b/GEMINI.md index 0d447f17a48..1c8cff33870 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -13,7 +13,7 @@ We use `nox` to instrument our tests. - To run a single unit test: ```bash - nox -r -s unit-3.13 -- -k + nox -r -s unit-3.14 -- -k ``` - Ignore this step if you lack access to Google Cloud resources. To run system @@ -23,7 +23,7 @@ We use `nox` to instrument our tests. $ nox -r -s system # Run a single system test - $ nox -r -s system-3.13 -- -k + $ nox -r -s system-3.14 -- -k - The codebase must have better coverage than it had previously after each change. 
You can test coverage via `nox -s unit system cover` (takes a long diff --git a/bigframes/_config/auth.py b/bigframes/_config/auth.py index 1574fc48835..ccb5fcbedb8 100644 --- a/bigframes/_config/auth.py +++ b/bigframes/_config/auth.py @@ -30,9 +30,9 @@ _cached_project_default: Optional[str] = None -def get_default_credentials_with_project() -> tuple[ - google.auth.credentials.Credentials, Optional[str] -]: +def get_default_credentials_with_project() -> ( + tuple[google.auth.credentials.Credentials, Optional[str]] +): global _AUTH_LOCK, _cached_credentials, _cached_project_default with _AUTH_LOCK: diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 5c6395d1714..ac6bbcb115c 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -618,7 +618,6 @@ def skew( skew_column_ids: typing.Sequence[str], grouping_column_ids: typing.Sequence[str] = (), ) -> blocks.Block: - original_columns = skew_column_ids column_labels = block.select_columns(original_columns).column_labels diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index ff7f2b9899b..239eedf6d3c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -657,7 +657,6 @@ def _get_sampling_option( sampling_method: Optional[str] = None, random_state: Optional[int] = None, ) -> sampling_options.SamplingOptions: - if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS): raise NotImplementedError( f"The downsampling method {sampling_method} is not implemented, " @@ -700,7 +699,8 @@ def to_pandas_batches( """Download results one message at a time. 
page_size and max_results determine the size and number of batches, - see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result""" + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result + """ under_10gb = ( (not allow_large_results) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index d89c239cf4d..3ae98a267e1 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1006,7 +1006,6 @@ def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp): else timestamp(x) ) elif x.type() == ibis_dtypes.Timestamp(None): # type: ignore - return timestamp(x) else: # Numerical inputs. @@ -1908,7 +1907,6 @@ def struct_op_impl( def ai_generate( *values: ibis_types.Value, op: ops.AIGenerate ) -> ibis_types.StructValue: - return ai_ops.AIGenerate( _construct_prompt(values, op.prompt_context), # type: ignore op.connection_id, # type: ignore @@ -1923,7 +1921,6 @@ def ai_generate( def ai_generate_bool( *values: ibis_types.Value, op: ops.AIGenerateBool ) -> ibis_types.StructValue: - return ai_ops.AIGenerateBool( _construct_prompt(values, op.prompt_context), # type: ignore op.connection_id, # type: ignore @@ -1937,7 +1934,6 @@ def ai_generate_bool( def ai_generate_int( *values: ibis_types.Value, op: ops.AIGenerateInt ) -> ibis_types.StructValue: - return ai_ops.AIGenerateInt( _construct_prompt(values, op.prompt_context), # type: ignore op.connection_id, # type: ignore @@ -1951,7 +1947,6 @@ def ai_generate_int( def ai_generate_double( *values: ibis_types.Value, op: ops.AIGenerateDouble ) -> ibis_types.StructValue: - return ai_ops.AIGenerateDouble( _construct_prompt(values, op.prompt_context), # type: ignore op.connection_id, # type: ignore 
@@ -1963,7 +1958,6 @@ def ai_generate_double( @scalar_op_compiler.register_nary_op(ops.AIIf, pass_op=True) def ai_if(*values: ibis_types.Value, op: ops.AIIf) -> ibis_types.StructValue: - return ai_ops.AIIf( _construct_prompt(values, op.prompt_context), # type: ignore op.connection_id, # type: ignore @@ -1974,7 +1968,6 @@ def ai_if(*values: ibis_types.Value, op: ops.AIIf) -> ibis_types.StructValue: def ai_classify( *values: ibis_types.Value, op: ops.AIClassify ) -> ibis_types.StructValue: - return ai_ops.AIClassify( _construct_prompt(values, op.prompt_context), # type: ignore op.categories, # type: ignore @@ -1984,7 +1977,6 @@ def ai_classify( @scalar_op_compiler.register_nary_op(ops.AIScore, pass_op=True) def ai_score(*values: ibis_types.Value, op: ops.AIScore) -> ibis_types.StructValue: - return ai_ops.AIScore( _construct_prompt(values, op.prompt_context), # type: ignore op.connection_id, # type: ignore diff --git a/bigframes/core/expression_factoring.py b/bigframes/core/expression_factoring.py index b58330f5a45..208fc78ebdb 100644 --- a/bigframes/core/expression_factoring.py +++ b/bigframes/core/expression_factoring.py @@ -381,9 +381,9 @@ def graph_extract_scalar_exprs() -> Sequence[nodes.ColumnDef]: # TODO: We can prune expressions that won't be reused here, return tuple(nodes.ColumnDef(expr, id) for id, expr in results.items()) - def graph_extract_window_expr() -> Optional[ - Tuple[Sequence[nodes.ColumnDef], window_spec.WindowSpec] - ]: + def graph_extract_window_expr() -> ( + Optional[Tuple[Sequence[nodes.ColumnDef], window_spec.WindowSpec]] + ): for id in graph.sinks: next_def = by_id[id] if isinstance(next_def.expression, agg_expressions.WindowExpression): diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 011639ed9ef..f7438307f7e 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -211,7 +211,6 @@ def is_monotonic_increasing(self) -> bool: @property @validations.requires_ordering() def 
is_monotonic_decreasing(self) -> bool: - return typing.cast( bool, self._block.is_monotonic_decreasing(self._block.index_columns), diff --git a/bigframes/core/indexes/datetimes.py b/bigframes/core/indexes/datetimes.py index ec5174e8a9a..763e44be095 100644 --- a/bigframes/core/indexes/datetimes.py +++ b/bigframes/core/indexes/datetimes.py @@ -28,7 +28,6 @@ @docs.inherit_docs(vendored_pandas_datetime_index.DatetimeIndex) class DatetimeIndex(Index): - # Must be above 5000 for pandas to delegate to bigframes for binops __pandas_priority__ = 12000 diff --git a/bigframes/core/window/rolling.py b/bigframes/core/window/rolling.py index 97af59edf5b..9946da39940 100644 --- a/bigframes/core/window/rolling.py +++ b/bigframes/core/window/rolling.py @@ -218,7 +218,6 @@ def create_range_window( grouping_keys: Sequence[str] = tuple(), drop_null_groups: bool = True, ) -> Window: - if on is None: # Rolling on index index_dtypes = block.index.dtypes diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index f371be0cf38..827b9158626 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -44,7 +44,6 @@ class KMeans( base.UnsupervisedTrainablePredictor, bigframes_vendored.sklearn.cluster._kmeans.KMeans, ): - __doc__ = bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__ def __init__( diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index ca5ff102b44..7497ecbcbfe 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -226,7 +226,6 @@ def __init__( # TODO: Add support for hyperparameter tuning. 
l2_reg: float = 1.0, ): - feedback_type = feedback_type.lower() # type: ignore if feedback_type not in ("explicit", "implicit"): raise ValueError("Expected feedback_type to be `explicit` or `implicit`.") diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 7cd7079dfbd..67a51b702bb 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -213,7 +213,6 @@ class XGBClassifier( base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.xgboost.sklearn.XGBClassifier, ): - __doc__ = bigframes_vendored.xgboost.sklearn.XGBClassifier.__doc__ def __init__( @@ -370,7 +369,6 @@ class RandomForestRegressor( base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, ): - __doc__ = bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor.__doc__ def __init__( @@ -536,7 +534,6 @@ class RandomForestClassifier( base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, ): - __doc__ = bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier.__doc__ def __init__( diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 56b5d6735c9..9362303dcd1 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -73,7 +73,8 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: Input DataFrame. Schema is defined by the model. Returns: - bigframes.dataframe.DataFrame: Output DataFrame. Schema is defined by the model.""" + bigframes.dataframe.DataFrame: Output DataFrame. Schema is defined by the model. + """ if not self._bqml_model: if self.model_path is None: @@ -152,7 +153,8 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: Input DataFrame or Series. Schema is defined by the model. Returns: - bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" + bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model. 
+ """ if not self._bqml_model: if self.model_path is None: @@ -271,7 +273,8 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: Input DataFrame or Series. Schema is defined by the model. Returns: - bigframes.dataframe.DataFrame: Output DataFrame. Schema is defined by the model.""" + bigframes.dataframe.DataFrame: Output DataFrame. Schema is defined by the model. + """ if not self._bqml_model: if self.model_path is None: diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py index b3da895201d..d9be5832612 100644 --- a/bigframes/ml/impute.py +++ b/bigframes/ml/impute.py @@ -33,7 +33,6 @@ class SimpleImputer( base.Transformer, bigframes_vendored.sklearn.impute._base.SimpleImputer, ): - __doc__ = bigframes_vendored.sklearn.impute._base.SimpleImputer.__doc__ def __init__( diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 7dd13ec4d73..d93a761fef1 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -41,7 +41,6 @@ def train_test_split( stratify: Union[bpd.Series, None] = None, shuffle: bool = True, ) -> List[Union[bpd.DataFrame, bpd.Series]]: - if test_size is None: if train_size is None: test_size = 0.25 diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 22a3e7e2227..4d1050ae99b 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -328,7 +328,6 @@ def _compile_to_sql( ] elif self.strategy == "quantile": - return [ self._base_sql_generator.ml_quantile_bucketize( column, self.n_bins, f"kbinsdiscretizer_{column}" diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 2937368c92c..09a46b235d9 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -160,7 +160,8 @@ def ml_one_hot_encoder( name: str, ) -> str: """Encode ML.ONE_HOT_ENCODER for BQML. 
- https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder for params.""" + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder for params. + """ return f"""ML.ONE_HOT_ENCODER({sql_utils.identifier(numeric_expr_sql)}, '{drop}', {top_k}, {frequency_threshold}) OVER() AS {sql_utils.identifier(name)}""" def ml_label_encoder( @@ -171,14 +172,16 @@ def ml_label_encoder( name: str, ) -> str: """Encode ML.LABEL_ENCODER for BQML. - https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params.""" + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params. + """ return f"""ML.LABEL_ENCODER({sql_utils.identifier(numeric_expr_sql)}, {top_k}, {frequency_threshold}) OVER() AS {sql_utils.identifier(name)}""" def ml_polynomial_expand( self, columns: Iterable[str], degree: int, name: str ) -> str: """Encode ML.POLYNOMIAL_EXPAND. - https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand""" + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand + """ return f"""ML.POLYNOMIAL_EXPAND({self.struct_columns(columns)}, {degree}) AS {sql_utils.identifier(name)}""" def ml_distance( @@ -190,7 +193,8 @@ def ml_distance( name: str, ) -> str: """Encode ML.DISTANCE for BQML. - https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance""" + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance + """ return f"""SELECT *, ML.DISTANCE({sql_utils.identifier(col_x)}, {sql_utils.identifier(col_y)}, '{type}') AS {sql_utils.identifier(name)} FROM ({source_sql})""" def ai_forecast( @@ -199,7 +203,8 @@ def ai_forecast( options: Mapping[str, Union[int, float, bool, Iterable[str]]], ): """Encode AI.FORECAST. 
- https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-forecast""" + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-forecast + """ named_parameters_sql = self.build_named_parameters(**options) return f"""SELECT * FROM AI.FORECAST(({source_sql}),{named_parameters_sql})""" diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index fd8509672dd..120979261f6 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -79,7 +79,8 @@ def metadata(self) -> bigframes.series.Series: """Retrieve the metadata of the Blob. Returns: - bigframes.series.Series: JSON metadata of the Blob. Contains fields: content_type, md5_hash, size and updated(time).""" + bigframes.series.Series: JSON metadata of the Blob. Contains fields: content_type, md5_hash, size and updated(time). + """ series_to_check = bigframes.series.Series(self._data._block) # Check if it's a struct series from a verbose operation if dtypes.is_struct_like(series_to_check.dtype): diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 23f4178f3dd..c42270c4ddc 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1383,7 +1383,6 @@ def read_json( write_engine=write_engine, ) if engine == "bigquery": - if dtype is not None: raise NotImplementedError( "BigQuery engine does not support the dtype arguments." @@ -2262,7 +2261,8 @@ def _create_bq_connection( iam_role: Optional[str] = None, ) -> str: """Create the connection with the session settings and try to attach iam role to the connection SA. - If any of project, location or connection isn't specified, use the session defaults. Returns fully-qualified connection name.""" + If any of project, location or connection isn't specified, use the session defaults. Returns fully-qualified connection name. 
+ """ connection = self.bq_connection if not connection else connection connection = bigframes.clients.get_canonical_bq_connection_id( connection_id=connection, diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 5ef91a4b6f2..943eee0c12d 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -633,7 +633,6 @@ def _execute_plan_gbq( create_table = True if not cache_spec.cluster_cols: - offsets_id = bigframes.core.identifiers.ColumnId( bigframes.core.guid.generate_guid() ) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index bfef5f809d9..9d222a3755a 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -1453,7 +1453,6 @@ def _start_query_with_job( def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: - """ For backwards-compatibility, convert any previously client-side only parameters such as timeoutMs to the property name expected by the REST API. diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index fa88cf65bbb..f9fb950c534 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -1329,20 +1329,6 @@ "Running your own Python functions (or being able to bring your packages) and using them at scale is a challenge many data scientists face. BigQuery DataFrames makes it easy to deploy [remote functions](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_remote_function) that run scalar Python functions at BigQuery scale. These functions are persisted as [BigQuery remote functions](https://cloud.google.com/bigquery/docs/remote-functions) that you can then re-use." 
] }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "# Python 3.13 is not yet a supported runtime for remote functions.\n", - "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", - "if sys.version_info >= (3, 13, 0):\n", - " sys.exit(0)" - ] - }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/location/regionalized.ipynb b/notebooks/location/regionalized.ipynb index 066cd181364..23313ec0c4c 100644 --- a/notebooks/location/regionalized.ipynb +++ b/notebooks/location/regionalized.ipynb @@ -1339,20 +1339,6 @@ "# Using the Remote Functions" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "# Python 3.13 is not yet a supported runtime for remote functions.\n", - "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", - "if sys.version_info >= (3, 13, 0):\n", - " sys.exit(0)" - ] - }, { "attachments": {}, "cell_type": "markdown", diff --git a/notebooks/remote_functions/remote_function.ipynb b/notebooks/remote_functions/remote_function.ipynb index e2bc88ecae7..4c0524d4026 100644 --- a/notebooks/remote_functions/remote_function.ipynb +++ b/notebooks/remote_functions/remote_function.ipynb @@ -1,20 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "bcff4fc4", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "# Python 3.13 is not yet a supported runtime for remote functions.\n", - "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", - "if sys.version_info >= (3, 13, 0):\n", - " sys.exit(0)" - ] - }, { "cell_type": "code", "execution_count": 19, diff --git a/notebooks/remote_functions/remote_function_usecases.ipynb b/notebooks/remote_functions/remote_function_usecases.ipynb index 
03ae6520952..e3a94160ad9 100644 --- a/notebooks/remote_functions/remote_function_usecases.ipynb +++ b/notebooks/remote_functions/remote_function_usecases.ipynb @@ -21,20 +21,6 @@ "# limitations under the License." ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "# Python 3.13 is not yet a supported runtime for remote functions.\n", - "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", - "if sys.version_info >= (3, 13, 0):\n", - " sys.exit(0)" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/noxfile.py b/noxfile.py index 89e9c7684f2..888f9fd765a 100644 --- a/noxfile.py +++ b/noxfile.py @@ -27,14 +27,11 @@ import nox import nox.sessions -BLACK_VERSION = "black==22.3.0" +BLACK_VERSION = "black==23.7.0" FLAKE8_VERSION = "flake8==7.1.2" ISORT_VERSION = "isort==5.12.0" MYPY_VERSION = "mypy==1.15.0" -# TODO: switch to 3.13 once remote functions / cloud run adds a runtime for it (internal issue 333742751) -LATEST_FULLY_SUPPORTED_PYTHON = "3.12" - # Notebook tests should match colab and BQ Studio. # Check with import sys; sys.version_info # on a fresh notebook runtime. @@ -57,13 +54,9 @@ "setup.py", ] -DEFAULT_PYTHON_VERSION = "3.10" - -# Cloud Run Functions supports Python versions up to 3.12 -# https://cloud.google.com/run/docs/runtimes/python -E2E_TEST_PYTHON_VERSION = "3.12" +DEFAULT_PYTHON_VERSION = "3.14" -UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"] +UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] UNIT_TEST_STANDARD_DEPENDENCIES = [ "mock", PYTEST_VERSION, @@ -78,13 +71,14 @@ # Make sure we leave some versions without "extras" so we know those # dependencies are actually optional. "3.13": ["tests", "polars", "scikit-learn", "anywidget"], + "3.14": ["tests", "polars", "scikit-learn", "anywidget"], } # 3.11 is used by colab. 
# 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. For more information, search # bigframes/windows-docker, internally. -SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", @@ -106,8 +100,9 @@ # Make sure we leave some versions without "extras" so we know those # dependencies are actually optional. "3.10": ["tests", "scikit-learn", "anywidget"], - LATEST_FULLY_SUPPORTED_PYTHON: ["tests", "scikit-learn", "polars", "anywidget"], + "3.12": ["tests", "scikit-learn", "polars", "anywidget"], "3.13": ["tests", "polars", "anywidget"], + "3.14": ["tests", "polars", "anywidget"], } LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -123,7 +118,7 @@ # from GitHub actions. "unit_noextras", "system-3.10", # No extras. - f"system-{LATEST_FULLY_SUPPORTED_PYTHON}", # All extras. + f"system-{DEFAULT_PYTHON_VERSION}", # All extras. 
"cover", # TODO(b/401609005): remove "cleanup", @@ -257,7 +252,7 @@ def unit_noextras(session): run_unit(session, install_test_extra=False) -@nox.session(python=DEFAULT_PYTHON_VERSION) +@nox.session(python="3.10") def mypy(session): """Run type checks with mypy.""" # Editable mode is not compatible with mypy when there are multiple @@ -400,7 +395,7 @@ def system(session: nox.sessions.Session): ) -@nox.session(python=LATEST_FULLY_SUPPORTED_PYTHON) +@nox.session(python=DEFAULT_PYTHON_VERSION) def system_noextras(session: nox.sessions.Session): """Run the system test suite.""" run_system( @@ -411,7 +406,7 @@ def system_noextras(session: nox.sessions.Session): ) -@nox.session(python=LATEST_FULLY_SUPPORTED_PYTHON) +@nox.session(python="3.12") def doctest(session: nox.sessions.Session): """Run the system test suite.""" run_system( @@ -439,7 +434,7 @@ def doctest(session: nox.sessions.Session): ) -@nox.session(python=E2E_TEST_PYTHON_VERSION) +@nox.session(python=DEFAULT_PYTHON_VERSION) def e2e(session: nox.sessions.Session): """Run the large tests in system test suite.""" run_system( @@ -537,7 +532,7 @@ def docs(session): ) -@nox.session(python=DEFAULT_PYTHON_VERSION) +@nox.session(python="3.10") def docfx(session): """Build the docfx yaml files for this library.""" @@ -734,20 +729,6 @@ def notebook(session: nox.Session): "notebooks/dataframes/anywidget_mode.ipynb", ] - # TODO: remove exception for Python 3.13 cloud run adds a runtime for it (internal issue 333742751) - # TODO: remove exception for Python 3.13 if nbmake adds support for - # sys.exit(0) or pytest.skip(...). 
- # See: https://github.com/treebeardtech/nbmake/issues/134 - if session.python == "3.13": - denylist.extend( - [ - "notebooks/getting_started/getting_started_bq_dataframes.ipynb", - "notebooks/remote_functions/remote_function_usecases.ipynb", - "notebooks/remote_functions/remote_function_vertex_claude_model.ipynb", - "notebooks/remote_functions/remote_function.ipynb", - ] - ) - # Convert each Path notebook object to a string using a list comprehension, # and remove tests that we choose not to test. notebooks = [str(nb) for nb in notebooks_list] @@ -922,7 +903,7 @@ def benchmark(session: nox.Session): ) -@nox.session(python="3.10") +@nox.session(python=DEFAULT_PYTHON_VERSION) def release_dry_run(session): env = {} diff --git a/samples/polars/noxfile.py b/samples/polars/noxfile.py index 494639d2fa5..782da043299 100644 --- a/samples/polars/noxfile.py +++ b/samples/polars/noxfile.py @@ -88,7 +88,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index 494639d2fa5..782da043299 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -88,7 +88,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. 
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/setup.py b/setup.py index 32fa628458b..2179fe3e964 100644 --- a/setup.py +++ b/setup.py @@ -144,6 +144,7 @@ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", "Topic :: Internet", ], diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 732123ec847..a74ff292732 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -32,7 +32,6 @@ def test_managed_function_array_output(session, scalars_dfs, dataset_id): try: - with warnings.catch_warnings(record=True) as record: @session.udf( @@ -85,7 +84,6 @@ def featurize(x: int) -> list[float]: def test_managed_function_series_apply(session, dataset_id, scalars_dfs): try: - # An explicit name with "def" in it is used to test the robustness of # the user code extraction logic, which depends on that term. bq_name = f"{prefixer.create_prefix()}_def_to_test_code_extraction" @@ -145,7 +143,6 @@ def test_managed_function_series_apply_array_output( scalars_dfs, ): try: - with pytest.warns(bfe.PreviewWarning, match="udf is in preview."): @session.udf(dataset=dataset_id, name=prefixer.create_prefix()) @@ -233,7 +230,6 @@ def add(x: int, y: int) -> int: def test_managed_function_series_combine_array_output(session, dataset_id, scalars_dfs): try: - # The type hints in this function's signature has conflicts. The # `input_types` and `output_type` arguments from udf decorator take # precedence and will be used instead. 
@@ -451,7 +447,6 @@ def foo(x, y, z): return [str(x), str(y), z] try: - assert getattr(foo, "is_row_processor") is False assert getattr(foo, "input_dtypes") == expected_dtypes assert getattr(foo, "output_dtype") == pandas.ArrowDtype( @@ -771,7 +766,6 @@ def analyze(row): "\nenvironment may not precisely match your local environment." ), ): - analyze_mf = session.udf( input_types=pandas.Series, output_type=str, @@ -1087,7 +1081,6 @@ def analyze(s: pandas.Series, x: bool, y: float) -> str: def test_managed_function_df_where_mask(session, dataset_id, scalars_dfs): try: - # The return type has to be bool type for callable where condition. def is_sum_positive(a, b): return a + b > 0 @@ -1154,7 +1147,6 @@ def is_sum_positive(a, b): def test_managed_function_df_where_mask_series(session, dataset_id, scalars_dfs): try: - # The return type has to be bool type for callable where condition. def is_sum_positive_series(s): return s["int64_col"] + s["int64_too"] > 0 @@ -1254,7 +1246,6 @@ def the_sum(s: pandas.Series) -> int: def test_managed_function_series_where_mask_map(session, dataset_id, scalars_dfs): try: - # The return type has to be bool type for callable where condition. def _is_positive(s): return s + 1000 > 0 @@ -1307,7 +1298,6 @@ def _is_positive(s): def test_managed_function_series_apply_args(session, dataset_id, scalars_dfs): try: - with pytest.warns(bfe.PreviewWarning, match="udf is in preview."): @session.udf(dataset=dataset_id, name=prefixer.create_prefix()) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index f8c4c472a9c..4b5d143c157 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -842,7 +842,6 @@ def test_remote_function_with_external_package_dependencies( session, scalars_dfs, dataset_id, bq_cf_connection ): try: - # The return type hint in this function's signature has conflict. 
The # `output_type` argument from remote_function decorator takes precedence # and will be used instead. @@ -897,7 +896,6 @@ def test_remote_function_with_explicit_name_reuse( session, scalars_dfs, dataset_id, bq_cf_connection ): try: - dirs_to_cleanup = [] # Define a user code @@ -1251,7 +1249,6 @@ def test_remote_function_via_session_custom_sa(scalars_dfs): rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) try: - # TODO(shobs): Figure out why the default ingress setting # (internal-only) does not work here @rf_session.remote_function( @@ -1324,7 +1321,6 @@ def test_remote_function_via_session_custom_build_sa( rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) try: - # TODO(shobs): Figure out why the default ingress setting # (internal-only) does not work here @rf_session.remote_function( @@ -3007,7 +3003,6 @@ def foo(x: int) -> int: @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_df_where_mask(session, dataset_id, scalars_dfs): try: - # The return type has to be bool type for callable where condition. def is_sum_positive(a, b): return a + b > 0 @@ -3086,7 +3081,6 @@ def the_sum(a, b): @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_df_where_mask_series(session, dataset_id, scalars_dfs): try: - # The return type has to be bool type for callable where condition. 
def is_sum_positive_series(s: pandas.Series) -> bool: return s["int64_col"] + s["int64_too"] > 0 diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 22a6f0bfd2e..4e8bc65912f 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -147,7 +147,6 @@ def test_claude3_text_generator_predict_with_params_success( def test_claude3_text_generator_predict_multi_col_success( llm_text_df, session, bq_connection ): - llm_text_df["additional_col"] = 1 claude3_text_generator_model = llm.Claude3TextGenerator( model_name="claude-3-haiku", connection_name=bq_connection, session=session diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index a0f0f6b48cf..5c595898164 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -807,7 +807,6 @@ def test_precision_score_binary_default_arguments(session): def test_precision_score_binary_invalid_input_raise_error( session, y_true, y_pred, pos_label ): - bf_y_true = session.read_pandas(y_true) bf_y_pred = session.read_pandas(y_pred) diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index ebce6e405a5..cbea1e20a0a 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -323,7 +323,13 @@ def test_train_test_split_value_error(penguins_df_default_index, train_size, tes ) def test_train_test_split_stratify(df_fixture, request): df = request.getfixturevalue(df_fixture) - X = df[["species", "island", "culmen_length_mm",]].rename( + X = df[ + [ + "species", + "island", + "culmen_length_mm", + ] + ].rename( columns={"species": "x_species"} ) # Keep "species" col just for easy checking. Rename to avoid conflicts. 
y = df[["species"]] diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index fad8f5b2b50..b9f7b87f5ea 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -406,7 +406,6 @@ def test_widget_with_empty_dataframe_should_have_zero_row_count( def test_widget_with_empty_dataframe_should_render_table_headers( empty_bf_df: bf.dataframe.DataFrame, ): - """ @@ -422,7 +421,6 @@ def test_widget_with_empty_dataframe_should_render_table_headers( """ with bigframes.option_context("display.repr_mode", "anywidget"): - from bigframes.display import TableWidget widget = TableWidget(empty_bf_df) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index c1692b00a68..8caeabb98bc 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -949,7 +949,6 @@ def test_repr_w_display_options(scalars_dfs, session): with bigframes.option_context( "display.max_rows", 10, "display.max_columns", 5, "display.max_colwidth", 10 ): - # When there are 10 or fewer rows, the outputs should be identical except for the extra note. 
actual = scalars_df.head(10).__repr__() executions_post = metrics.execution_count @@ -2691,7 +2690,6 @@ def test_df_idxmax(): ], ) def test_df_align(join, axis): - index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") @@ -5084,7 +5082,6 @@ def test_iloc_list_multiindex(scalars_dfs): def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): - index_list: List[int] = [] bf_result = scalars_df_index.iloc[index_list] diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 8813afd1a33..cce230ae17d 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -253,7 +253,6 @@ def test_to_pandas_override_global_option(scalars_df_index): # Direct call to_pandas uses global default setting (allow_large_results=True), # table has 'bqdf' prefix. with bigframes.option_context("compute.allow_large_results", True): - scalars_df_index.to_pandas() table_id = scalars_df_index._query_job.destination.table_id assert table_id is not None @@ -324,7 +323,6 @@ def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index): def test_to_arrow_override_global_option(scalars_df_index): # Direct call to_arrow uses global default setting (allow_large_results=True), with bigframes.option_context("compute.allow_large_results", True): - scalars_df_index.to_arrow() table_id = scalars_df_index._query_job.destination.table_id assert table_id is not None diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py index 306b15e67a2..b4d7c06da52 100644 --- a/tests/system/small/test_index_io.py +++ b/tests/system/small/test_index_io.py @@ -18,7 +18,6 @@ def test_to_pandas_override_global_option(scalars_df_index): with bigframes.option_context("compute.allow_large_results", True): - bf_index = scalars_df_index.index # Direct call to_pandas uses global default setting (allow_large_results=True), @@ -43,7 
+42,6 @@ def test_to_pandas_dry_run(scalars_df_index): def test_to_numpy_override_global_option(scalars_df_index): with bigframes.option_context("compute.allow_large_results", True): - bf_index = scalars_df_index.index # Direct call to_numpy uses global default setting (allow_large_results=True), diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index 4aa7ba8c77c..eb9dc114dde 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -381,7 +381,6 @@ def test_null_index_df_concat(scalars_df_null_index, scalars_pandas_df_default_i def test_null_index_map_dict_input( scalars_df_null_index, scalars_pandas_df_default_index ): - local_map = dict() # construct a local map, incomplete to cover behavior for s in scalars_pandas_df_default_index.string_col[:-3]: diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index ccef51b1e93..d83955ecde7 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -102,7 +102,7 @@ def test_get_dummies_dataframe(scalars_dfs, kwargs): # dtype argument above is needed for pandas v1 only # adjust for expected dtype differences - for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + for column_name, type_name in zip(pd_result.columns, pd_result.dtypes): if type_name == "bool": pd_result[column_name] = pd_result[column_name].astype("boolean") @@ -139,7 +139,7 @@ def test_get_dummies_dataframe_duplicate_labels(scalars_dfs): # dtype argument above is needed for pandas v1 only # adjust for expected dtype differences - for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + for column_name, type_name in zip(pd_result.columns, pd_result.dtypes): if type_name == "bool": pd_result[column_name] = pd_result[column_name].astype("boolean") @@ -156,7 +156,7 @@ def test_get_dummies_series(scalars_dfs): # dtype argument above is needed for pandas v1 only # adjust for expected 
dtype differences - for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + for column_name, type_name in zip(pd_result.columns, pd_result.dtypes): if type_name == "bool": # pragma: NO COVER pd_result[column_name] = pd_result[column_name].astype("boolean") pd_result.columns = pd_result.columns.astype(object) @@ -177,7 +177,7 @@ def test_get_dummies_series_nameless(scalars_dfs): # dtype argument above is needed for pandas v1 only # adjust for expected dtype differences - for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + for column_name, type_name in zip(pd_result.columns, pd_result.dtypes): if type_name == "bool": # pragma: NO COVER pd_result[column_name] = pd_result[column_name].astype("boolean") pd_result.columns = pd_result.columns.astype(object) diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index 7a750ddfd3c..a2a90f3fe52 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -50,7 +50,6 @@ def test_read_gbq_start_sets_session_location( query_prefix, reset_default_session_and_location, ): - # Form query as a table name or a SQL depending on the test scenario query_tokyo = test_data_tables_tokyo["scalars"] query = test_data_tables["scalars"] diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 7019bcff109..51d0cc61f04 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3011,7 +3011,6 @@ def test_value_counts_w_cut(scalars_dfs): def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["string_col"].iloc[1:].iloc[1:].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[1:].iloc[1:] diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py index 426679d37d0..2f1780812ae 100644 --- a/tests/system/small/test_series_io.py +++ b/tests/system/small/test_series_io.py @@ 
-22,7 +22,6 @@ def test_to_pandas_override_global_option(scalars_df_index): with bigframes.option_context("compute.allow_large_results", True): - bf_series = scalars_df_index["int64_col"] # Direct call to_pandas uses global default setting (allow_large_results=True) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 9d37f23f187..922f73a0ce1 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -874,7 +874,6 @@ def test_read_pandas(session, scalars_dfs): def test_read_pandas_series(session): - idx: pd.Index = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) pd_series = pd.Series([3, 1, 4, 1, 5], dtype=pd.Int64Dtype(), index=idx) bf_series = session.read_pandas(pd_series) @@ -883,7 +882,6 @@ def test_read_pandas_series(session): def test_read_pandas_index(session): - pd_idx: pd.Index = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) bf_idx = session.read_pandas(pd_idx) diff --git a/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py index b1fbbb0fc9b..fff2cc06df4 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_string_ops.py @@ -183,7 +183,6 @@ def test_rstrip(scalar_types_df: bpd.DataFrame, snapshot): def test_startswith(scalar_types_df: bpd.DataFrame, snapshot): - col_name = "string_col" bf_df = scalar_types_df[[col_name]] ops_map = { diff --git a/tests/unit/core/compile/sqlglot/test_compile_concat.py b/tests/unit/core/compile/sqlglot/test_compile_concat.py index c176b2e1164..80cf16558b5 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_concat.py +++ b/tests/unit/core/compile/sqlglot/test_compile_concat.py @@ -28,7 +28,6 @@ def test_compile_concat(scalar_types_df: bpd.DataFrame, snapshot): def test_compile_concat_filter_sorted(scalar_types_df: bpd.DataFrame, snapshot): - scalars_array_value = scalar_types_df._block.expr 
input_1 = scalars_array_value.select_columns(["float64_col", "int64_col"]).order_by( [ordering.ascending_over("int64_col")] diff --git a/tests/unit/functions/test_remote_function_utils.py b/tests/unit/functions/test_remote_function_utils.py index 812d65bbad2..e200e7c12a1 100644 --- a/tests/unit/functions/test_remote_function_utils.py +++ b/tests/unit/functions/test_remote_function_utils.py @@ -441,7 +441,6 @@ def test_has_conflict_output_type_no_annotation(): ), ) def test_get_bigframes_metadata(metadata_options, metadata_string): - assert _utils.get_bigframes_metadata(**metadata_options) == metadata_string @@ -514,7 +513,6 @@ def test_get_bigframes_metadata_array_type_not_serializable(output_type): def test_get_python_output_type_from_bigframes_metadata( metadata_string, python_output_type ): - assert ( _utils.get_python_output_type_from_bigframes_metadata(metadata_string) == python_output_type diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index dd2c83bd3bb..4521b6a861d 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -1999,7 +1999,6 @@ def test_df_idxmax(): ], ) def test_df_align(join, axis): - index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") @@ -3926,7 +3925,6 @@ def test_iloc_list_multiindex(scalars_dfs): def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): - index_list: List[int] = [] bf_result = scalars_df_index.iloc[index_list] diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 516a46d4dd1..494e2499dbc 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -3025,7 +3025,6 @@ def test_value_counts_w_cut(scalars_dfs): def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["string_col"].iloc[1:].iloc[1:].to_pandas() pd_result = 
scalars_pandas_df_index["string_col"].iloc[1:].iloc[1:] diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 64a5786f17d..1301ef329ab 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -84,5 +84,6 @@ def transform(self, X): Returns: bigframes.dataframe.DataFrame: The result is categorized as index: number, value: number, - where index is the position of the dict seeing the category, and value is 0 or 1.""" + where index is the position of the dict seeing the category, and value is 0 or 1. + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From fa97675f13fece7bc5f8f517fd1b72f1286e8b73 Mon Sep 17 00:00:00 2001 From: bhandarivijay-png Date: Wed, 25 Feb 2026 20:24:04 +0000 Subject: [PATCH 17/29] chore: Migrate gsutil usage to gcloud storage (#2412) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Automated: Migrate {target_path} from gsutil to gcloud storage This CL is part of the on going effort to migrate from the legacy `gsutil` tool to the new and improved `gcloud storage` command-line interface. `gcloud storage` is the recommended and modern tool for interacting with Google Cloud Storage, offering better performance, unified authentication, and a more consistent command structure with other `gcloud` components. 🚀 ### Automation Details This change was **generated automatically** by an agent that targets users of `gsutil`. The transformations applied are based on the [gsutil to gcloud storage migration guide](http://go/gsutil-gcloud-storage-migration-guide). ### ⚠️ Action Required: Please Review and Test Carefully While we have based the automation on the migration guide, every use case is unique. 
**It is crucial that you thoroughly test these changes in environments appropriate to your use-case before merging.** Be aware of potential differences between `gsutil` and `gcloud storage` that could impact your workflows. For instance, the structure of command output may have changed, requiring updates to any scripts that parse it. Similarly, command behavior can differ subtly; the `gcloud storage rsync` command has a different file deletion logic than `gsutil rsync`, which could lead to unintended file deletions. Our migration guides can help guide you through a list of mappings and some notable differences between the two tools. Standard presubmit tests are run as part of this CL's workflow. **If you need to target an additional test workflow or require assistance with testing, please let us know.** Please verify that all your Cloud Storage operations continue to work as expected to avoid any potential disruptions in production. ### Support and Collaboration The `GCS CLI` team is here to help! If you encounter any issues, have a complex use case that this automated change doesn't cover, or face any other blockers, please don't hesitate to reach out. We are happy to work with you to test and adjust these changes as needed. **Contact:** `gcs-cli-hyd@google.com` We appreciate your partnership in this important migration effort! 
#gsutil-migration Co-authored-by: Shenyang Cai Co-authored-by: Shuowei Li --- .kokoro/release-nightly.sh | 6 +++--- .kokoro/trampoline_v2.sh | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 124e4b8b486..df583f010d8 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -93,9 +93,9 @@ for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \ gs://dl-platform-colab/bigframes/ \ gs://bigframes-wheels/; do - gsutil cp -v dist/* ${gcs_path} - gsutil cp -v LICENSE ${gcs_path} - gsutil -m cp -r -v "notebooks/" ${gcs_path}notebooks/ + gcloud storage cp --print-created-message dist/* ${gcs_path} + gcloud storage cp --print-created-message LICENSE ${gcs_path} + gcloud storage cp --recursive --print-created-message "notebooks/" ${gcs_path}notebooks/ done diff --git a/.kokoro/trampoline_v2.sh b/.kokoro/trampoline_v2.sh index 35fa529231d..d03f92dfc48 100755 --- a/.kokoro/trampoline_v2.sh +++ b/.kokoro/trampoline_v2.sh @@ -26,8 +26,8 @@ # To run this script, first download few files from gcs to /dev/shm. # (/dev/shm is passed into the container as KOKORO_GFILE_DIR). # -# gsutil cp gs://cloud-devrel-kokoro-resources/python-docs-samples/secrets_viewer_service_account.json /dev/shm -# gsutil cp gs://cloud-devrel-kokoro-resources/python-docs-samples/automl_secrets.txt /dev/shm +# gcloud storage cp gs://cloud-devrel-kokoro-resources/python-docs-samples/secrets_viewer_service_account.json /dev/shm +# gcloud storage cp gs://cloud-devrel-kokoro-resources/python-docs-samples/automl_secrets.txt /dev/shm # # Then run the script. # .kokoro/trampoline_v2.sh From bf68d0cbb43d88defd686f71afdb522d24cfa17c Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 26 Feb 2026 14:25:43 -0800 Subject: [PATCH 18/29] tests: remove gemini 2.0 exp model tests, move llm tests to large suite (#2484) removed 2 notebooks that are outdated. 
--- ...bq_dataframes_llm_claude3_museum_art.ipynb | 1019 ----------------- .../bq_dataframes_llm_code_generation.ipynb | 5 +- .../bq_dataframes_llm_gemini_2.ipynb | 377 ------ noxfile.py | 2 - tests/system/large/ml/test_llm.py | 592 +++++++++- tests/system/large/ml/test_multimodal_llm.py | 63 +- tests/system/small/ml/test_llm.py | 611 ---------- tests/system/small/ml/test_multimodal_llm.py | 84 -- tests/system/small/test_iceberg.py | 49 - 9 files changed, 648 insertions(+), 2154 deletions(-) delete mode 100644 notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb delete mode 100644 notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb delete mode 100644 tests/system/small/ml/test_llm.py delete mode 100644 tests/system/small/ml/test_multimodal_llm.py delete mode 100644 tests/system/small/test_iceberg.py diff --git a/notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb b/notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb deleted file mode 100644 index a1bb1e9d89d..00000000000 --- a/notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb +++ /dev/null @@ -1,1019 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "9A9NkTRTfo2I" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8fK_rdvvx1iZ" - }, - "source": [ - "## Overview\n", - "\n", - "## Objective\n", - "\n", - "This notebook shows how to conecct BigQuery dataset to Claude models on Vertex AI using BigQuery DataFrames.\n", - "\n", - "### Claude on Vertex AI\n", - "\n", - "Anthropic Claude models on Vertex AI offer fully managed and serverless models. To use a Claude model on Vertex AI, send a request directly to the Vertex AI API endpoint.\n", - "\n", - "For more information, see the [Use Claude](https://cloud.devsite.corp.google.com/vertex-ai/generative-ai/docs/third-party-models/use-claude) documentation.\n", - "\n", - "### BigQuery DataFrames\n", - "BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine. BigQuery DataFrames is an open-source package.\n", - "\n", - "For more information, see this documentation\n", - "https://cloud.google.com/bigquery/docs/reference/bigquery-dataframes\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nwYvaaW25jYS" - }, - "source": [ - "### Getting Started\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hVi8v2mxBkeG" - }, - "source": [ - "#### Authenticate your notebook environment (Colab only)\n", - "If you are running this notebook on Google Colab, uncomment and run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "OHfMDNI76_Pz" - }, - "outputs": [], - "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gI3KlxQQ_F_T" - }, - "source": [ - "## Using Anthropic's Vertex SDK + BQ for *Python*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E0x3GO6M_O3_" - }, - "source": [ - "### Getting Started\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_CJrqUvqAfR7" - }, - "source": [ - "#### Install the latest bigframes package if bigframes version < 1.15.0\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "executionInfo": { - "elapsed": 11539, - "status": "ok", - "timestamp": 1724257409246, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "fi_HLdat_Pce", - "outputId": "020149f0-9fe8-45de-f160-abe488c0bed2" - }, - "outputs": [], - "source": [ - "# !pip install bigframes --upgrade" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hUiAYUFbBCpR" - }, - "source": [ - "#### Restart current runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "jcqgcj_DBFgt" - }, - "outputs": [], - "source": [ - "# # Restart kernel after installs so that your environment can access the new packages\n", - "# import sys\n", - "\n", - "# if \"google.colab\" in sys.modules:\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "shZgRl6qbZYP" - }, - "source": [ - "#### Define Google Cloud project and region information" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "JZLqMJ6va9fc" - }, - "outputs": [], - "source": [ - "# Input your project id\n", - "PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "czcmJpKPBMVC" - }, - "source": [ - "#### Select Claude Model and Region Availability:\n", - "https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude#anthropic_claude_quotas_and_supported_context_length" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "wIBkGcFkK0Ci" - }, - "outputs": [], - "source": [ - "REGION = \"us-east5\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F3UmCLerH0t0" - }, - "source": [ - "### Load raw sample data to a bigquery dataset\n", - "\n", - "Create a BigQuery Dataset and table. You can use the sample museum data in CSV from [here](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks/generative_ai/museum_art.csv).\n", - "\n", - "The dataset should be in the **same region** as your chosen claude model. Let's say you selected us-east5 for claude 'haiku', then load the sample data to a dataset in us-east5." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gijJ2vr5B5nV" - }, - "source": [ - "### Text generation for BQ Tables using Python BigFrames\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "collapsed": true, - "executionInfo": { - "elapsed": 756, - "status": "ok", - "timestamp": 1724260427446, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "cU3Gq7TqHFdi", - "outputId": "aa5ec159-a91b-4349-e56a-400e90935edc" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "object_number string[pyarrow]\n", - "is_highlight boolean\n", - "is_public_domain boolean\n", - "object_id Int64\n", - "department string[pyarrow]\n", - "object_name string[pyarrow]\n", - "title string[pyarrow]\n", - "culture string[pyarrow]\n", - "period string[pyarrow]\n", - "dynasty string[pyarrow]\n", - "reign string[pyarrow]\n", - "portfolio string[pyarrow]\n", - "artist_role string[pyarrow]\n", - "artist_prefix string[pyarrow]\n", - "artist_display_name string[pyarrow]\n", - "artist_display_bio string[pyarrow]\n", - "artist_suffix string[pyarrow]\n", - "artist_alpha_sort string[pyarrow]\n", - "artist_nationality string[pyarrow]\n", - "artist_begin_date string[pyarrow]\n", - "artist_end_date string[pyarrow]\n", - "object_date string[pyarrow]\n", - "object_begin_date Int64\n", - "object_end_date Int64\n", - "medium string[pyarrow]\n", - "dimensions string[pyarrow]\n", - "credit_line string[pyarrow]\n", - "geography_type string[pyarrow]\n", - "city string[pyarrow]\n", - "state string[pyarrow]\n", - "county string[pyarrow]\n", - "country string[pyarrow]\n", - "region string[pyarrow]\n", - "subregion string[pyarrow]\n", - "locale string[pyarrow]\n", - "locus string[pyarrow]\n", - "excavation string[pyarrow]\n", - "river string[pyarrow]\n", - "classification string[pyarrow]\n", - "rights_and_reproduction 
string[pyarrow]\n", - "link_resource string[pyarrow]\n", - "metadata_date timestamp[us, tz=UTC][pyarrow]\n", - "repository string[pyarrow]\n", - "dtype: object" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import bigframes\n", - "import bigframes.pandas as bpd\n", - "bigframes.options._bigquery_options.project = PROJECT_ID # replace to user project\n", - "bigframes.options._bigquery_options.location = REGION #choice a region which the claude model you choice allows\n", - "df = bpd.read_gbq(\"bigframes-dev.garrettwu_us_east5.museum_art\") # replace with your table\n", - "df.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 461 - }, - "executionInfo": { - "elapsed": 4568, - "status": "ok", - "timestamp": 1724271168583, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "exWNXEzLHHaU", - "outputId": "1b33b64c-c8bd-42e6-ecc3-0ea0b5e492be" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 1998408a-4e29-4381-9229-cf8585a47dbe is DONE. 7.7 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 817a5321-9852-45da-8b14-004affc20c38 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 36aa1b30-acb5-4188-8377-b9f544443db8 is DONE. 955 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
object_idtitle
0285844Addie Card, 12 years. Spinner in North Pownal ...
1437141Portrait of a Man
2670650[Snow Crystal]
3268450Newhaven Fisherman
4646996전(傳) 오원 장승업 (1843–1897) 청동기와 화초가 있는 정물화 조선|傳 吾...
5287958Bridge of Augustus at Nani
6435869Antoine Dominique Sauveur Aubert (born 1817), ...
755834<NA>
845087<NA>
956883<NA>
\n", - "

10 rows × 2 columns

\n", - "
[10 rows x 2 columns in total]" - ], - "text/plain": [ - " object_id title\n", - "0 285844 Addie Card, 12 years. Spinner in North Pownal ...\n", - "1 437141 Portrait of a Man\n", - "2 670650 [Snow Crystal]\n", - "3 268450 Newhaven Fisherman\n", - "4 646996 전(傳) 오원 장승업 (1843–1897) 청동기와 화초가 있는 정물화 조선|傳 吾...\n", - "5 287958 Bridge of Augustus at Nani\n", - "6 435869 Antoine Dominique Sauveur Aubert (born 1817), ...\n", - "7 55834 \n", - "8 45087 \n", - "9 56883 \n", - "\n", - "[10 rows x 2 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# @title query: select top 10 records from table and put into dataframe\n", - "\n", - "df = df[[\"object_id\", \"title\"]].head(10)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_UZNsP_WDlyr" - }, - "source": [ - "### Enable Claude model on Vertex AI and Create a BQ External Model Connection\n", - "\n", - "\n", - "* Step 1: Visit the Vertex AI Model Garden console and select the model tile for Claude model of your choice. Following this doc [link](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude). Click on the **“Enable”** button and follow the instructions.\n", - "\n", - "* Step 2: Create a BQ External Connection\n", - "Follow the same process like this one: [link](https://cloud.google.com/bigquery/docs/generate-text#create_a_connection). 
Pay attention to the **supported region** of Claude models and make your conenction follow the same region for example us-east5 for Claude 3.5.\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8Q4aff5m9QOn" - }, - "source": [ - "### Use BigQuery DataFrames ML package with Claude LLM \n", - "\n", - "In this example, we are using the Claude3TextGenerator class from BigQuery DataFrames to translate title of art piece to english.\n", - "\n", - "Documentation for the Claude3TextGenerator Class: https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.Claude3TextGenerator" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 513 - }, - "executionInfo": { - "elapsed": 25662, - "status": "ok", - "timestamp": 1724271197922, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "1pdyI5KBTyTD", - "outputId": "8f1e976b-1fd0-49ba-e068-f480eafb1765" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 514f5afe-15e0-4474-9e09-fbf94f0fe8ca is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5d4df544-e8a4-42f3-8a94-5f7e79b23562 is DONE. 635 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 25288d94-b10c-4b39-a272-3969ccb19af3 is DONE. 14 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d5693878-1037-4798-8aa0-f568ec0be9e3 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 54080328-ba8b-4715-bf2b-3e5b7affa90b is DONE. 4.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ml_generate_text_llm_resultml_generate_text_statusprompt
0This text is already in English. It appears to...translate this into English: Addie Card, 12 ye...
1The phrase \"Portrait of a Man\" is already in E...translate this into English: Portrait of a Man
2The phrase \"[Snow Crystal]\" is already in Engl...translate this into English: [Snow Crystal]
3The phrase \"Newhaven Fisherman\" is already in ...translate this into English: Newhaven Fisherman
4Here's the English translation:\n", - "\n", - "\"Attributed t...translate this into English: 전(傳) 오원 장승업 (1843...
5I apologize, but I'm not sure which language \"...translate this into English: Bridge of Augustu...
6This title is already in English. It describes...translate this into English: Antoine Dominique...
7<NA><NA><NA>
8<NA><NA><NA>
9<NA><NA><NA>
\n", - "

10 rows × 3 columns

\n", - "
[10 rows x 3 columns in total]" - ], - "text/plain": [ - " ml_generate_text_llm_result ml_generate_text_status \\\n", - "0 This text is already in English. It appears to... \n", - "1 The phrase \"Portrait of a Man\" is already in E... \n", - "2 The phrase \"[Snow Crystal]\" is already in Engl... \n", - "3 The phrase \"Newhaven Fisherman\" is already in ... \n", - "4 Here's the English translation:\n", - "\n", - "\"Attributed t... \n", - "5 I apologize, but I'm not sure which language \"... \n", - "6 This title is already in English. It describes... \n", - "7 \n", - "8 \n", - "9 \n", - "\n", - " prompt \n", - "0 translate this into English: Addie Card, 12 ye... \n", - "1 translate this into English: Portrait of a Man \n", - "2 translate this into English: [Snow Crystal] \n", - "3 translate this into English: Newhaven Fisherman \n", - "4 translate this into English: 전(傳) 오원 장승업 (1843... \n", - "5 translate this into English: Bridge of Augustu... \n", - "6 translate this into English: Antoine Dominique... 
\n", - "7 \n", - "8 \n", - "9 \n", - "\n", - "[10 rows x 3 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.ml import llm\n", - "model = llm.Claude3TextGenerator(model_name=\"claude-3-5-sonnet\",\n", - " connection_name=\"bigframes-dev.us-east5.bigframes-rf-conn\" ) # replace with your connection\n", - "df[\"input_prompt\"] = \"translate this into English: \" + df[\"title\"]\n", - "result = model.predict(df[\"input_prompt\"])\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 461 - }, - "executionInfo": { - "elapsed": 5249, - "status": "ok", - "timestamp": 1724274172557, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "Ux1VI5qujHOB", - "outputId": "7b859943-5e7c-4cc0-d9c2-bb3d44682010" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 6b6eceaa-e713-493e-beac-481a3d777a5c is DONE. 4.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5c660da9-318c-424e-9412-43f09e44a8b3 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 82b61007-8370-4514-addb-258d7c48d66c is DONE. 4.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
object_idtitleml_generate_text_llm_resultprompt
0285844Addie Card, 12 years. Spinner in North Pownal ...This text is already in English. It appears to...translate this into English: Addie Card, 12 ye...
1437141Portrait of a ManThe phrase \"Portrait of a Man\" is already in E...translate this into English: Portrait of a Man
2670650[Snow Crystal]The phrase \"[Snow Crystal]\" is already in Engl...translate this into English: [Snow Crystal]
3268450Newhaven FishermanThe phrase \"Newhaven Fisherman\" is already in ...translate this into English: Newhaven Fisherman
4646996전(傳) 오원 장승업 (1843–1897) 청동기와 화초가 있는 정물화 조선|傳 吾...Here's the English translation:\n", - "\n", - "\"Attributed t...translate this into English: 전(傳) 오원 장승업 (1843...
5287958Bridge of Augustus at NaniI apologize, but I'm not sure which language \"...translate this into English: Bridge of Augustu...
6435869Antoine Dominique Sauveur Aubert (born 1817), ...This title is already in English. It describes...translate this into English: Antoine Dominique...
755834<NA><NA><NA>
845087<NA><NA><NA>
956883<NA><NA><NA>
\n", - "

10 rows × 4 columns

\n", - "
[10 rows x 4 columns in total]" - ], - "text/plain": [ - " object_id title \\\n", - "0 285844 Addie Card, 12 years. Spinner in North Pownal ... \n", - "1 437141 Portrait of a Man \n", - "2 670650 [Snow Crystal] \n", - "3 268450 Newhaven Fisherman \n", - "4 646996 전(傳) 오원 장승업 (1843–1897) 청동기와 화초가 있는 정물화 조선|傳 吾... \n", - "5 287958 Bridge of Augustus at Nani \n", - "6 435869 Antoine Dominique Sauveur Aubert (born 1817), ... \n", - "7 55834 \n", - "8 45087 \n", - "9 56883 \n", - "\n", - " ml_generate_text_llm_result \\\n", - "0 This text is already in English. It appears to... \n", - "1 The phrase \"Portrait of a Man\" is already in E... \n", - "2 The phrase \"[Snow Crystal]\" is already in Engl... \n", - "3 The phrase \"Newhaven Fisherman\" is already in ... \n", - "4 Here's the English translation:\n", - "\n", - "\"Attributed t... \n", - "5 I apologize, but I'm not sure which language \"... \n", - "6 This title is already in English. It describes... \n", - "7 \n", - "8 \n", - "9 \n", - "\n", - " prompt \n", - "0 translate this into English: Addie Card, 12 ye... \n", - "1 translate this into English: Portrait of a Man \n", - "2 translate this into English: [Snow Crystal] \n", - "3 translate this into English: Newhaven Fisherman \n", - "4 translate this into English: 전(傳) 오원 장승업 (1843... \n", - "5 translate this into English: Bridge of Augustu... \n", - "6 translate this into English: Antoine Dominique... \n", - "7 \n", - "8 \n", - "9 \n", - "\n", - "[10 rows x 4 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df=df.drop(columns=[\"input_prompt\"]).join(result.drop(columns=\"ml_generate_text_status\"))\n", - "output_df" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ej70vFMvelsg" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 8c3f1d21-9033-4224-b6f3-4f2414f4ed18 is DONE. 4.5 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'bigframes-dev.garrettwu_us_east5.museum_art_translate'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# prompt: load the dataframe output to another Bigquery table\n", - "\n", - "# @title Save results to BigQuery\n", - "\n", - "output_df.to_gbq(\"bigframes-dev.garrettwu_us_east5.museum_art_translate\", if_exists=\"replace\") # replace with your table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 4f1329129e2..b05a6f034f6 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -1283,7 +1283,8 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "venv (3.10.14)", + "language": "python", "name": "python3" }, "language_info": { @@ -1296,7 +1297,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb b/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb deleted file mode 100644 index 1a9b5688975..00000000000 --- 
a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb +++ /dev/null @@ -1,377 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BigFrames Gemini 2.0 Text Generation Simple Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: This feature is only available in bigframes >= 1.29.0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import packages" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd\n", - "from bigframes.ml import llm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Gemini 2.0 experimental Model with model_name as \"gemini-2.0-flash-exp\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/ml/llm.py:803: PreviewWarning: Model gemini-2.0-flash-exp is subject to the \"Pre-GA Offerings Terms\" in the General Service Terms section of the\n", - " Service Specific 
Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available \"as is\"\n", - " and might have limited support. For more information, see the launch stage descriptions\n", - " (https://cloud.google.com/products#product-launch-stages).\n", - " warnings.warn(\n", - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/pandas/__init__.py:435: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " return global_session.get_global_session()\n" - ] - }, - { - "data": { - "text/html": [ - "Query job f673a2ea-023e-4771-84a2-fb81f808fa1b is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "model = llm.GeminiTextGenerator(model_name=\"gemini-2.0-flash-exp\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a simple DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 2276ea5b-2e08-4ed6-af34-49a7d165d145 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
prompt
0Tell me something about Gemini 2.0.
\n", - "

1 rows × 1 columns

\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " prompt\n", - "0 Tell me something about Gemini 2.0.\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({\"prompt\": [\"Tell me something about Gemini 2.0.\"]})\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Make predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 9ba21e96-6023-491e-8e83-f2e6fa7df0e7 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/__init__.py:109: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 933d45cc-4bc0-4bdf-b4b8-573da2d58be3 is DONE. 2 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 3dda9bc6-84b1-4f4a-8891-85d25d8848ce is DONE. 4.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ml_generate_text_llm_resultml_generate_text_rai_resultml_generate_text_statusprompt
0Alright, let's talk about Gemini 2.0! It's a b...<NA>Tell me something about Gemini 2.0.
\n", - "

1 rows × 4 columns

\n", - "
[1 rows x 4 columns in total]" - ], - "text/plain": [ - " ml_generate_text_llm_result \\\n", - "0 Alright, let's talk about Gemini 2.0! It's a b... \n", - "\n", - " ml_generate_text_rai_result ml_generate_text_status \\\n", - "0 \n", - "\n", - " prompt \n", - "0 Tell me something about Gemini 2.0. \n", - "\n", - "[1 rows x 4 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result = model.predict(df)\n", - "result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save the model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Copy job 8e68af62-e7ab-475b-99c9-b79e8ba3c40b is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/ml/llm.py:803: PreviewWarning: Model gemini-2.0-flash-exp is subject to the \"Pre-GA Offerings Terms\" in the General Service Terms section of the\n", - " Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available \"as is\"\n", - " and might have limited support. For more information, see the launch stage descriptions\n", - " (https://cloud.google.com/products#product-launch-stages).\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job cae7f929-d8cb-4819-a644-ac832cdc0912 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "GeminiTextGenerator(connection_name='bigframes-dev.us.bigframes-rf-connection',\n", - " model_name='gemini-2.0-flash-exp',\n", - " session=)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.to_gbq(\"bigframes-dev.garrettwu.gemini_2_flash\", replace=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/noxfile.py b/noxfile.py index 888f9fd765a..671aae13d22 100644 --- a/noxfile.py +++ b/noxfile.py @@ -707,11 +707,9 @@ def notebook(session: nox.Session): # bq_dataframes_llm_code_generation creates a bucket in the sample. "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", # Needs BUCKET_URI. "notebooks/generative_ai/sentiment_analysis.ipynb", # Too slow - "notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb", # Gemini 2.0 backend hasn't ready in prod. "notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb", # Limited quota for vector index ddl statements on table. "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", # Needs CONNECTION. # TODO(b/366290533): to protect BQML quota - "notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb", "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", # Needs BUCKET_URI. "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb", # Needs BUCKET_URI. "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb", # Needs BUCKET_URI. 
diff --git a/tests/system/large/ml/test_llm.py b/tests/system/large/ml/test_llm.py index 1daaebb8cb8..6e2695b1b53 100644 --- a/tests/system/large/ml/test_llm.py +++ b/tests/system/large/ml/test_llm.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Callable +from unittest import mock + import pandas as pd import pyarrow as pa import pytest -from bigframes.ml import llm +from bigframes.ml import core, llm import bigframes.pandas as bpd from bigframes.testing import utils @@ -24,7 +27,6 @@ @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -32,9 +34,7 @@ "gemini-2.5-flash-lite", ), ) -@pytest.mark.flaky( - retries=2 -) # usually create model shouldn't be flaky, but this one due to the limited quota of gemini-2.0-flash-exp. +@pytest.mark.flaky(retries=2) def test_create_load_gemini_text_generator_model( dataset_id, model_name, session, bq_connection ): @@ -56,7 +56,6 @@ def test_create_load_gemini_text_generator_model( @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -80,7 +79,6 @@ def test_gemini_text_generator_predict_default_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -106,7 +104,6 @@ def test_gemini_text_generator_predict_with_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -134,7 +131,6 @@ def test_gemini_text_generator_multi_cols_predict_success( @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -231,3 +227,581 @@ def 
test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name) "evaluation_status", ], ) + + +@pytest.mark.parametrize( + "model_name", + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), +) +def test_create_load_text_embedding_generator_model( + dataset_id, model_name, session, bq_connection +): + text_embedding_model = llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + assert text_embedding_model is not None + assert text_embedding_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = text_embedding_model.to_gbq( + f"{dataset_id}.temp_text_model", replace=True + ) + assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.connection_name == bq_connection + assert reloaded_model.model_name == model_name + + +@pytest.mark.parametrize( + "model_name", + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), +) +@pytest.mark.flaky(retries=2) +def test_text_embedding_generator_predict_default_params_success( + llm_text_df, model_name, session, bq_connection +): + text_embedding_model = llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + df = text_embedding_model.predict(llm_text_df).to_pandas() + utils.check_pandas_df_schema_and_index( + df, columns=utils.ML_GENERATE_EMBEDDING_OUTPUT, index=3, col_exact=False + ) + assert len(df["ml_generate_embedding_result"][0]) == 768 + + +@pytest.mark.parametrize( + "model_name", + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), +) +@pytest.mark.flaky(retries=2) +def test_text_embedding_generator_multi_cols_predict_success( + llm_text_df: bpd.DataFrame, model_name, session, bq_connection +): + df = llm_text_df.assign(additional_col=1) + df = df.rename(columns={"prompt": "content"}) + text_embedding_model = 
llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + pd_df = text_embedding_model.predict(df).to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=utils.ML_GENERATE_EMBEDDING_OUTPUT + ["additional_col"], + index=3, + col_exact=False, + ) + assert len(pd_df["ml_generate_embedding_result"][0]) == 768 + + +def test_create_load_multimodal_embedding_generator_model( + dataset_id, session, bq_connection +): + mm_embedding_model = llm.MultimodalEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + assert mm_embedding_model is not None + assert mm_embedding_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = mm_embedding_model.to_gbq( + f"{dataset_id}.temp_mm_model", replace=True + ) + assert f"{dataset_id}.temp_mm_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.connection_name == bq_connection + + +# Overrides __eq__ function for comparing as mock.call parameter +class EqCmpAllDataFrame(bpd.DataFrame): + def __eq__(self, other): + return self.equals(other) + + +@pytest.mark.skip("b/436340035 test failed") +@pytest.mark.parametrize( + ( + "model_class", + "options", + ), + [ + ( + llm.GeminiTextGenerator, + { + "temperature": 0.9, + "max_output_tokens": 8192, + "top_p": 1.0, + "ground_with_google_search": False, + }, + ), + ( + llm.Claude3TextGenerator, + { + "max_output_tokens": 128, + "top_k": 40, + "top_p": 0.95, + }, + ), + ], +) +def test_text_generator_retry_success( + session, + model_class, + options, + bq_connection, +): + # Requests. 
+ df0 = EqCmpAllDataFrame( + { + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ] + }, + index=[0, 1, 2], + session=session, + ) + df1 = EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error", "error"], + "prompt": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ) + df2 = EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error"], + "prompt": [ + "What is BQML?", + ], + }, + index=[1], + session=session, + ) + + mock_generate_text = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) + mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) + type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_text_tvf = core.BqmlModel.TvfDef( + mock_generate_text, "ml_generate_text_status" + ) + # Responses. Retry twice then all succeeded. + mock_generate_text.side_effect = [ + EqCmpAllDataFrame( + { + "ml_generate_text_status": ["", "error", "error"], + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error", ""], + "prompt": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_text_status": [""], + "prompt": [ + "What is BQML?", + ], + }, + index=[1], + session=session, + ), + ] + + text_generator_model = model_class(connection_name=bq_connection, session=session) + text_generator_model._bqml_model = mock_bqml_model + + with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): + # 3rd retry isn't triggered + result = text_generator_model.predict(df0, max_retries=3) + + mock_generate_text.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + mock.call(mock_bqml_model, df2, options), + ] + ) + 
pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_text_status": ["", "", ""], + "prompt": [ + "What is BigQuery?", + "What is BigQuery DataFrame?", + "What is BQML?", + ], + }, + index=[0, 2, 1], + ), + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.skip("b/436340035 test failed") +@pytest.mark.parametrize( + ( + "model_class", + "options", + ), + [ + ( + llm.GeminiTextGenerator, + { + "temperature": 0.9, + "max_output_tokens": 8192, + "top_p": 1.0, + "ground_with_google_search": False, + }, + ), + ( + llm.Claude3TextGenerator, + { + "max_output_tokens": 128, + "top_k": 40, + "top_p": 0.95, + }, + ), + ], +) +def test_text_generator_retry_no_progress(session, model_class, options, bq_connection): + # Requests. + df0 = EqCmpAllDataFrame( + { + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ] + }, + index=[0, 1, 2], + session=session, + ) + df1 = EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error", "error"], + "prompt": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ) + + mock_generate_text = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) + mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) + type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_text_tvf = core.BqmlModel.TvfDef( + mock_generate_text, "ml_generate_text_status" + ) + # Responses. Retry once, no progress, just stop. 
+ mock_generate_text.side_effect = [ + EqCmpAllDataFrame( + { + "ml_generate_text_status": ["", "error", "error"], + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error", "error"], + "prompt": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ), + ] + + text_generator_model = model_class(connection_name=bq_connection, session=session) + text_generator_model._bqml_model = mock_bqml_model + + with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): + # No progress, only conduct retry once + result = text_generator_model.predict(df0, max_retries=3) + + mock_generate_text.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_text_status": ["", "error", "error"], + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + ), + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.skip("b/436340035 test failed") +def test_text_embedding_generator_retry_success(session, bq_connection): + # Requests. 
+ df0 = EqCmpAllDataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ] + }, + index=[0, 1, 2], + session=session, + ) + df1 = EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error", "error"], + "content": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ) + df2 = EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error"], + "content": [ + "What is BQML?", + ], + }, + index=[1], + session=session, + ) + + mock_generate_embedding = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) + mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) + type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_embedding_tvf = core.BqmlModel.TvfDef( + mock_generate_embedding, "ml_generate_embedding_status" + ) + + # Responses. Retry twice then all succeeded. + mock_generate_embedding.side_effect = [ + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["", "error", "error"], + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error", ""], + "content": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": [""], + "content": [ + "What is BQML?", + ], + }, + index=[1], + session=session, + ), + ] + options: dict = {} + + text_embedding_model = llm.TextEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + text_embedding_model._bqml_model = mock_bqml_model + + with mock.patch.object( + core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf + ): + # 3rd retry isn't triggered + result = text_embedding_model.predict(df0, max_retries=3) + + mock_generate_embedding.assert_has_calls( + [ + 
mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + mock.call(mock_bqml_model, df2, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_embedding_status": ["", "", ""], + "content": [ + "What is BigQuery?", + "What is BigQuery DataFrame?", + "What is BQML?", + ], + }, + index=[0, 2, 1], + ), + check_dtype=False, + check_index_type=False, + ) + + +def test_text_embedding_generator_retry_no_progress(session, bq_connection): + # Requests. + df0 = EqCmpAllDataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ] + }, + index=[0, 1, 2], + session=session, + ) + df1 = EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error", "error"], + "content": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ) + + mock_generate_embedding = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) + mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) + type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_embedding_tvf = core.BqmlModel.TvfDef( + mock_generate_embedding, "ml_generate_embedding_status" + ) + + # Responses. Retry once, no progress, just stop. 
+ mock_generate_embedding.side_effect = [ + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["", "error", "error"], + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error", "error"], + "content": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ), + ] + options: dict = {} + + text_embedding_model = llm.TextEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + text_embedding_model._bqml_model = mock_bqml_model + + with mock.patch.object( + core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf + ): + # No progress, only conduct retry once + result = text_embedding_model.predict(df0, max_retries=3) + + mock_generate_embedding.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_embedding_status": ["", "error", "error"], + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + ), + check_dtype=False, + check_index_type=False, + ) + + +# b/436340035 temp disable the test to unblock presumbit +@pytest.mark.parametrize( + "model_class", + [ + llm.TextEmbeddingGenerator, + llm.MultimodalEmbeddingGenerator, + llm.GeminiTextGenerator, + # llm.Claude3TextGenerator, + ], +) +def test_text_embedding_generator_no_default_model_warning(model_class): + message = "Since upgrading the default model can cause unintended breakages, the\ndefault model will be removed in BigFrames 3.0. Please supply an\nexplicit model to avoid this message." 
+ with pytest.warns(FutureWarning, match=message): + model_class(model_name=None) diff --git a/tests/system/large/ml/test_multimodal_llm.py b/tests/system/large/ml/test_multimodal_llm.py index 03fdddf6654..f94f0f1dee6 100644 --- a/tests/system/large/ml/test_multimodal_llm.py +++ b/tests/system/large/ml/test_multimodal_llm.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd +import pyarrow as pa import pytest from bigframes.ml import llm @@ -22,7 +24,6 @@ @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", ), @@ -43,3 +44,63 @@ def test_gemini_text_generator_multimodal_input( index=2, col_exact=False, ) + + +@pytest.mark.flaky(retries=2) +def test_multimodal_embedding_generator_predict_default_params_success( + images_mm_df, session, bq_connection +): + text_embedding_model = llm.MultimodalEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + df = text_embedding_model.predict(images_mm_df).to_pandas() + utils.check_pandas_df_schema_and_index( + df, + columns=utils.ML_MULTIMODAL_GENERATE_EMBEDDING_OUTPUT, + index=2, + col_exact=False, + ) + assert len(df["ml_generate_embedding_result"][0]) == 1408 + + +@pytest.mark.parametrize( + "model_name", + ("gemini-2.0-flash-001",), +) +@pytest.mark.flaky(retries=2) +def test_gemini_text_generator_multimodal_structured_output( + images_mm_df: bpd.DataFrame, model_name, session, bq_connection +): + gemini_text_generator_model = llm.GeminiTextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + output_schema = { + "bool_output": "bool", + "int_output": "int64", + "float_output": "float64", + "str_output": "string", + "array_output": "array", + "struct_output": "struct", + } + df = gemini_text_generator_model.predict( + images_mm_df, + prompt=["Describe", images_mm_df["blob_col"]], + output_schema=output_schema, + ) 
+ assert df["bool_output"].dtype == pd.BooleanDtype() + assert df["int_output"].dtype == pd.Int64Dtype() + assert df["float_output"].dtype == pd.Float64Dtype() + assert df["str_output"].dtype == pd.StringDtype(storage="pyarrow") + assert df["array_output"].dtype == pd.ArrowDtype(pa.list_(pa.int64())) + assert df["struct_output"].dtype == pd.ArrowDtype( + pa.struct([("number", pa.int64())]) + ) + + pd_df = df.to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=list(output_schema.keys()) + + ["blob_col", "prompt", "full_response", "status"], + index=2, + col_exact=False, + ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py deleted file mode 100644 index d15c5d31605..00000000000 --- a/tests/system/small/ml/test_llm.py +++ /dev/null @@ -1,611 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Callable -from unittest import mock - -import pandas as pd -import pytest - -from bigframes import exceptions -from bigframes.ml import core, llm -import bigframes.pandas as bpd -from bigframes.testing import utils - - -@pytest.mark.parametrize( - "model_name", - ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), -) -def test_create_load_text_embedding_generator_model( - dataset_id, model_name, session, bq_connection -): - text_embedding_model = llm.TextEmbeddingGenerator( - model_name=model_name, connection_name=bq_connection, session=session - ) - assert text_embedding_model is not None - assert text_embedding_model._bqml_model is not None - - # save, load to ensure configuration was kept - reloaded_model = text_embedding_model.to_gbq( - f"{dataset_id}.temp_text_model", replace=True - ) - assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name - assert reloaded_model.connection_name == bq_connection - assert reloaded_model.model_name == model_name - - -@pytest.mark.parametrize( - "model_name", - ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), -) -@pytest.mark.flaky(retries=2) -def test_text_embedding_generator_predict_default_params_success( - llm_text_df, model_name, session, bq_connection -): - text_embedding_model = llm.TextEmbeddingGenerator( - model_name=model_name, connection_name=bq_connection, session=session - ) - df = text_embedding_model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_EMBEDDING_OUTPUT, index=3, col_exact=False - ) - assert len(df["ml_generate_embedding_result"][0]) == 768 - - -@pytest.mark.parametrize( - "model_name", - ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), -) -@pytest.mark.flaky(retries=2) -def test_text_embedding_generator_multi_cols_predict_success( - llm_text_df: bpd.DataFrame, model_name, session, bq_connection -): - df = 
llm_text_df.assign(additional_col=1) - df = df.rename(columns={"prompt": "content"}) - text_embedding_model = llm.TextEmbeddingGenerator( - model_name=model_name, connection_name=bq_connection, session=session - ) - pd_df = text_embedding_model.predict(df).to_pandas() - utils.check_pandas_df_schema_and_index( - pd_df, - columns=utils.ML_GENERATE_EMBEDDING_OUTPUT + ["additional_col"], - index=3, - col_exact=False, - ) - assert len(pd_df["ml_generate_embedding_result"][0]) == 768 - - -def test_create_load_multimodal_embedding_generator_model( - dataset_id, session, bq_connection -): - mm_embedding_model = llm.MultimodalEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - assert mm_embedding_model is not None - assert mm_embedding_model._bqml_model is not None - - # save, load to ensure configuration was kept - reloaded_model = mm_embedding_model.to_gbq( - f"{dataset_id}.temp_mm_model", replace=True - ) - assert f"{dataset_id}.temp_mm_model" == reloaded_model._bqml_model.model_name - assert reloaded_model.connection_name == bq_connection - - -# Overrides __eq__ function for comparing as mock.call parameter -class EqCmpAllDataFrame(bpd.DataFrame): - def __eq__(self, other): - return self.equals(other) - - -@pytest.mark.skip("b/436340035 test failed") -@pytest.mark.parametrize( - ( - "model_class", - "options", - ), - [ - ( - llm.GeminiTextGenerator, - { - "temperature": 0.9, - "max_output_tokens": 8192, - "top_p": 1.0, - "ground_with_google_search": False, - }, - ), - ( - llm.Claude3TextGenerator, - { - "max_output_tokens": 128, - "top_k": 40, - "top_p": 0.95, - }, - ), - ], -) -def test_text_generator_retry_success( - session, - model_class, - options, - bq_connection, -): - # Requests. 
- df0 = EqCmpAllDataFrame( - { - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ] - }, - index=[0, 1, 2], - session=session, - ) - df1 = EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error", "error"], - "prompt": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ) - df2 = EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error"], - "prompt": [ - "What is BQML?", - ], - }, - index=[1], - session=session, - ) - - mock_generate_text = mock.create_autospec( - Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] - ) - mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) - type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - generate_text_tvf = core.BqmlModel.TvfDef( - mock_generate_text, "ml_generate_text_status" - ) - # Responses. Retry twice then all succeeded. - mock_generate_text.side_effect = [ - EqCmpAllDataFrame( - { - "ml_generate_text_status": ["", "error", "error"], - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error", ""], - "prompt": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_text_status": [""], - "prompt": [ - "What is BQML?", - ], - }, - index=[1], - session=session, - ), - ] - - text_generator_model = model_class(connection_name=bq_connection, session=session) - text_generator_model._bqml_model = mock_bqml_model - - with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): - # 3rd retry isn't triggered - result = text_generator_model.predict(df0, max_retries=3) - - mock_generate_text.assert_has_calls( - [ - mock.call(mock_bqml_model, df0, options), - mock.call(mock_bqml_model, df1, options), - mock.call(mock_bqml_model, df2, options), - ] - ) - 
pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_text_status": ["", "", ""], - "prompt": [ - "What is BigQuery?", - "What is BigQuery DataFrame?", - "What is BQML?", - ], - }, - index=[0, 2, 1], - ), - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.skip("b/436340035 test failed") -@pytest.mark.parametrize( - ( - "model_class", - "options", - ), - [ - ( - llm.GeminiTextGenerator, - { - "temperature": 0.9, - "max_output_tokens": 8192, - "top_p": 1.0, - "ground_with_google_search": False, - }, - ), - ( - llm.Claude3TextGenerator, - { - "max_output_tokens": 128, - "top_k": 40, - "top_p": 0.95, - }, - ), - ], -) -def test_text_generator_retry_no_progress(session, model_class, options, bq_connection): - # Requests. - df0 = EqCmpAllDataFrame( - { - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ] - }, - index=[0, 1, 2], - session=session, - ) - df1 = EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error", "error"], - "prompt": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ) - - mock_generate_text = mock.create_autospec( - Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] - ) - mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) - type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - generate_text_tvf = core.BqmlModel.TvfDef( - mock_generate_text, "ml_generate_text_status" - ) - # Responses. Retry once, no progress, just stop. 
- mock_generate_text.side_effect = [ - EqCmpAllDataFrame( - { - "ml_generate_text_status": ["", "error", "error"], - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error", "error"], - "prompt": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ), - ] - - text_generator_model = model_class(connection_name=bq_connection, session=session) - text_generator_model._bqml_model = mock_bqml_model - - with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): - # No progress, only conduct retry once - result = text_generator_model.predict(df0, max_retries=3) - - mock_generate_text.assert_has_calls( - [ - mock.call(mock_bqml_model, df0, options), - mock.call(mock_bqml_model, df1, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_text_status": ["", "error", "error"], - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - ), - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.skip("b/436340035 test failed") -def test_text_embedding_generator_retry_success(session, bq_connection): - # Requests. 
- df0 = EqCmpAllDataFrame( - { - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ] - }, - index=[0, 1, 2], - session=session, - ) - df1 = EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error", "error"], - "content": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ) - df2 = EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error"], - "content": [ - "What is BQML?", - ], - }, - index=[1], - session=session, - ) - - mock_generate_embedding = mock.create_autospec( - Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] - ) - mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) - type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - generate_embedding_tvf = core.BqmlModel.TvfDef( - mock_generate_embedding, "ml_generate_embedding_status" - ) - - # Responses. Retry twice then all succeeded. - mock_generate_embedding.side_effect = [ - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["", "error", "error"], - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error", ""], - "content": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": [""], - "content": [ - "What is BQML?", - ], - }, - index=[1], - session=session, - ), - ] - options: dict = {} - - text_embedding_model = llm.TextEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - text_embedding_model._bqml_model = mock_bqml_model - - with mock.patch.object( - core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf - ): - # 3rd retry isn't triggered - result = text_embedding_model.predict(df0, max_retries=3) - - mock_generate_embedding.assert_has_calls( - [ - 
mock.call(mock_bqml_model, df0, options), - mock.call(mock_bqml_model, df1, options), - mock.call(mock_bqml_model, df2, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_embedding_status": ["", "", ""], - "content": [ - "What is BigQuery?", - "What is BigQuery DataFrame?", - "What is BQML?", - ], - }, - index=[0, 2, 1], - ), - check_dtype=False, - check_index_type=False, - ) - - -def test_text_embedding_generator_retry_no_progress(session, bq_connection): - # Requests. - df0 = EqCmpAllDataFrame( - { - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ] - }, - index=[0, 1, 2], - session=session, - ) - df1 = EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error", "error"], - "content": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ) - - mock_generate_embedding = mock.create_autospec( - Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] - ) - mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) - type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - generate_embedding_tvf = core.BqmlModel.TvfDef( - mock_generate_embedding, "ml_generate_embedding_status" - ) - - # Responses. Retry once, no progress, just stop. 
- mock_generate_embedding.side_effect = [ - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["", "error", "error"], - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error", "error"], - "content": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ), - ] - options: dict = {} - - text_embedding_model = llm.TextEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - text_embedding_model._bqml_model = mock_bqml_model - - with mock.patch.object( - core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf - ): - # No progress, only conduct retry once - result = text_embedding_model.predict(df0, max_retries=3) - - mock_generate_embedding.assert_has_calls( - [ - mock.call(mock_bqml_model, df0, options), - mock.call(mock_bqml_model, df1, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_embedding_status": ["", "error", "error"], - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - ), - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "model_name", - ("gemini-2.0-flash-exp",), -) -def test_gemini_preview_model_warnings(model_name): - with pytest.warns(exceptions.PreviewWarning): - llm.GeminiTextGenerator(model_name=model_name) - - -# b/436340035 temp disable the test to unblock presumbit -@pytest.mark.parametrize( - "model_class", - [ - llm.TextEmbeddingGenerator, - llm.MultimodalEmbeddingGenerator, - llm.GeminiTextGenerator, - # llm.Claude3TextGenerator, - ], -) -def test_text_embedding_generator_no_default_model_warning(model_class): - message = "Since upgrading the default model can cause unintended breakages, the\ndefault model will be removed in BigFrames 3.0. 
Please supply an\nexplicit model to avoid this message." - with pytest.warns(FutureWarning, match=message): - model_class(model_name=None) diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py deleted file mode 100644 index e29669afd30..00000000000 --- a/tests/system/small/ml/test_multimodal_llm.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pandas as pd -import pyarrow as pa -import pytest - -from bigframes.ml import llm -import bigframes.pandas as bpd -from bigframes.testing import utils - - -@pytest.mark.flaky(retries=2) -def test_multimodal_embedding_generator_predict_default_params_success( - images_mm_df, session, bq_connection -): - text_embedding_model = llm.MultimodalEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - df = text_embedding_model.predict(images_mm_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, - columns=utils.ML_MULTIMODAL_GENERATE_EMBEDDING_OUTPUT, - index=2, - col_exact=False, - ) - assert len(df["ml_generate_embedding_result"][0]) == 1408 - - -@pytest.mark.parametrize( - "model_name", - ( - "gemini-2.0-flash-exp", - "gemini-2.0-flash-001", - ), -) -@pytest.mark.flaky(retries=2) -def test_gemini_text_generator_multimodal_structured_output( - images_mm_df: bpd.DataFrame, model_name, session, bq_connection -): - gemini_text_generator_model = llm.GeminiTextGenerator( - 
model_name=model_name, connection_name=bq_connection, session=session - ) - output_schema = { - "bool_output": "bool", - "int_output": "int64", - "float_output": "float64", - "str_output": "string", - "array_output": "array", - "struct_output": "struct", - } - df = gemini_text_generator_model.predict( - images_mm_df, - prompt=["Describe", images_mm_df["blob_col"]], - output_schema=output_schema, - ) - assert df["bool_output"].dtype == pd.BooleanDtype() - assert df["int_output"].dtype == pd.Int64Dtype() - assert df["float_output"].dtype == pd.Float64Dtype() - assert df["str_output"].dtype == pd.StringDtype(storage="pyarrow") - assert df["array_output"].dtype == pd.ArrowDtype(pa.list_(pa.int64())) - assert df["struct_output"].dtype == pd.ArrowDtype( - pa.struct([("number", pa.int64())]) - ) - - pd_df = df.to_pandas() - utils.check_pandas_df_schema_and_index( - pd_df, - columns=list(output_schema.keys()) - + ["blob_col", "prompt", "full_response", "status"], - index=2, - col_exact=False, - ) diff --git a/tests/system/small/test_iceberg.py b/tests/system/small/test_iceberg.py deleted file mode 100644 index ea0acc6214e..00000000000 --- a/tests/system/small/test_iceberg.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import pytest - -import bigframes -import bigframes.pandas as bpd - - -@pytest.fixture() -def fresh_global_session(): - bpd.reset_session() - yield None - bpd.close_session() - # Undoes side effect of using ths global session to read table - bpd.options.bigquery.location = None - - -def test_read_iceberg_table_w_location(): - session = bigframes.Session(bigframes.BigQueryOptions(location="us-central1")) - df = session.read_gbq( - "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab_2021" - ) - assert df.shape == (30904427, 20) - - -def test_read_iceberg_table_w_wrong_location(): - session = bigframes.Session(bigframes.BigQueryOptions(location="europe-west1")) - with pytest.raises(ValueError, match="Current session is in europe-west1"): - session.read_gbq( - "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab_2021" - ) - - -def test_read_iceberg_table_wo_location(fresh_global_session): - df = bpd.read_gbq( - "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab_2021" - ) - assert df.shape == (30904427, 20) From 7813eaa6fa2ae42943b90583e600c95beaf5d75e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 26 Feb 2026 15:16:09 -0800 Subject: [PATCH 19/29] feat: add display.render_mode to control DataFrame/Series visualization (#2413) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces the display.render_mode configuration option, providing a clearer and more flexible way to control how BigFrames objects are visualized in notebooks and other interactive environments. Key Changes: * New Configuration Option: Added bpd.options.display.render_mode which supports three modes: * html (Default): Standard HTML table rendering. * plaintext: Forces plain text output by removing the HTML component from the mimebundle. * anywidget: Enables the interactive anywidget-based table component. 
* Migration Path: Maintained backward compatibility for display.repr_mode = "anywidget". The rendering logic now checks both the new render_mode and the legacy repr_mode flags. * Notebook & Test Updates: Updated existing notebooks and system tests to prefer the new render_mode option while verifying that existing behaviors remain intact. * New Unit Tests: Added tests/unit/display/test_render_mode.py to specifically verify the mimebundle selection logic and priority across different configuration states. While repr_mode was previously used to toggle the interactive widget, "representation mode" is a broad term. render_mode specifically addresses the format of the output (HTML vs. Text vs. Widget), allowing for better support for text-only environments and a more intuitive API for users. Also verified at colab notebook: [screen/AHNz4o5Mhb9UHrh](https://screenshot.googleplex.com/AHNz4o5Mhb9UHrh) Fixes #<479282023> 🦕 --- bigframes/display/html.py | 8 +- notebooks/dataframes/anywidget_mode.ipynb | 402 +++++------------- notebooks/ml/timeseries_analysis.ipynb | 6 +- tests/system/small/test_anywidget.py | 63 ++- tests/system/small/test_progress_bar.py | 4 +- tests/unit/display/test_anywidget.py | 8 +- tests/unit/display/test_render_mode.py | 106 +++++ .../pandas/core/config_init.py | 17 + 8 files changed, 282 insertions(+), 332 deletions(-) create mode 100644 tests/unit/display/test_render_mode.py diff --git a/bigframes/display/html.py b/bigframes/display/html.py index ef34985c8e8..4d439ff2d8e 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -361,7 +361,7 @@ def repr_mimebundle( if opts.repr_mode == "deferred": return repr_mimebundle_deferred(obj) - if opts.repr_mode == "anywidget": + if opts.render_mode == "anywidget" or opts.repr_mode == "anywidget": try: with bigframes.option_context("display.progress_bar", None): with warnings.catch_warnings(): @@ -380,4 +380,8 @@ def repr_mimebundle( f"Falling back to static HTML. 
Error: {traceback.format_exc()}" ) - return repr_mimebundle_head(obj) + bundle = repr_mimebundle_head(obj) + if opts.render_mode == "plaintext": + bundle.pop("text/html", None) + + return bundle diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index e9491610acf..a0efa571a7d 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -63,7 +63,7 @@ "outputs": [], "source": [ "bpd.options.bigquery.ordering_mode = \"partial\"\n", - "bpd.options.display.repr_mode = \"anywidget\"" + "bpd.options.display.render_mode = \"anywidget\"" ] }, { @@ -89,20 +89,10 @@ "id": "f289d250", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ - "✅ Completed. \n", + "\n", " Query processed 0 Bytes in a moment of slot time.\n", " " ], @@ -117,17 +107,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Lillian 99\n", - " AL F 1910 Ruby 204\n", - " AL F 1910 Helen 76\n", - " AL F 1910 Eunice 41\n", - " AR F 1910 Dora 42\n", - " CA F 1910 Edna 62\n", - " CA F 1910 Helen 239\n", - " CO F 1910 Alice 46\n", - " FL F 1910 Willie 71\n", - " FL F 1910 Thelma 65\n", + "state gender year name number\n", + " AL F 1910 Cora 61\n", + " AL F 1910 Anna 74\n", + " AR F 1910 Willie 132\n", + " CO F 1910 Anna 42\n", + " FL F 1910 Louise 70\n", + " GA F 1910 Catherine 57\n", + " IL F 1910 Jessie 43\n", + " IN F 1910 Anna 100\n", + " IN F 1910 Pauline 77\n", + " IN F 1910 Beulah 39\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -145,30 +135,10 @@ "id": "220340b0", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" 
- }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6fb22be7f21f4d1dacd76dc62a1a7818", + "model_id": "d75a0d81724f4776ae1a592369e78946", "version_major": 2, "version_minor": 1 }, @@ -204,80 +174,80 @@ " AL\n", " F\n", " 1910\n", - " Lillian\n", - " 99\n", + " Annie\n", + " 482\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Ruby\n", - " 204\n", + " Myrtle\n", + " 104\n", " \n", " \n", " 2\n", - " AL\n", + " AR\n", " F\n", " 1910\n", - " Helen\n", - " 76\n", + " Lillian\n", + " 56\n", " \n", " \n", " 3\n", - " AL\n", + " CT\n", " F\n", " 1910\n", - " Eunice\n", - " 41\n", + " Anne\n", + " 38\n", " \n", " \n", " 4\n", - " AR\n", + " CT\n", " F\n", " 1910\n", - " Dora\n", - " 42\n", + " Frances\n", + " 45\n", " \n", " \n", " 5\n", - " CA\n", + " FL\n", " F\n", " 1910\n", - " Edna\n", - " 62\n", + " Margaret\n", + " 53\n", " \n", " \n", " 6\n", - " CA\n", + " GA\n", " F\n", " 1910\n", - " Helen\n", - " 239\n", + " Mae\n", + " 73\n", " \n", " \n", " 7\n", - " CO\n", + " GA\n", " F\n", " 1910\n", - " Alice\n", - " 46\n", + " Beatrice\n", + " 96\n", " \n", " \n", " 8\n", - " FL\n", + " GA\n", " F\n", " 1910\n", - " Willie\n", - " 71\n", + " Lola\n", + " 47\n", " \n", " \n", " 9\n", - " FL\n", + " IA\n", " F\n", " 1910\n", - " Thelma\n", - " 65\n", + " Viola\n", + " 49\n", " \n", " \n", "\n", @@ -285,17 +255,17 @@ "[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Lillian 99\n", - " AL F 1910 Ruby 204\n", - " AL F 1910 Helen 76\n", - " AL F 1910 Eunice 41\n", - " AR F 1910 Dora 42\n", - " CA F 1910 Edna 62\n", - " CA F 1910 Helen 239\n", - " CO F 1910 Alice 46\n", - " FL F 1910 Willie 71\n", - " FL F 1910 Thelma 65\n", + "state gender year name number\n", + " AL F 1910 Annie 482\n", + " AL F 1910 Myrtle 104\n", + " AR F 1910 Lillian 56\n", + " CT F 1910 Anne 38\n", + " CT F 1910 Frances 45\n", + " FL F 1910 Margaret 53\n", + " GA F 1910 Mae 73\n", + " GA F 1910 Beatrice 96\n", + " 
GA F 1910 Lola 47\n", + " IA F 1910 Viola 49\n", "...\n", "\n", "[5552452 rows x 5 columns]" @@ -328,8 +298,8 @@ { "data": { "text/html": [ - "✅ Completed. \n", - " Query processed 171.4 MB in 41 seconds of slot time. [Job bigframes-dev:US.492b5260-9f44-495c-be09-2ae1324a986c details]\n", + "\n", + " Query processed 171.4 MB in 46 seconds of slot time. [Job bigframes-dev:US.dcf260e0-eaad-4979-9ec6-12f2436698e4 details]\n", " " ], "text/plain": [ @@ -342,7 +312,7 @@ { "data": { "text/html": [ - "✅ Completed. \n", + "\n", " Query processed 88.8 MB in a moment of slot time.\n", " " ], @@ -353,16 +323,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -404,38 +364,10 @@ "id": "da23e0f3", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_gsx0h2jHoOSYwqGKUS3lAYLf_qi3 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 88.8 MB in 3 seconds of slot time. [Job bigframes-dev:US.job_1VivAJ2InPdg5RXjWfvAJ1B0oxO3 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7d82208e7e5e40dd9dbf64c4c561cab3", + "model_id": "8e1b0e50cacb4315a231913b321cff55", "version_major": 2, "version_minor": 1 }, @@ -533,34 +465,6 @@ "id": "6920d49b", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 10 seconds of slot time. 
[Job bigframes-dev:US.job_cmNyG5sJ1IDCyFINx7teExQOZ6UQ details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 8 seconds of slot time. [Job bigframes-dev:US.job_aQvP3Sn04Ss4flSLaLhm0sKzFvrd details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -571,12 +475,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "52d11291ba1d42e6b544acbd86eef6cf", + "model_id": "b7f188a72de440359e402d8e41de26a9", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -648,34 +552,6 @@ "id": "a9d5d13a", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -686,12 +562,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "32c61c84740d45a0ac37202a76c7c14e", + "model_id": "cf507362c97b4ccf9084997d03d65290", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -734,8 +610,8 @@ { "data": { "text/html": [ - "✅ Completed. 
\n", - " Query processed 85.9 kB in 21 seconds of slot time.\n", + "\n", + " Query processed 85.9 kB in 28 seconds of slot time.\n", " " ], "text/plain": [ @@ -745,54 +621,10 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9d60a47296214553bb10c434b5ee8330", + "model_id": "b9dd4b812443455ba32ec71723331a10", "version_major": 2, "version_minor": 1 
}, @@ -858,16 +690,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " H05B 6/12\n", - " <NA>\n", - " 18165514.3\n", - " 03.04.2018\n", - " 30.03.2017\n", + " G06F 11/30\n", " <NA>\n", - " BSH Hausger√§te GmbH\n", - " Acero Acero, Jesus\n", - " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", - " EP 3 383 141 A2\n", + " 18157347.8\n", + " 19.02.2018\n", + " 31.03.2017\n", + " Hoffmann Eitle\n", + " FUJITSU LIMITED\n", + " Kukihara, Kensuke\n", + " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", + " EP 3 382 553 A1\n", " \n", " \n", " 2\n", @@ -876,16 +708,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " H01L 21/20\n", - " <NA>\n", - " 18166536.5\n", - " 16.02.2016\n", + " A01K 31/00\n", " <NA>\n", - " Scheider, Sascha et al\n", - " EV Group E. Thallner GmbH\n", - " Kurz, Florian\n", - " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", - " EP 3 382 744 A1\n", + " 18171005.4\n", + " 05.02.2015\n", + " 05.02.2014\n", + " Stork Bamberger Patentanw√§lte\n", + " Linco Food Systems A/S\n", + " Thrane, Uffe\n", + " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", + " EP 3 381 276 A1\n", " \n", " \n", " 3\n", @@ -894,16 +726,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " G06F 11/30\n", + " H05B 6/12\n", " <NA>\n", - " 18157347.8\n", - " 19.02.2018\n", - " 31.03.2017\n", - " Hoffmann Eitle\n", - " FUJITSU LIMITED\n", - " Kukihara, Kensuke\n", - " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", - " EP 3 382 553 A1\n", + " 18165514.3\n", + " 03.04.2018\n", + " 30.03.2017\n", + " <NA>\n", + " BSH Hausger√§te GmbH\n", + " Acero Acero, Jesus\n", + " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", + " EP 3 383 141 A2\n", " \n", " \n", " 4\n", @@ -912,16 +744,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " A01K 31/00\n", + " H01L 21/20\n", " <NA>\n", - " 18171005.4\n", - " 05.02.2015\n", - " 05.02.2014\n", - " Stork Bamberger Patentanw√§lte\n", - " Linco Food Systems A/S\n", - " Thrane, Uffe\n", - " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", - " EP 3 381 276 A1\n", + " 
18166536.5\n", + " 16.02.2016\n", + " <NA>\n", + " Scheider, Sascha et al\n", + " EV Group E. Thallner GmbH\n", + " Kurz, Florian\n", + " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", + " EP 3 382 744 A1\n", " \n", " \n", "\n", @@ -945,31 +777,31 @@ "\n", " publication_date class_international class_us application_number \\\n", "0 29.08.018 E04H 6/12 18157874.1 \n", - "1 03.10.2018 H05B 6/12 18165514.3 \n", - "2 03.10.2018 H01L 21/20 18166536.5 \n", - "3 03.10.2018 G06F 11/30 18157347.8 \n", - "4 03.10.2018 A01K 31/00 18171005.4 \n", + "1 03.10.2018 G06F 11/30 18157347.8 \n", + "2 03.10.2018 A01K 31/00 18171005.4 \n", + "3 03.10.2018 H05B 6/12 18165514.3 \n", + "4 03.10.2018 H01L 21/20 18166536.5 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", - "1 03.04.2018 30.03.2017 \n", - "2 16.02.2016 Scheider, Sascha et al \n", - "3 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "4 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", + "1 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "2 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", + "3 03.04.2018 30.03.2017 \n", + "4 16.02.2016 Scheider, Sascha et al \n", "\n", " applicant_line_1 inventor_line_1 \\\n", "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", - "1 BSH Hausger√§te GmbH Acero Acero, Jesus \n", - "2 EV Group E. Thallner GmbH Kurz, Florian \n", - "3 FUJITSU LIMITED Kukihara, Kensuke \n", - "4 Linco Food Systems A/S Thrane, Uffe \n", + "1 FUJITSU LIMITED Kukihara, Kensuke \n", + "2 Linco Food Systems A/S Thrane, Uffe \n", + "3 BSH Hausger√§te GmbH Acero Acero, Jesus \n", + "4 EV Group E. Thallner GmbH Kurz, Florian \n", "\n", " title_line_1 number \n", "0 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", - "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", - "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "3 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... 
EP 3 382 553 A1 \n", - "4 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "1 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "2 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "3 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", + "4 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", "\n", "[5 rows x 15 columns]" ] @@ -995,7 +827,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv", + "display_name": "venv (3.13.0)", "language": "python", "name": "python3" }, diff --git a/notebooks/ml/timeseries_analysis.ipynb b/notebooks/ml/timeseries_analysis.ipynb index 01c5a20efa3..84959b3632b 100644 --- a/notebooks/ml/timeseries_analysis.ipynb +++ b/notebooks/ml/timeseries_analysis.ipynb @@ -12,14 +12,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "c0b2db75", "metadata": {}, "outputs": [], "source": [ "import bigframes.pandas as bpd\n", "from bigframes.ml import forecasting\n", - "bpd.options.display.repr_mode = \"anywidget\"" + "bpd.options.display.render_mode = \"anywidget\"" ] }, { @@ -1113,7 +1113,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv", + "display_name": "venv (3.13.0)", "language": "python", "name": "python3" }, diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index b9f7b87f5ea..607d7f01941 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -70,7 +70,7 @@ def table_widget(paginated_bf_df: bigframes.dataframe.DataFrame): from bigframes.display import TableWidget with bigframes.option_context( - "display.repr_mode", "anywidget", "display.max_rows", 2 + "display.render_mode", "anywidget", "display.max_rows", 2 ): # Delay context manager cleanup of `max_rows` until after tests finish. 
yield TableWidget(paginated_bf_df) @@ -100,7 +100,7 @@ def small_widget(small_bf_df): """Helper fixture for tests using a DataFrame smaller than the page size.""" from bigframes.display import TableWidget - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 5): + with bf.option_context("display.render_mode", "anywidget", "display.max_rows", 5): yield TableWidget(small_bf_df) @@ -126,7 +126,9 @@ def unknown_row_count_widget(session): mock_batches.return_value = blocks.PandasBatches( batches_iterator, total_rows=None ) - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): + with bf.option_context( + "display.render_mode", "anywidget", "display.max_rows", 2 + ): widget = TableWidget(bf_df) yield widget @@ -206,7 +208,7 @@ def test_widget_initialization_should_calculate_total_row_count( from bigframes.display import TableWidget with bigframes.option_context( - "display.repr_mode", "anywidget", "display.max_rows", 2 + "display.render_mode", "anywidget", "display.max_rows", 2 ): widget = TableWidget(paginated_bf_df) @@ -316,7 +318,7 @@ def test_widget_pagination_should_work_with_custom_page_size( ): """Test that a widget paginates correctly with a custom page size.""" with bigframes.option_context( - "display.repr_mode", "anywidget", "display.max_rows", 3 + "display.render_mode", "anywidget", "display.max_rows", 3 ): from bigframes.display import TableWidget @@ -370,7 +372,7 @@ def test_global_options_change_should_not_affect_existing_widget_page_size( then the widget's page size should remain unchanged. """ with bigframes.option_context( - "display.repr_mode", "anywidget", "display.max_rows", 2 + "display.render_mode", "anywidget", "display.max_rows", 2 ): from bigframes.display import TableWidget @@ -395,7 +397,7 @@ def test_widget_with_empty_dataframe_should_have_zero_row_count( then its row_count should be 0. 
""" - with bigframes.option_context("display.repr_mode", "anywidget"): + with bigframes.option_context("display.render_mode", "anywidget"): from bigframes.display import TableWidget widget = TableWidget(empty_bf_df) @@ -407,28 +409,17 @@ def test_widget_with_empty_dataframe_should_render_table_headers( empty_bf_df: bf.dataframe.DataFrame, ): """ - - Given an empty DataFrame, - - when a widget is created from it, - - then its HTML representation should still render the table headers. - - """ - with bigframes.option_context("display.repr_mode", "anywidget"): + with bigframes.option_context("display.render_mode", "anywidget"): from bigframes.display import TableWidget widget = TableWidget(empty_bf_df) - html = widget.table_html - assert ">> import bigframes.pandas as bpd >>> bpd.options.display.repr_mode = "deferred" # doctest: +SKIP """ + render_mode: Literal["plaintext", "html", "anywidget"] = "html" + """ + Determines how to visualize a DataFrame or Series. Default "html". + + `plaintext` + Display as plain text. + + `html` + Display as HTML table. + + `anywidget` + Display as interactive widget using `anywidget` library. + """ + max_colwidth: Optional[int] = 50 """ The maximum width in characters of a column in the repr. Default 50. From aa3a73f73a62d0e32d0f20a6830540e8db0d3b90 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 26 Feb 2026 15:45:01 -0800 Subject: [PATCH 20/29] chore: modularize GEMINI.md file (#2479) People can write specific instructions for different tasks. And only import the specific ones needed. https://geminicli.com/docs/reference/memport/ I will add GEMINI.md to .gitignore in next PR so we don't conflict with each other. 
--- .gemini/common/constraints.md | 8 ++ .gemini/common/docs.md | 9 +++ .gemini/tasks/scalar_op.md | 67 ++++++++++++++++ .gemini/tools/style_nox.md | 18 +++++ .gemini/tools/test_docs.md | 10 +++ .gemini/tools/test_nox.md | 28 +++++++ .gemini/tools/test_pytest.md | 9 +++ .gitignore | 3 + GEMINI.md | 147 +--------------------------------- 9 files changed, 154 insertions(+), 145 deletions(-) create mode 100644 .gemini/common/constraints.md create mode 100644 .gemini/common/docs.md create mode 100644 .gemini/tasks/scalar_op.md create mode 100644 .gemini/tools/style_nox.md create mode 100644 .gemini/tools/test_docs.md create mode 100644 .gemini/tools/test_nox.md create mode 100644 .gemini/tools/test_pytest.md diff --git a/.gemini/common/constraints.md b/.gemini/common/constraints.md new file mode 100644 index 00000000000..1b563eab3b8 --- /dev/null +++ b/.gemini/common/constraints.md @@ -0,0 +1,8 @@ +## Constraints + +- Only add git commits. Do not change git history. +- Follow the spec file for development. + - Check off items in the "Acceptance + criteria" and "Detailed steps" sections with `[x]`. + - Please do this as they are completed. + - Refer back to the spec after each step. diff --git a/.gemini/common/docs.md b/.gemini/common/docs.md new file mode 100644 index 00000000000..1a718005ad7 --- /dev/null +++ b/.gemini/common/docs.md @@ -0,0 +1,9 @@ +## Documentation + +If a method or property is implementing the same interface as a third-party +package such as pandas or scikit-learn, place the relevant docstring in the +corresponding `third_party/bigframes_vendored/package_name` directory, not in +the `bigframes` directory. Implementations may be placed in the `bigframes` +directory, though. 
+ +@../tools/test_docs.md diff --git a/.gemini/tasks/scalar_op.md b/.gemini/tasks/scalar_op.md new file mode 100644 index 00000000000..a9318d54824 --- /dev/null +++ b/.gemini/tasks/scalar_op.md @@ -0,0 +1,67 @@ +## Adding a scalar operator + +For an example, see commit +[c5b7fdae74a22e581f7705bc0cf5390e928f4425](https://github.com/googleapis/python-bigquery-dataframes/commit/c5b7fdae74a22e581f7705bc0cf5390e928f4425). + +To add a new scalar operator, follow these steps: + +1. **Define the operation dataclass:** + - In `bigframes/operations/`, find the relevant file (e.g., `geo_ops.py` for geography functions) or create a new one. + - Create a new dataclass inheriting from `base_ops.UnaryOp` for unary + operators, `base_ops.BinaryOp` for binary operators, `base_ops.TernaryOp` + for ternary operators, or `base_ops.NaryOp for operators with many + arguments. Note that these operators are counting the number column-like + arguments. A function that takes only a single column but several literal + values would still be a `UnaryOp`. + - Define the `name` of the operation and any parameters it requires. + - Implement the `output_type` method to specify the data type of the result. + +2. **Export the new operation:** + - In `bigframes/operations/__init__.py`, import your new operation dataclass and add it to the `__all__` list. + +3. **Implement the user-facing function (pandas-like):** + + - Identify the canonical function from pandas / geopandas / awkward array / + other popular Python package that this operator implements. + - Find the corresponding class in BigFrames. For example, the implementation + for most geopandas.GeoSeries methods is in + `bigframes/geopandas/geoseries.py`. Pandas Series methods are implemented + in `bigframes/series.py` or one of the accessors, such as `StringMethods` + in `bigframes/operations/strings.py`. + - Create the user-facing function that will be called by users (e.g., `length`). 
+ - If the SQL method differs from pandas or geopandas in a way that can't be + made the same, raise a `NotImplementedError` with an appropriate message and + link to the feedback form. + - Add the docstring to the corresponding file in + `third_party/bigframes_vendored`, modeled after pandas / geopandas. + +4. **Implement the user-facing function (SQL-like):** + + - In `bigframes/bigquery/_operations/`, find the relevant file (e.g., `geo.py`) or create a new one. + - Create the user-facing function that will be called by users (e.g., `st_length`). + - This function should take a `Series` for any column-like inputs, plus any other parameters. + - Inside the function, call `series._apply_unary_op`, + `series._apply_binary_op`, or similar passing the operation dataclass you + created. + - Add a comprehensive docstring with examples. + - In `bigframes/bigquery/__init__.py`, import your new user-facing function and add it to the `__all__` list. + +5. **Implement the compilation logic:** + - In `bigframes/core/compile/scalar_op_compiler.py`: + - If the BigQuery function has a direct equivalent in Ibis, you can often reuse an existing Ibis method. + - If not, define a new Ibis UDF using `@ibis_udf.scalar.builtin` to map to the specific BigQuery function signature. + - Create a new compiler implementation function (e.g., `geo_length_op_impl`). + - Register this function to your operation dataclass using `@scalar_op_compiler.register_unary_op` or `@scalar_op_compiler.register_binary_op`. + - This implementation will translate the BigQuery DataFrames operation into the appropriate Ibis expression. + +6. **Add Tests:** + - Add system tests in the `tests/system/` directory to verify the end-to-end + functionality of the new operator. Test various inputs, including edge cases + and `NULL` values. + + Where possible, run the same test code against pandas or GeoPandas and + compare that the outputs are the same (except for dtypes if BigFrames + differs from pandas). 
+ - If you are overriding a pandas or GeoPandas property, add a unit test to + ensure the correct behavior (e.g., raising `NotImplementedError` if the + functionality is not supported). diff --git a/.gemini/tools/style_nox.md b/.gemini/tools/style_nox.md new file mode 100644 index 00000000000..894fd102363 --- /dev/null +++ b/.gemini/tools/style_nox.md @@ -0,0 +1,18 @@ +## Code Style with nox + +- We use the automatic code formatter `black`. You can run it using + the nox session `format`. This will eliminate many lint errors. Run via: + + ```bash + nox -r -s format + ``` + +- PEP8 compliance is required, with exceptions defined in the linter configuration. + If you have ``nox`` installed, you can test that you have not introduced + any non-compliant code via: + + ``` + nox -r -s lint + ``` + +- When writing tests, use the idiomatic "pytest" style. diff --git a/.gemini/tools/test_docs.md b/.gemini/tools/test_docs.md new file mode 100644 index 00000000000..5cb988186c7 --- /dev/null +++ b/.gemini/tools/test_docs.md @@ -0,0 +1,10 @@ +## Testing code samples + +Code samples are very important for accurate documentation. We use the "doctest" +framework to ensure the samples are functioning as expected. After adding a code +sample, please ensure it is correct by running doctest. To run the samples +doctests for just a single method, refer to the following example: + +```bash +pytest --doctest-modules bigframes/pandas/__init__.py::bigframes.pandas.cut +``` diff --git a/.gemini/tools/test_nox.md b/.gemini/tools/test_nox.md new file mode 100644 index 00000000000..023ada1b61f --- /dev/null +++ b/.gemini/tools/test_nox.md @@ -0,0 +1,28 @@ +## Testing with nox + +Use `nox` to instrument our tests. + +- To test your changes, run unit tests with `nox`: + + ```bash + nox -r -s unit + ``` + +- To run a single unit test: + + ```bash + nox -r -s unit-3.14 -- -k + ``` + +- Ignore this step if you lack access to Google Cloud resources. 
To run system + tests, you can execute:: + + # Run all system tests + $ nox -r -s system + + # Run a single system test + $ nox -r -s system-3.14 -- -k + +- The codebase must have better coverage than it had previously after each + change. You can test coverage via `nox -s unit system cover` (takes a long + time). Omit `system` if you lack access to cloud resources. diff --git a/.gemini/tools/test_pytest.md b/.gemini/tools/test_pytest.md new file mode 100644 index 00000000000..5228ae06ba8 --- /dev/null +++ b/.gemini/tools/test_pytest.md @@ -0,0 +1,9 @@ +## Testing with pytest + +Use `pytest` to instrument our tests. + +- To test your changes, run `pytest`: + + ```bash + pytest :: + ``` diff --git a/.gitignore b/.gitignore index 52dcccd33d8..6b157559ccd 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,6 @@ pylintrc pylintrc.test dummy.pkl .mypy_cache/ + +# Gemini +GEMINI.md diff --git a/GEMINI.md b/GEMINI.md index 1c8cff33870..4de59125272 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -1,148 +1,5 @@ # Contribution guidelines, tailored for LLM agents -## Testing +@.gemini/common/docs.md -We use `nox` to instrument our tests. - -- To test your changes, run unit tests with `nox`: - - ```bash - nox -r -s unit - ``` - -- To run a single unit test: - - ```bash - nox -r -s unit-3.14 -- -k - ``` - -- Ignore this step if you lack access to Google Cloud resources. To run system - tests, you can execute:: - - # Run all system tests - $ nox -r -s system - - # Run a single system test - $ nox -r -s system-3.14 -- -k - -- The codebase must have better coverage than it had previously after each - change. You can test coverage via `nox -s unit system cover` (takes a long - time). Omit `system` if you lack access to cloud resources. - -## Code Style - -- We use the automatic code formatter `black`. You can run it using - the nox session `format`. This will eliminate many lint errors. 
Run via: - - ```bash - nox -r -s format - ``` - -- PEP8 compliance is required, with exceptions defined in the linter configuration. - If you have ``nox`` installed, you can test that you have not introduced - any non-compliant code via: - - ``` - nox -r -s lint - ``` - -- When writing tests, use the idiomatic "pytest" style. - -## Documentation - -If a method or property is implementing the same interface as a third-party -package such as pandas or scikit-learn, place the relevant docstring in the -corresponding `third_party/bigframes_vendored/package_name` directory, not in -the `bigframes` directory. Implementations may be placed in the `bigframes` -directory, though. - -### Testing code samples - -Code samples are very important for accurate documentation. We use the "doctest" -framework to ensure the samples are functioning as expected. After adding a code -sample, please ensure it is correct by running doctest. To run the samples -doctests for just a single method, refer to the following example: - -```bash -pytest --doctest-modules bigframes/pandas/__init__.py::bigframes.pandas.cut -``` - -## Tips for implementing common BigFrames features - -### Adding a scalar operator - -For an example, see commit -[c5b7fdae74a22e581f7705bc0cf5390e928f4425](https://github.com/googleapis/python-bigquery-dataframes/commit/c5b7fdae74a22e581f7705bc0cf5390e928f4425). - -To add a new scalar operator, follow these steps: - -1. **Define the operation dataclass:** - - In `bigframes/operations/`, find the relevant file (e.g., `geo_ops.py` for geography functions) or create a new one. - - Create a new dataclass inheriting from `base_ops.UnaryOp` for unary - operators, `base_ops.BinaryOp` for binary operators, `base_ops.TernaryOp` - for ternary operators, or `base_ops.NaryOp for operators with many - arguments. Note that these operators are counting the number column-like - arguments. A function that takes only a single column but several literal - values would still be a `UnaryOp`. 
- - Define the `name` of the operation and any parameters it requires. - - Implement the `output_type` method to specify the data type of the result. - -2. **Export the new operation:** - - In `bigframes/operations/__init__.py`, import your new operation dataclass and add it to the `__all__` list. - -3. **Implement the user-facing function (pandas-like):** - - - Identify the canonical function from pandas / geopandas / awkward array / - other popular Python package that this operator implements. - - Find the corresponding class in BigFrames. For example, the implementation - for most geopandas.GeoSeries methods is in - `bigframes/geopandas/geoseries.py`. Pandas Series methods are implemented - in `bigframes/series.py` or one of the accessors, such as `StringMethods` - in `bigframes/operations/strings.py`. - - Create the user-facing function that will be called by users (e.g., `length`). - - If the SQL method differs from pandas or geopandas in a way that can't be - made the same, raise a `NotImplementedError` with an appropriate message and - link to the feedback form. - - Add the docstring to the corresponding file in - `third_party/bigframes_vendored`, modeled after pandas / geopandas. - -4. **Implement the user-facing function (SQL-like):** - - - In `bigframes/bigquery/_operations/`, find the relevant file (e.g., `geo.py`) or create a new one. - - Create the user-facing function that will be called by users (e.g., `st_length`). - - This function should take a `Series` for any column-like inputs, plus any other parameters. - - Inside the function, call `series._apply_unary_op`, - `series._apply_binary_op`, or similar passing the operation dataclass you - created. - - Add a comprehensive docstring with examples. - - In `bigframes/bigquery/__init__.py`, import your new user-facing function and add it to the `__all__` list. - -5. 
**Implement the compilation logic:** - - In `bigframes/core/compile/scalar_op_compiler.py`: - - If the BigQuery function has a direct equivalent in Ibis, you can often reuse an existing Ibis method. - - If not, define a new Ibis UDF using `@ibis_udf.scalar.builtin` to map to the specific BigQuery function signature. - - Create a new compiler implementation function (e.g., `geo_length_op_impl`). - - Register this function to your operation dataclass using `@scalar_op_compiler.register_unary_op` or `@scalar_op_compiler.register_binary_op`. - - This implementation will translate the BigQuery DataFrames operation into the appropriate Ibis expression. - -6. **Add Tests:** - - Add system tests in the `tests/system/` directory to verify the end-to-end - functionality of the new operator. Test various inputs, including edge cases - and `NULL` values. - - Where possible, run the same test code against pandas or GeoPandas and - compare that the outputs are the same (except for dtypes if BigFrames - differs from pandas). - - If you are overriding a pandas or GeoPandas property, add a unit test to - ensure the correct behavior (e.g., raising `NotImplementedError` if the - functionality is not supported). - - -## Constraints - -- Only add git commits. Do not change git history. -- Follow the spec file for development. - - Check off items in the "Acceptance - criteria" and "Detailed steps" sections with `[x]`. - - Please do this as they are completed. - - Refer back to the spec after each step. 
+@.gemini/common/constraints.md From a6f499c1e225a962b53621158f9d4a19ca220ccd Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 26 Feb 2026 16:37:35 -0800 Subject: [PATCH 21/29] docs: Fix recall_score doc example (#2477) --- .../bigframes_vendored/sklearn/metrics/_classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index e60cc8cec49..085388b0456 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -135,10 +135,10 @@ def recall_score( >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) >>> recall_score = bigframes.ml.metrics.recall_score(y_true, y_pred, average=None) >>> recall_score - 0 1 - 1 0 - 2 0 - dtype: int64 + 0 1.0 + 1 0.0 + 2 0.0 + dtype: float64 Args: From 01dc5a34e09171351575d5cbdc9f301e505e1567 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 26 Feb 2026 17:16:02 -0800 Subject: [PATCH 22/29] fix: upload local data through write API if nested JSONs detected (#2478) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes internal issue 487027061 🦕 --- bigframes/session/bq_caching_executor.py | 2 +- bigframes/session/loader.py | 44 +++++++++++++++++------- tests/system/small/test_session.py | 8 +++-- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 943eee0c12d..bcb3ae60e3b 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -604,7 +604,7 @@ def _upload_local_data(self, local_table: local_data.ManagedArrowTable): # Might be better as a queue and a worker thread with self._upload_lock: if local_table not in self.cache._uploaded_local_data: - uploaded = self.loader.load_data( + uploaded = 
self.loader.load_data_or_write_data( local_table, bigframes.core.guid.generate_guid() ) self.cache.cache_remote_replacement(local_table, uploaded) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 9d222a3755a..0944c0dab6f 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -350,16 +350,38 @@ def read_managed_data( session=self._session, ) + def load_data_or_write_data( + self, + data: local_data.ManagedArrowTable, + offsets_col: str, + ) -> bq_data.BigqueryDataSource: + """Write local data into BigQuery using the local API if possible, + otherwise use the write API.""" + can_load = all( + _is_dtype_can_load(item.column, item.dtype) for item in data.schema.items + ) + if can_load: + return self.load_data(data, offsets_col=offsets_col) + else: + return self.write_data(data, offsets_col=offsets_col) + def load_data( self, data: local_data.ManagedArrowTable, offsets_col: str, ) -> bq_data.BigqueryDataSource: """Load managed data into bigquery""" - - # JSON support incomplete - for item in data.schema.items: - _validate_dtype_can_load(item.column, item.dtype) + cannot_load_columns = { + item.column: item.dtype + for item in data.schema.items + if not _is_dtype_can_load(item.column, item.dtype) + } + + if cannot_load_columns: + raise NotImplementedError( + f"Nested JSON types are currently unsupported for BigQuery Load API. " + f"Unsupported columns: {cannot_load_columns}. {constants.FEEDBACK_LINK}" + ) schema_w_offsets = data.schema.append( schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE) @@ -1474,7 +1496,7 @@ def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: return configuration -def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype): +def _is_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype) -> bool: """ Determines whether a datatype is supported by bq load jobs. 
@@ -1482,23 +1504,19 @@ def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype): we're using a workaround: storing JSON as strings and then parsing them into JSON objects. TODO(b/395912450): Remove workaround solution once b/374784249 got resolved. - - Raises: - NotImplementedError: Type is not yet supported by load jobs. """ # we can handle top-level json, but not nested yet through string conversion if column_type == bigframes.dtypes.JSON_DTYPE: - return + return True if isinstance( column_type, pandas.ArrowDtype ) and bigframes.dtypes.contains_db_dtypes_json_arrow_type( column_type.pyarrow_dtype ): - raise NotImplementedError( - f"Nested JSON types, found in column `{name}`: `{column_type}`', " - f"are currently unsupported for upload. {constants.FEEDBACK_LINK}" - ) + return False + + return True # itertools.batched not available in python <3.12, so we use this instead diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 922f73a0ce1..2fa633a62ba 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1092,7 +1092,9 @@ def test_read_pandas_w_nested_json_fails(session, write_engine): pa.list_(pa.struct([("json_field", bigframes.dtypes.JSON_ARROW_TYPE)])) ), ) - with pytest.raises(NotImplementedError, match="Nested JSON types, found in column"): + with pytest.raises( + NotImplementedError, match="Nested JSON types are currently unsupported" + ): session.read_pandas(pd_s, write_engine=write_engine) @@ -1178,7 +1180,9 @@ def test_read_pandas_w_nested_json_index_fails(session, write_engine): pa.list_(pa.struct([("json_field", bigframes.dtypes.JSON_ARROW_TYPE)])) ), ) - with pytest.raises(NotImplementedError, match="Nested JSON types, found in"): + with pytest.raises( + NotImplementedError, match="Nested JSON types are currently unsupported" + ): session.read_pandas(pd_idx, write_engine=write_engine) From 05d11b7fa082969cceed98d177103fe5c111377c Mon Sep 17 00:00:00 2001 
From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Fri, 27 Feb 2026 14:18:58 -0800 Subject: [PATCH 23/29] tests: add multimodal snippets tests (#2476) code pieces for https://docs.cloud.google.com/bigquery/docs/multimodal-data-sql-tutorial --- samples/snippets/multimodal_test.py | 401 ++++++++++++++++++++++++++++ 1 file changed, 401 insertions(+) diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py index 033fead33e0..ce04d511346 100644 --- a/samples/snippets/multimodal_test.py +++ b/samples/snippets/multimodal_test.py @@ -123,3 +123,404 @@ def test_multimodal_dataframe(gcs_bucket_snippets: str) -> None: assert answer_alt is not None assert embeddings is not None assert chunked is not None + + +def test_multimodal_example(gcs_bucket_snippets: str) -> None: + BUCKET = gcs_bucket_snippets + # [START bigquery_dataframes_multimodal_load] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + bbq.load_data( + "cymbal_pets.products", + write_disposition="OVERWRITE", + from_files_options={ + "format": "avro", + "uris": [ + "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/tables/products/products_*.avro" + ], + }, + ) + # [END bigquery_dataframes_multimodal_load] + + # [START bigquery_dataframes_multimodal_create_images] + bbq.create_external_table( + "cymbal_pets.product_images", + replace=True, + connection_name="us.cymbal_conn", + options={ + "object_metadata": "SIMPLE", + "uris": [ + "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*.png" + ], + }, + ) + # [END bigquery_dataframes_multimodal_create_images] + + # [START bigquery_dataframes_multimodal_create_manuals] + bbq.create_external_table( + "cymbal_pets.product_manuals", + replace=True, + connection_name="us.cymbal_conn", + options={ + "object_metadata": "SIMPLE", + "uris": [ + "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*.pdf" + ], + }, + ) + # [END bigquery_dataframes_multimodal_create_manuals] + + # 
[START bigquery_dataframes_multimodal_create_gemini] + gemini_model = bbq.ml.create_model( + "cymbal_pets.gemini", + replace=True, + connection_name="us.cymbal_conn", + options={"endpoint": "gemini-2.5-flash"}, + ) + # [END bigquery_dataframes_multimodal_create_gemini] + + # [START bigquery_dataframes_multimodal_create_embedding] + embedding_model = bbq.ml.create_model( + "cymbal_pets.embedding_model", + replace=True, + connection_name="us.cymbal_conn", + options={"endpoint": "multimodalembedding@001"}, + ) + # [END bigquery_dataframes_multimodal_create_embedding] + + # [START bigquery_dataframes_multimodal_create_df_products_mm] + df_images = bpd.read_gbq("SELECT * FROM cymbal_pets.product_images") + df_products = bpd.read_gbq("cymbal_pets.products") + + df_products_mm = df_images.merge(df_products, on="uri").drop(columns="uri") + df_products_mm = df_products_mm.rename(columns={"ref": "image"}) + # [END bigquery_dataframes_multimodal_create_df_products_mm] + + # [START bigquery_dataframes_multimodal_show_df_products_mm] + df_products_mm[["product_name", "image"]] + # [END bigquery_dataframes_multimodal_show_df_products_mm] + + # [START bigquery_dataframes_multimodal_image_description] + df_products_mm["url"] = bbq.obj.get_access_url( + df_products_mm["image"], "R" + ).to_frame() + df_products_mm["prompt0"] = "Can you describe the following image?" 
+ + df_products_mm["prompt"] = bbq.struct(df_products_mm[["prompt0", "url"]]) + df_products_mm = bbq.ai.generate_table( + gemini_model, df_products_mm, output_schema={"image_description": "STRING"} + ) + + df_products_mm = df_products_mm[ + [ + "product_id", + "product_name", + "brand", + "category", + "subcategory", + "animal_type", + "search_keywords", + "price", + "description", + "inventory_level", + "supplier_id", + "average_rating", + "image", + "image_description", + ] + ] + # [END bigquery_dataframes_multimodal_image_description] + + # [START bigquery_dataframes_multimodal_generate_animal_type] + df_prompt = bbq.obj.get_access_url(df_products_mm["image"], "R").to_frame() + df_prompt[ + "prompt0" + ] = "For the image of a pet product, concisely generate the following metadata: 1) animal_type and 2) 5 SEO search keywords, and 3) product subcategory." + + df_products_mm["prompt"] = bbq.struct(df_prompt[["prompt0", "image"]]) + + df_products_mm = df_products_mm.drop( + columns=["animal_type", "search_keywords", "subcategory"] + ) + df_products_mm = bbq.ai.generate_table( + gemini_model, + df_products_mm, + output_schema="animal_type STRING, search_keywords ARRAY, subcategory STRING", + ) + # [END bigquery_dataframes_multimodal_generate_animal_type] + + # [START bigquery_dataframes_multimodal_show_animal_type] + df_products_mm[ + [ + "product_name", + "image_description", + "animal_type", + "search_keywords", + "subcategory", + ] + ] + # [END bigquery_dataframes_multimodal_show_animal_type] + + # [START bigquery_dataframes_multimodal_brand_description] + df_agg = df_products_mm[ + ["image", "description", "category", "subcategory", "brand"] + ] + df_agg["image"] = bbq.obj.get_access_url(df_products_mm["image"], "R") + df_agg = bbq.array_agg(df_agg.groupby(by=["brand"])) + + df_agg["cnt"] = bbq.array_length(df_agg["image"]) + + df_prompt = df_agg[["image", "description", "category", "subcategory"]] + df_prompt[ + "prompt0" + ] = "Use the images and text to give 
one concise brand description for a website brand page. Return the description only. " + + df_agg["prompt"] = bbq.struct( + df_prompt[["prompt0", "image", "description", "category", "subcategory"]] + ) + + df_agg = df_agg.reset_index() + + df_agg = bbq.ai.generate_table( + gemini_model, df_agg, output_schema={"brand_description": "STRING"} + ) + df_agg[["brand", "brand_description", "cnt"]] + # [END bigquery_dataframes_multimodal_brand_description] + + # [START bigquery_dataframes_multimodal_define_to_grayscale] + @bpd.udf( + dataset="cymbal_pets", + name="to_grayscale", + packages=["numpy", "opencv-python"], + bigquery_connection="us.cymbal_conn", + max_batching_rows=1, + ) + def to_grayscale(src_ref: str, dst_ref: str) -> str: + import json + from urllib.request import Request, urlopen + + import cv2 as cv + import numpy as np + + src_json = json.loads(src_ref) + srcUrl = src_json["access_urls"]["read_url"] + + dst_json = json.loads(dst_ref) + dstUrl = dst_json["access_urls"]["write_url"] + + req = urlopen(srcUrl) + arr = np.asarray(bytearray(req.read()), dtype=np.uint8) + img = cv.imdecode(arr, -1) # 'Load it as it is' + + # Convert the image to grayscale + gray_image = cv.cvtColor(img, cv.COLOR_BGR2GRAY) + + # Send POST request to the URL + _, img_encoded = cv.imencode(".png", gray_image) + + req = Request( + url=dstUrl, + data=img_encoded.tobytes(), + method="PUT", + headers={ + "Content-Type": "image/png", + }, + ) + with urlopen(req): + pass + return dst_ref + + # [END bigquery_dataframes_multimodal_define_to_grayscale] + + # [START bigquery_dataframes_multimodal_apply_to_grayscale] + df_grayscale = df_products_mm[["product_id", "product_name", "image"]] + df_grayscale[ + "gray_image_uri" + ] = f"gs://{BUCKET}/cymbal-pets-images/grayscale/" + df_grayscale[ + "image" + ].struct.field( + "uri" + ).str.extract( + r"([^/]+)$" + ) + + df_grayscale["gray_image"] = bbq.obj.make_ref( + df_grayscale["gray_image_uri"], "us.cymbal_conn" + ) + + 
df_grayscale["image_url"] = bbq.to_json_string( + bbq.obj.get_access_url(df_grayscale["image"], "r") + ) + df_grayscale["gray_image_url"] = bbq.to_json_string( + bbq.obj.get_access_url(df_grayscale["gray_image"], "rw") + ) + + df_grayscale[["image_url", "gray_image_url"]].apply(to_grayscale, axis=1) + # [END bigquery_dataframes_multimodal_apply_to_grayscale] + + # [START bigquery_dataframes_multimodal_define_chunk_pdf] + @bpd.udf( + dataset="cymbal_pets", + name="chunk_pdf", + packages=["pypdf"], + bigquery_connection="us.cymbal_conn", + max_batching_rows=1, + ) + def chunk_pdf(src_ref: str, chunk_size: int, overlap_size: int) -> list[str]: + import io + import json + from urllib.request import urlopen + + from pypdf import PdfReader # type: ignore + + src_json = json.loads(src_ref) + srcUrl = src_json["access_urls"]["read_url"] + + req = urlopen(srcUrl) + pdf_file = io.BytesIO(bytearray(req.read())) + reader = PdfReader(pdf_file, strict=False) + + # extract and chunk text simultaneously + all_text_chunks = [] + curr_chunk = "" + for page in reader.pages: + page_text = page.extract_text() + if page_text: + curr_chunk += page_text + # split the accumulated text into chunks of a specific size with overlaop + # this loop implements a sliding window approach to create chunks + while len(curr_chunk) >= chunk_size: + split_idx = curr_chunk.rfind(" ", 0, chunk_size) + if split_idx == -1: + split_idx = chunk_size + actual_chunk = curr_chunk[:split_idx] + all_text_chunks.append(actual_chunk) + overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size] + curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :] + if curr_chunk: + all_text_chunks.append(curr_chunk) + + return all_text_chunks + + # [END bigquery_dataframes_multimodal_define_chunk_pdf] + + # [START bigquery_dataframes_multimodal_apply_chunk_pdf] + df_manuals = bpd.read_gbq("SELECT * FROM cymbal_pets.product_manuals") + df_manuals["url"] = bbq.to_json_string( + 
bbq.obj.get_access_url(df_manuals["ref"], "R") + ) + + df_manuals["chunk_size"] = 1000 + df_manuals["overlap_size"] = 100 + + df_manuals["chunked"] = df_manuals[["url", "chunk_size", "overlap_size"]].apply( + chunk_pdf, axis=1 + ) + # [END bigquery_dataframes_multimodal_apply_chunk_pdf] + + # [START bigquery_dataframes_multimodal_analyze_pdf] + df_chunked = df_manuals["chunked"].explode().to_frame() + df_chunked[ + "prompt0" + ] = "Can you summarize the product manual as bullet points? Highlight the legal clauses" + + df_chunked["prompt"] = bbq.struct(df_chunked[["prompt0", "chunked"]]) + + result = bbq.ai.generate_text(gemini_model, df_chunked["prompt"]) + result + # [END bigquery_dataframes_multimodal_analyze_pdf] + + # [START bigquery_dataframes_multimodal_create_embed_table] + df_products_mm["content"] = bbq.obj.get_access_url(df_products_mm["image"], "R") + df_embed = bbq.ai.generate_embedding( + embedding_model, df_products_mm[["content", "product_id"]] + ) + + df_embed.to_gbq("cymbal_pets.products_embedding", if_exists="replace") + # [END bigquery_dataframes_multimodal_create_embed_table] + + # [START bigquery_dataframes_multimodal_vector_search] + df_image = bpd.DataFrame( + { + "uri": [ + "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/cozy-naps-cat-scratching-post-with-condo.png" + ] + } + ).cache() + df_image["image"] = bbq.obj.make_ref(df_image["uri"], "us.cymbal_conn") + df_search = bbq.ai.generate_embedding( + embedding_model, + bbq.obj.get_access_url(bbq.obj.fetch_metadata(df_image["image"]), "R"), + ) + + search_result = bbq.vector_search( + "cymbal_pets.products_embedding", "embedding", df_search["embedding"] + ) + search_result + # [END bigquery_dataframes_multimodal_vector_search] + + # [START bigquery_dataframes_create_external_table_all] + bbq.create_external_table( + "cymbal_pets.product_manuals_all", + replace=True, + connection_name="us.cymbal_conn", + options={ + "object_metadata": "SIMPLE", + "uris": [ + 
"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*.pdf", + "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/document_chunks/*.pdf", + ], + }, + ) + # [END bigquery_dataframes_create_external_table_all] + + # [START bigquery_dataframes_create_manual_to_chunks] + df1 = bpd.read_gbq("SELECT * FROM cymbal_pets.product_manuals_all").sort_values( + "uri" + ) + df2 = df1.copy() + df1["name"] = df1["uri"].str.extract(r".*/([^.]*).[^/]+") + df2["name"] = df2["uri"].str.extract(r".*/([^.]*)_page[0-9]+.[^/]+") + df_manuals_all = df1.merge(df2, on="name") + df_manuals_agg = ( + bbq.array_agg(df_manuals_all[["ref_x", "uri_x"]].groupby("uri_x"))["ref_x"] + .str[0] + .to_frame() + ) + df_manuals_agg["chunks"] = bbq.array_agg( + df_manuals_all[["ref_y", "uri_x"]].groupby("uri_x") + )["ref_y"] + # [END bigquery_dataframes_create_manual_to_chunks] + + # [START bigquery_dataframes_show_manual_to_chunks] + df_manuals_agg + # [END bigquery_dataframes_show_manual_to_chunks] + + # [START bigquery_dataframes_generate_pages_summary] + df_manuals_agg["chunks_url"] = bbq.array_agg( + bbq.obj.get_access_url(df_manuals_agg.explode("chunks")["chunks"], "R").groupby( + "uri_x" + ) + ) + df_manuals_agg[ + "prompt0" + ] = "Can you provide a page by page summary for the first 3 pages of the attached manual? Only write one line for each page. 
The pages are provided in serial order" + df_manuals_agg["prompt"] = bbq.struct(df_manuals_agg[["prompt0", "chunks_url"]]) + + result = bbq.ai.generate_text(gemini_model, df_manuals_agg["prompt"])["result"] + result + # [END bigquery_dataframes_generate_pages_summary] + + # [START bigquery_dataframes_generate_each_page_summary] + result = bbq.ai.generate_table( + gemini_model, + df_manuals_agg["prompt"], + output_schema={ + "page1_summary": "STRING", + "page2_summary": "STRING", + "page3_summary": "STRING", + }, + )[["page1_summary", "page2_summary", "page3_summary"]] + result + # [END bigquery_dataframes_generate_each_page_summary] From 34f76c70fddef89adc4f08e7f9a8745c63c503d7 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 2 Mar 2026 11:05:57 -0800 Subject: [PATCH 24/29] refactor: remove the googlesql module (#2482) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes internal issue 488047595🦕 --- bigframes/bigquery/_operations/sql.py | 2 +- bigframes/core/compile/compiled.py | 26 +- bigframes/core/compile/googlesql/__init__.py | 61 ----- bigframes/core/compile/googlesql/abc.py | 25 -- bigframes/core/compile/googlesql/datatype.py | 23 -- .../core/compile/googlesql/expression.py | 124 ---------- bigframes/core/compile/googlesql/function.py | 32 --- bigframes/core/compile/googlesql/query.py | 231 ------------------ bigframes/core/compile/sqlglot/sqlglot_ir.py | 28 +++ bigframes/core/sql/__init__.py | 43 +++- bigframes/core/sql/ml.py | 20 +- bigframes/ml/compose.py | 2 +- bigframes/ml/sql.py | 2 +- bigframes/session/_io/bigquery/__init__.py | 4 +- bigframes/session/bigquery_session.py | 8 +- bigframes/session/bq_caching_executor.py | 2 +- tests/unit/core/compile/googlesql/__init__.py | 13 - .../core/compile/googlesql/test_expression.py | 37 --- .../core/compile/googlesql/test_function.py | 21 -- .../unit/core/compile/googlesql/test_query.py | 223 ----------------- 20 files changed, 96 insertions(+), 831 deletions(-) 
delete mode 100644 bigframes/core/compile/googlesql/__init__.py delete mode 100644 bigframes/core/compile/googlesql/abc.py delete mode 100644 bigframes/core/compile/googlesql/datatype.py delete mode 100644 bigframes/core/compile/googlesql/expression.py delete mode 100644 bigframes/core/compile/googlesql/function.py delete mode 100644 bigframes/core/compile/googlesql/query.py delete mode 100644 tests/unit/core/compile/googlesql/__init__.py delete mode 100644 tests/unit/core/compile/googlesql/test_expression.py delete mode 100644 tests/unit/core/compile/googlesql/test_function.py delete mode 100644 tests/unit/core/compile/googlesql/test_query.py diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index 295412fd759..e6ac1b9c27d 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -20,7 +20,7 @@ import google.cloud.bigquery -import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir +from bigframes.core.compile.sqlglot import sqlglot_ir import bigframes.dtypes import bigframes.operations import bigframes.series diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 5bd141a4062..3d89a597eea 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -23,12 +23,12 @@ import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.operations as ibis_ops import bigframes_vendored.ibis.expr.types as ibis_types +import bigframes_vendored.sqlglot.expressions as sge from google.cloud import bigquery import pyarrow as pa from bigframes.core import agg_expressions, rewrite import bigframes.core.agg_expressions as ex_types -import bigframes.core.compile.googlesql import bigframes.core.compile.ibis_compiler.aggregate_compiler as agg_compiler import bigframes.core.compile.ibis_compiler.scalar_op_compiler as op_compilers import bigframes.core.compile.ibis_types @@ -82,13 +82,21 @@ def to_sql( ) if 
order_by or limit or not is_noop_selection: - sql = ibis_bigquery.Backend().compile(ibis_table) - sql = ( - bigframes.core.compile.googlesql.Select() - .from_(sql) - .select(selection_strings) - .sql() - ) + # selections are (ref.id.sql, name) where ref.id.sql is escaped identifier + to_select = [ + sge.Alias( + this=sge.to_identifier(src, quoted=True), + alias=sge.to_identifier(alias, quoted=True), + ) + if src != alias + else sge.to_identifier(src, quoted=True) + for src, alias in selection_strings + ] + # Use string formatting for FROM clause to avoid re-parsing potentially complex SQL (like ARRAY>) + # that sqlglot might not handle perfectly when parsing BigQuery dialect strings. + select_sql = sge.Select().select(*to_select).sql(dialect="bigquery") + ibis_sql = ibis_bigquery.Backend().compile(ibis_table) + sql = f"{select_sql} FROM ({ibis_sql}) AS `t`" # Single row frames may not have any ordering columns if len(order_by) > 0: @@ -99,7 +107,7 @@ def to_sql( raise TypeError(f"Limit param: {limit} must be an int.") sql += f"\nLIMIT {limit}" else: - sql = ibis_bigquery.Backend().compile(self._to_ibis_expr()) + sql = ibis_bigquery.Backend().compile(ibis_table) return typing.cast(str, sql) @property diff --git a/bigframes/core/compile/googlesql/__init__.py b/bigframes/core/compile/googlesql/__init__.py deleted file mode 100644 index add0c5ec445..00000000000 --- a/bigframes/core/compile/googlesql/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Python classes representing GoogleSQL syntax nodes, adhering to the official syntax: -https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax""" - -from __future__ import annotations - -from bigframes.core.compile.googlesql.datatype import DataType -from bigframes.core.compile.googlesql.expression import ( - _escape_chars, - AliasExpression, - ColumnExpression, - CTEExpression, - identifier, - StarExpression, - TableExpression, -) -from bigframes.core.compile.googlesql.function import Cast -from bigframes.core.compile.googlesql.query import ( - AsAlias, - FromClause, - FromItem, - NonRecursiveCTE, - QueryExpr, - Select, - SelectAll, - SelectExpression, -) - -__all__ = [ - "_escape_chars", - "identifier", - "AliasExpression", - "AsAlias", - "Cast", - "ColumnExpression", - "CTEExpression", - "DataType", - "FromClause", - "FromItem", - "NonRecursiveCTE", - "QueryExpr", - "Select", - "SelectAll", - "SelectExpression", - "StarExpression", - "StringType", - "TableExpression", -] diff --git a/bigframes/core/compile/googlesql/abc.py b/bigframes/core/compile/googlesql/abc.py deleted file mode 100644 index 081836467c2..00000000000 --- a/bigframes/core/compile/googlesql/abc.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import abc - - -class SQLSyntax(abc.ABC): - """Abstract base class provides GoogleSQL syntax.""" - - @abc.abstractmethod - def sql(self): - ... diff --git a/bigframes/core/compile/googlesql/datatype.py b/bigframes/core/compile/googlesql/datatype.py deleted file mode 100644 index ccf3ff4d41b..00000000000 --- a/bigframes/core/compile/googlesql/datatype.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import enum - -"""This module represents all GoogleSQL for BigQuery data types: -https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types""" - - -class DataType(enum.Enum): - STRING = 1 - FLOAT64 = 2 diff --git a/bigframes/core/compile/googlesql/expression.py b/bigframes/core/compile/googlesql/expression.py deleted file mode 100644 index 581ab67718a..00000000000 --- a/bigframes/core/compile/googlesql/expression.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import dataclasses -import typing - -import bigframes.core.compile.googlesql.abc as abc - -"""This module represents GoogleSQL `expression` and its extensions. -Core class: - -* `expression`: Models basic SQL expressions. - -Extended classes (not part of standard GoogleSQL syntax, but added for convenience): - -* `ColumnExpression`: Represents column references. -* `TableExpression`: Represents table references. -* `AliasExpression`: Represents aliased expressions. -* ... -""" - - -@dataclasses.dataclass -class Expression(abc.SQLSyntax): - pass - - -@dataclasses.dataclass -class ColumnExpression(Expression): - name: str - parent: typing.Optional[TableExpression | AliasExpression | CTEExpression] = None - - def sql(self) -> str: - if self.parent is not None: - return f"{self.parent.sql()}.{identifier(self.name)}" - return identifier(self.name) - - -@dataclasses.dataclass -class StarExpression(Expression): - parent: typing.Optional[TableExpression | AliasExpression | CTEExpression] = None - - def sql(self) -> str: - if self.parent is not None: - return f"{self.parent.sql()}.*" - return "*" - - -@dataclasses.dataclass -class TableExpression(Expression): - table_id: str - dataset_id: typing.Optional[str] = None - project_id: typing.Optional[str] = None - - def __post_init__(self): - if self.project_id is not None and self.dataset_id is None: - raise ValueError("The `dataset_id` is missing.") - - def sql(self) -> str: - text = [] - if self.project_id is not None: - text.append(identifier(self.project_id)) - if self.dataset_id is not None: - text.append(identifier(self.dataset_id)) - text.append(identifier(self.table_id)) - return ".".join(text) - - -@dataclasses.dataclass -class AliasExpression(Expression): - alias: str - - def sql(self) -> str: - return identifier(self.alias) - - -@dataclasses.dataclass -class 
CTEExpression(Expression): - name: str - - def sql(self) -> str: - return identifier(self.name) - - -def identifier(id: str) -> str: - """Return a string representing column reference in a SQL.""" - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers - # Just always escape, otherwise need to check against every reserved sql keyword - return f"`{_escape_chars(id)}`" - - -def _escape_chars(value: str): - """Escapes all special charactesrs""" - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals - trans_table = str.maketrans( - { - "\a": r"\a", - "\b": r"\b", - "\f": r"\f", - "\n": r"\n", - "\r": r"\r", - "\t": r"\t", - "\v": r"\v", - "\\": r"\\", - "?": r"\?", - '"': r"\"", - "'": r"\'", - "`": r"\`", - } - ) - return value.translate(trans_table) diff --git a/bigframes/core/compile/googlesql/function.py b/bigframes/core/compile/googlesql/function.py deleted file mode 100644 index 19b61f2fc99..00000000000 --- a/bigframes/core/compile/googlesql/function.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import dataclasses - -import bigframes.core.compile.googlesql.datatype as datatype -import bigframes.core.compile.googlesql.expression as expr - -# Conversion functions: -# https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions - - -@dataclasses.dataclass -class Cast(expr.Expression): - """This class represents the `cast` function.""" - - expression: expr.ColumnExpression - type: datatype.DataType - - def sql(self) -> str: - return f"CAST ({self.expression.sql()} AS {self.type.name})" diff --git a/bigframes/core/compile/googlesql/query.py b/bigframes/core/compile/googlesql/query.py deleted file mode 100644 index f591216b3a8..00000000000 --- a/bigframes/core/compile/googlesql/query.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import dataclasses -import typing - -import google.cloud.bigquery as bigquery - -import bigframes.core.compile.googlesql.abc as abc -import bigframes.core.compile.googlesql.expression as expr - -"""This module provides a structured representation of GoogleSQL syntax using nodes. 
-Each node's name and child nodes are designed to strictly follow the official GoogleSQL -syntax rules outlined in the documentation: -https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax""" - -TABLE_SOURCE_TYPE = typing.Union[str, bigquery.TableReference] - - -@dataclasses.dataclass -class QueryExpr(abc.SQLSyntax): - """This class represents GoogleSQL `query_expr` syntax.""" - - select: Select - with_cte_list: typing.Sequence[NonRecursiveCTE] = () - - def sql(self) -> str: - text = [] - if len(self.with_cte_list) > 0: - with_cte_text = ",\n".join( - [with_cte.sql() for with_cte in self.with_cte_list] - ) - text.append(f"WITH {with_cte_text}") - - text.append(self.select.sql()) - return "\n".join(text) - - -@dataclasses.dataclass -class Select(abc.SQLSyntax): - """This class represents GoogleSQL `select` syntax.""" - - select_list: typing.Sequence[ - typing.Union[SelectExpression, SelectAll] - ] = dataclasses.field(default_factory=list) - from_clause_list: typing.Sequence[FromClause] = dataclasses.field( - default_factory=list - ) - distinct: bool = False - - def select( - self, - columns: typing.Union[ - typing.Iterable[str], typing.Iterable[tuple[str, str]], str, None - ] = None, - distinct: bool = False, - ) -> Select: - if isinstance(columns, str): - columns = [columns] - self.select_list: typing.List[typing.Union[SelectExpression, SelectAll]] = ( - [self._select_field(column) for column in columns] - if columns - else [SelectAll(expression=expr.StarExpression())] - ) - self.distinct = distinct - return self - - def _select_field(self, field) -> SelectExpression: - if isinstance(field, str): - return SelectExpression(expression=expr.ColumnExpression(name=field)) - - else: - alias = ( - expr.AliasExpression(field[1]) - if isinstance(field[1], str) - else field[1] - if (field[0] != field[1]) - else None - ) - return SelectExpression( - expression=expr.ColumnExpression(name=field[0]), alias=alias - ) - - def from_( - self, - sources: 
typing.Union[TABLE_SOURCE_TYPE, typing.Iterable[TABLE_SOURCE_TYPE]], - ) -> Select: - if (not isinstance(sources, typing.Iterable)) or isinstance(sources, str): - sources = [sources] - self.from_clause_list = [ - FromClause(FromItem.from_source(source)) for source in sources - ] - return self - - def sql(self) -> str: - if (self.select_list is not None) and (not self.select_list): - raise ValueError("Select clause has not been properly initialized.") - - text = ["SELECT"] - - if self.distinct: - text.append("DISTINCT") - - select_list_sql = ",\n".join([select.sql() for select in self.select_list]) - text.append(select_list_sql) - - if self.from_clause_list: - from_clauses_sql = ",\n".join( - [clause.sql() for clause in self.from_clause_list] - ) - text.append(f"FROM\n{from_clauses_sql}") - return "\n".join(text) - - -@dataclasses.dataclass(frozen=True) -class SelectExpression(abc.SQLSyntax): - """This class represents `select_expression`.""" - - expression: expr.ColumnExpression - alias: typing.Optional[expr.AliasExpression] = None - - def sql(self) -> str: - if self.alias is None: - return self.expression.sql() - else: - return f"{self.expression.sql()} AS {self.alias.sql()}" - - -@dataclasses.dataclass -class SelectAll(abc.SQLSyntax): - """This class represents `select_all` (aka. `SELECT *`).""" - - expression: expr.StarExpression - - def sql(self) -> str: - return self.expression.sql() - - -@dataclasses.dataclass -class FromClause(abc.SQLSyntax): - """This class represents GoogleSQL `from_clause` syntax.""" - - from_item: FromItem - - def sql(self) -> str: - return self.from_item.sql() - - -@dataclasses.dataclass -class FromItem(abc.SQLSyntax): - """This class represents GoogleSQL `from_item` syntax.""" - - # Note: Temporarily introduces the `str` type to interact with pre-existing, - # compiled SQL strings. 
- expression: typing.Union[expr.TableExpression, QueryExpr, str, expr.CTEExpression] - as_alias: typing.Optional[AsAlias] = None - - @classmethod - def from_source( - cls, - subquery_or_tableref: typing.Union[bigquery.TableReference, str], - as_alias: typing.Optional[AsAlias] = None, - ): - if isinstance(subquery_or_tableref, bigquery.TableReference): - return cls( - expression=expr.TableExpression( - table_id=subquery_or_tableref.table_id, - dataset_id=subquery_or_tableref.dataset_id, - project_id=subquery_or_tableref.project, - ), - as_alias=as_alias, - ) - elif isinstance(subquery_or_tableref, str): - return cls( - expression=subquery_or_tableref, - as_alias=as_alias, - ) - else: - raise ValueError("The source must be bigquery.TableReference or str.") - - def sql(self) -> str: - if isinstance(self.expression, (expr.TableExpression, expr.CTEExpression)): - text = self.expression.sql() - elif isinstance(self.expression, str): - text = f"({self.expression})" - elif isinstance(self.expression, QueryExpr): - text = f"({self.expression.sql()})" - else: - raise ValueError( - f"Unsupported expression type {type(self.expression).__name__};" - "expected one of TableExpression, QueryExpr, str, or CTEExpression." 
- ) - - if self.as_alias is None: - return text - else: - return f"{text} {self.as_alias.sql()}" - - -@dataclasses.dataclass -class NonRecursiveCTE(abc.SQLSyntax): - """This class represents GoogleSQL `non_recursive_cte` syntax.""" - - cte_name: expr.CTEExpression - query_expr: QueryExpr - - def sql(self) -> str: - return f"{self.cte_name.sql()} AS (\n{self.query_expr.sql()}\n)" - - -@dataclasses.dataclass -class AsAlias(abc.SQLSyntax): - """This class represents GoogleSQL `as_alias` syntax.""" - - alias: expr.AliasExpression - - def sql(self) -> str: - return f"AS {self.alias.sql()}" diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 88d01c2a9e6..94ffa39dae5 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -598,6 +598,34 @@ def _select_to_cte(self) -> tuple[sge.Select, sge.Identifier]: return new_select_expr, cte_name +def identifier(id: str) -> str: + """Return a string representing column reference in a SQL.""" + return sge.to_identifier(id, quoted=SQLGlotIR.quoted).sql(dialect=SQLGlotIR.dialect) + + +def _escape_chars(value: str): + """Escapes all special characters""" + # TODO: Reuse _literal's escaping logic instead of re-implementing it here. 
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + trans_table = str.maketrans( + { + "\a": r"\a", + "\b": r"\b", + "\f": r"\f", + "\n": r"\n", + "\r": r"\r", + "\t": r"\t", + "\v": r"\v", + "\\": r"\\", + "?": r"\?", + '"': r"\"", + "'": r"\'", + "`": r"\`", + } + ) + return value.translate(trans_table) + + def _is_null_literal(expr: sge.Expression) -> bool: """Checks if the given expression is a NULL literal.""" if isinstance(expr, sge.Null): diff --git a/bigframes/core/sql/__init__.py b/bigframes/core/sql/__init__.py index ccd2a16ddcd..b025ca07c27 100644 --- a/bigframes/core/sql/__init__.py +++ b/bigframes/core/sql/__init__.py @@ -23,9 +23,10 @@ import math from typing import cast, Collection, Iterable, Mapping, Optional, TYPE_CHECKING, Union +import bigframes_vendored.sqlglot.expressions as sge import shapely.geometry.base # type: ignore -import bigframes.core.compile.googlesql as googlesql +from bigframes.core.compile.sqlglot import sqlglot_ir if TYPE_CHECKING: import google.cloud.bigquery as bigquery @@ -65,7 +66,7 @@ def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str: return "NULL" elif isinstance(value, str): # Single quoting seems to work nicer with ibis than double quoting - return f"'{googlesql._escape_chars(value)}'" + return f"'{sqlglot_ir._escape_chars(value)}'" elif isinstance(value, bytes): return repr(value) elif isinstance(value, (bool, int)): @@ -110,15 +111,15 @@ def multi_literal(*values: str): def cast_as_string(column_name: str) -> str: """Return a string representing string casting of a column.""" - return googlesql.Cast( - googlesql.ColumnExpression(column_name), googlesql.DataType.STRING - ).sql() + return sge.Cast(this=sge.to_identifier(column_name, quoted=True), to="STRING").sql( + dialect="bigquery" + ) def to_json_string(column_name: str) -> str: """Return a string representing JSON version of a column.""" - return f"TO_JSON_STRING({googlesql.identifier(column_name)})" 
+ return f"TO_JSON_STRING({sqlglot_ir.identifier(column_name)})" def csv(values: Iterable[str]) -> str: @@ -132,11 +133,29 @@ def infix_op(opname: str, left_arg: str, right_arg: str): def is_distinct_sql(columns: Iterable[str], table_ref: bigquery.TableReference) -> str: + table_expr = sge.Table( + this=sge.Identifier(this=table_ref.table_id, quoted=True), + db=sge.Identifier(this=table_ref.dataset_id, quoted=True), + catalog=sge.Identifier(this=table_ref.project, quoted=True), + ) + to_select = [sge.to_identifier(col, quoted=True) for col in columns] + + full_table_sql = ( + sge.Select().select(*to_select).from_(table_expr).sql(dialect="bigquery") + ) + distinct_table_sql = ( + sge.Select() + .select(*to_select) + .distinct() + .from_(table_expr) + .sql(dialect="bigquery") + ) + is_unique_sql = f"""WITH full_table AS ( - {googlesql.Select().from_(table_ref).select(columns).sql()} + {full_table_sql} ), distinct_table AS ( - {googlesql.Select().from_(table_ref).select(columns, distinct=True).sql()} + {distinct_table_sql} ) SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, @@ -183,7 +202,7 @@ def create_vector_index_ddl( if len(stored_column_names) > 0: escaped_stored = [ - f"{googlesql.identifier(name)}" for name in stored_column_names + f"{sqlglot_ir.identifier(name)}" for name in stored_column_names ] storing = f"STORING({', '.join(escaped_stored)}) " else: @@ -197,8 +216,8 @@ def create_vector_index_ddl( ) return f""" - {create} {googlesql.identifier(index_name)} - ON {googlesql.identifier(table_name)}({googlesql.identifier(column_name)}) + {create} {sqlglot_ir.identifier(index_name)} + ON {sqlglot_ir.identifier(table_name)}({sqlglot_ir.identifier(column_name)}) {storing} OPTIONS({rendered_options}); """ @@ -217,7 +236,7 @@ def create_vector_search_sql( """Encode the VECTOR SEARCH statement for BigQuery Vector Search.""" vector_search_args = [ - f"TABLE {googlesql.identifier(cast(str, base_table))}", + f"TABLE {sqlglot_ir.identifier(cast(str, 
base_table))}", f"{simple_literal(column_to_search)}", f"({sql_string})", ] diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index a2a4d32ae84..38d66ab9a56 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -16,7 +16,7 @@ from typing import Any, Dict, List, Mapping, Optional, Union -import bigframes.core.compile.googlesql as googlesql +from bigframes.core.compile.sqlglot import sqlglot_ir import bigframes.core.sql import bigframes.core.sql.literals @@ -46,7 +46,7 @@ def create_model_ddl( else: create = "CREATE MODEL " - ddl = f"{create}{googlesql.identifier(model_name)}\n" + ddl = f"{create}{sqlglot_ir.identifier(model_name)}\n" # [TRANSFORM (select_list)] if transform: @@ -66,7 +66,7 @@ def create_model_ddl( if connection_name.upper() == "DEFAULT": ddl += "REMOTE WITH CONNECTION DEFAULT\n" else: - ddl += f"REMOTE WITH CONNECTION {googlesql.identifier(connection_name)}\n" + ddl += f"REMOTE WITH CONNECTION {sqlglot_ir.identifier(connection_name)}\n" # [OPTIONS(model_option_list)] if options: @@ -130,7 +130,7 @@ def evaluate( if confidence_level is not None: struct_options["confidence_level"] = confidence_level - sql = f"SELECT * FROM ML.EVALUATE(MODEL {googlesql.identifier(model_name)}" + sql = f"SELECT * FROM ML.EVALUATE(MODEL {sqlglot_ir.identifier(model_name)}" if table: sql += f", ({table})" @@ -159,7 +159,7 @@ def predict( struct_options["trial_id"] = trial_id sql = ( - f"SELECT * FROM ML.PREDICT(MODEL {googlesql.identifier(model_name)}, ({table})" + f"SELECT * FROM ML.PREDICT(MODEL {sqlglot_ir.identifier(model_name)}, ({table})" ) sql += _build_struct_sql(struct_options) sql += ")\n" @@ -190,7 +190,7 @@ def explain_predict( if approx_feature_contrib is not None: struct_options["approx_feature_contrib"] = approx_feature_contrib - sql = f"SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {googlesql.identifier(model_name)}, ({table})" + sql = f"SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {sqlglot_ir.identifier(model_name)}, ({table})" sql += 
_build_struct_sql(struct_options) sql += ")\n" return sql @@ -208,7 +208,7 @@ def global_explain( if class_level_explain is not None: struct_options["class_level_explain"] = class_level_explain - sql = f"SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL {googlesql.identifier(model_name)}" + sql = f"SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL {sqlglot_ir.identifier(model_name)}" sql += _build_struct_sql(struct_options) sql += ")\n" return sql @@ -221,7 +221,7 @@ def transform( """Encode the ML.TRANSFORM statement. See https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-transform for reference. """ - sql = f"SELECT * FROM ML.TRANSFORM(MODEL {googlesql.identifier(model_name)}, ({table}))\n" + sql = f"SELECT * FROM ML.TRANSFORM(MODEL {sqlglot_ir.identifier(model_name)}, ({table}))\n" return sql @@ -262,7 +262,7 @@ def generate_text( if request_type is not None: struct_options["request_type"] = request_type - sql = f"SELECT * FROM ML.GENERATE_TEXT(MODEL {googlesql.identifier(model_name)}, ({table})" + sql = f"SELECT * FROM ML.GENERATE_TEXT(MODEL {sqlglot_ir.identifier(model_name)}, ({table})" sql += _build_struct_sql(struct_options) sql += ")\n" return sql @@ -290,7 +290,7 @@ def generate_embedding( if output_dimensionality is not None: struct_options["output_dimensionality"] = output_dimensionality - sql = f"SELECT * FROM ML.GENERATE_EMBEDDING(MODEL {googlesql.identifier(model_name)}, ({table})" + sql = f"SELECT * FROM ML.GENERATE_EMBEDDING(MODEL {sqlglot_ir.identifier(model_name)}, ({table})" sql += _build_struct_sql(struct_options) sql += ")\n" return sql diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index f8244fb0d81..9413cd06954 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -27,7 +27,7 @@ import bigframes_vendored.sklearn.compose._column_transformer from google.cloud import bigquery -import bigframes.core.compile.googlesql as sql_utils +import bigframes.core.compile.sqlglot.sqlglot_ir as sql_utils from 
bigframes.core.logging import log_adapter import bigframes.core.utils as core_utils from bigframes.ml import base, core, globals, impute, preprocessing, utils diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 09a46b235d9..d90d23a4747 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -21,7 +21,7 @@ import bigframes_vendored.constants as constants import google.cloud.bigquery -import bigframes.core.compile.googlesql as sql_utils +import bigframes.core.compile.sqlglot.sqlglot_ir as sql_utils import bigframes.core.sql as sql_vals INDENT_STR = " " diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 98b5f194c74..1d1dc57c30e 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -32,7 +32,7 @@ import google.cloud.bigquery._job_helpers import google.cloud.bigquery.table -import bigframes.core.compile.googlesql as googlesql +from bigframes.core.compile.sqlglot import sqlglot_ir import bigframes.core.events from bigframes.core.logging import log_adapter import bigframes.core.sql @@ -599,7 +599,7 @@ def compile_filters(filters: third_party_pandas_gbq.FiltersType) -> str: operator_str = valid_operators[operator] - column_ref = googlesql.identifier(column) + column_ref = sqlglot_ir.identifier(column) if operator_str in ["IN", "NOT IN"]: value_literal = bigframes.core.sql.multi_literal(*value) else: diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py index 99c13007d85..1a38bca1e82 100644 --- a/bigframes/session/bigquery_session.py +++ b/bigframes/session/bigquery_session.py @@ -24,7 +24,7 @@ import bigframes_vendored.ibis.backends.bigquery.datatypes as ibis_bq import google.cloud.bigquery as bigquery -from bigframes.core.compile import googlesql +from bigframes.core.compile.sqlglot import sqlglot_ir import bigframes.core.events from bigframes.session import temporary_storage import bigframes.session._io.bigquery 
as bfbqio @@ -80,7 +80,7 @@ def create_temp_table( ibis_schema = ibis_bq.BigQuerySchema.to_ibis(list(schema)) fields = [ - f"{googlesql.identifier(name)} {ibis_bq.BigQueryType.from_ibis(ibis_type)}" + f"{sqlglot_ir.identifier(name)} {ibis_bq.BigQueryType.from_ibis(ibis_type)}" for name, ibis_type in ibis_schema.fields.items() ] fields_string = ",".join(fields) @@ -88,12 +88,12 @@ def create_temp_table( cluster_string = "" if cluster_cols: cluster_cols_sql = ", ".join( - f"{googlesql.identifier(cluster_col)}" + f"{sqlglot_ir.identifier(cluster_col)}" for cluster_col in cluster_cols ) cluster_string = f"\nCLUSTER BY {cluster_cols_sql}" - ddl = f"CREATE TEMP TABLE `_SESSION`.{googlesql.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" + ddl = f"CREATE TEMP TABLE `_SESSION`.{sqlglot_ir.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" _, job = bfbqio.start_query_with_client( self.bqclient, diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index bcb3ae60e3b..a5c01765655 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -30,7 +30,7 @@ import bigframes.constants import bigframes.core from bigframes.core import bq_data, compile, local_data, rewrite -import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir +from bigframes.core.compile.sqlglot import sqlglot_ir import bigframes.core.events import bigframes.core.guid import bigframes.core.identifiers diff --git a/tests/unit/core/compile/googlesql/__init__.py b/tests/unit/core/compile/googlesql/__init__.py deleted file mode 100644 index 6d5e14bcf4a..00000000000 --- a/tests/unit/core/compile/googlesql/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/unit/core/compile/googlesql/test_expression.py b/tests/unit/core/compile/googlesql/test_expression.py deleted file mode 100644 index e72598b1760..00000000000 --- a/tests/unit/core/compile/googlesql/test_expression.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -import bigframes.core.compile.googlesql as sql - - -@pytest.mark.parametrize( - ("table_id", "dataset_id", "project_id", "expected"), - [ - pytest.param("a", None, None, "`a`"), - pytest.param("a", "b", None, "`b`.`a`"), - pytest.param("a", "b", "c", "`c`.`b`.`a`"), - pytest.param("a", None, "c", None, marks=pytest.mark.xfail(raises=ValueError)), - ], -) -def test_table_expression(table_id, dataset_id, project_id, expected): - expr = sql.TableExpression( - table_id=table_id, dataset_id=dataset_id, project_id=project_id - ) - assert expr.sql() == expected - - -def test_escape_chars(): - assert sql._escape_chars("\a\b\f\n\r\t\v\\?'\"`") == r"\a\b\f\n\r\t\v\\\?\'\"\`" diff --git a/tests/unit/core/compile/googlesql/test_function.py b/tests/unit/core/compile/googlesql/test_function.py deleted file mode 100644 index 4edfda6f345..00000000000 --- a/tests/unit/core/compile/googlesql/test_function.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import bigframes.core.compile.googlesql as sql - - -def test_cast(): - col = sql.ColumnExpression("col") - assert sql.Cast(col, sql.DataType.STRING).sql() == "CAST (`col` AS STRING)" - assert sql.Cast(col, sql.DataType.FLOAT64).sql() == "CAST (`col` AS FLOAT64)" diff --git a/tests/unit/core/compile/googlesql/test_query.py b/tests/unit/core/compile/googlesql/test_query.py deleted file mode 100644 index b8d1d024e2b..00000000000 --- a/tests/unit/core/compile/googlesql/test_query.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from unittest.mock import MagicMock - -import google.cloud.bigquery as bigquery -import pytest - -import bigframes.core.compile.googlesql as sql - - -@pytest.mark.parametrize( - ("table_id", "dataset_id", "project_id", "expected"), - [ - pytest.param("a", None, None, "`a`"), - pytest.param("a", "b", None, "`b`.`a`"), - pytest.param("a", "b", "c", "`c`.`b`.`a`"), - pytest.param("a", None, "c", None, marks=pytest.mark.xfail(raises=ValueError)), - ], -) -def test_table_expression(table_id, dataset_id, project_id, expected): - expr = sql.TableExpression( - table_id=table_id, dataset_id=dataset_id, project_id=project_id - ) - assert expr.sql() == expected - - -@pytest.mark.parametrize( - ("table_name", "alias", "expected"), - [ - pytest.param("a", None, "`a`"), - pytest.param("a", "aa", "`a` AS `aa`"), - ], -) -def test_from_item_w_table_name(table_name, alias, expected): - expr = sql.FromItem( - sql.TableExpression(table_id=table_name), - as_alias=None - if alias is None - else sql.AsAlias(sql.AliasExpression(alias=alias)), - ) - assert expr.sql() == expected - - -def test_from_item_w_query_expr(): - from_clause = sql.FromClause( - sql.FromItem(expression=sql.TableExpression(table_id="table_a")) - ) - select = sql.Select( - select_list=[sql.SelectAll(sql.StarExpression())], - from_clause_list=[from_clause], - ) - query_expr = sql.QueryExpr(select=select) - expected = "SELECT\n*\nFROM\n`table_a`" - - # A QueryExpr object - expr = sql.FromItem(expression=query_expr) - assert expr.sql() == f"({expected})" - - # A str object - expr = sql.FromItem(expression=expected) - assert expr.sql() == f"({expected})" - - -def test_from_item_w_cte(): - expr = sql.FromItem(expression=sql.CTEExpression("test")) - assert expr.sql() == "`test`" - - -def test_from_item_w_table_ref(): - mock_table_ref = MagicMock(spec=bigquery.TableReference) - mock_table_ref.table_id = "mock_table" - mock_table_ref.dataset_id = "mock_dataset" - mock_table_ref.project = "mock_project" - - from_item = 
sql.FromItem.from_source(mock_table_ref) - - assert from_item.sql() == "`mock_project`.`mock_dataset`.`mock_table`" - - -@pytest.mark.parametrize( - ("col_name", "alias", "expected"), - [ - pytest.param("a", None, "`a`"), - pytest.param("a", "aa", "`a` AS `aa`"), - ], -) -def test_select_expression(col_name, alias, expected): - expr = sql.SelectExpression( - expression=sql.ColumnExpression(col_name), - alias=None if alias is None else sql.AliasExpression(alias=alias), - ) - assert expr.sql() == expected - - -def test_select(): - select_1 = sql.SelectExpression(expression=sql.ColumnExpression("a")) - select_2 = sql.SelectExpression( - expression=sql.ColumnExpression("b"), alias=sql.AliasExpression(alias="bb") - ) - from_1 = sql.FromItem(expression=sql.TableExpression(table_id="table_a")) - from_2 = sql.FromItem( - expression="SELECT * FROM project.table_b", - as_alias=sql.AsAlias(sql.AliasExpression(alias="table_b")), - ) - expr = sql.Select( - select_list=[select_1, select_2], - from_clause_list=[sql.FromClause(from_1), sql.FromClause(from_2)], - ) - expected = "SELECT\n`a`,\n`b` AS `bb`\nFROM\n`table_a`,\n(SELECT * FROM project.table_b) AS `table_b`" - - assert expr.sql() == expected - - -@pytest.mark.parametrize( - "columns, source, expected", - [ - ( - ["a", "b", "c"], - "select * from test", - "SELECT\nDISTINCT\n`a`,\n`b`,\n`c`\nFROM\n(select * from test)", - ), - ( - "a", - "select * from test", - "SELECT\nDISTINCT\n`a`\nFROM\n(select * from test)", - ), - ], -) -def test_select_from_str(columns, source, expected): - expr = sql.Select().from_(source).select(columns, distinct=True) - assert expr.sql() == expected - - -@pytest.mark.parametrize( - ("columns", "distinct", "expected"), - [ - pytest.param( - ["a", "b", "c"], - True, - "SELECT\nDISTINCT\n`a`,\n`b`,\n`c`\nFROM\n`mock_project`.`mock_dataset`.`mock_table`", - ), - pytest.param( - None, - True, - "SELECT\nDISTINCT\n*\nFROM\n`mock_project`.`mock_dataset`.`mock_table`", - ), - pytest.param( - None, False, 
"SELECT\n*\nFROM\n`mock_project`.`mock_dataset`.`mock_table`" - ), - ], -) -def test_select_from_table_ref(columns, distinct, expected): - mock_table_ref = MagicMock(spec=bigquery.TableReference) - mock_table_ref.table_id = "mock_table" - mock_table_ref.dataset_id = "mock_dataset" - mock_table_ref.project = "mock_project" - - expr = sql.Select().from_(mock_table_ref).select(columns, distinct=distinct) - assert expr.sql() == expected - - -def test_query_expr_w_cte(): - # Test a simple SELECT query. - from_clause1 = sql.FromClause( - sql.FromItem(expression=sql.TableExpression(table_id="table_a")) - ) - select1 = sql.Select( - select_list=[sql.SelectAll(sql.StarExpression())], - from_clause_list=[from_clause1], - ) - query1 = sql.QueryExpr(select=select1) - query1_sql = "SELECT\n*\nFROM\n`table_a`" - assert query1.sql() == query1_sql - - # Test a query with CTE statements. - cte1 = sql.NonRecursiveCTE(cte_name=sql.CTEExpression("a"), query_expr=query1) - cte2 = sql.NonRecursiveCTE(cte_name=sql.CTEExpression("b"), query_expr=query1) - - cte1_sql = f"`a` AS (\n{query1_sql}\n)" - cte2_sql = f"`b` AS (\n{query1_sql}\n)" - assert cte1.sql() == cte1_sql - assert cte2.sql() == cte2_sql - - with_cte_list = [cte1, cte2] - select2 = sql.Select( - select_list=[ - sql.SelectExpression( - sql.ColumnExpression(parent=cte1.cte_name, name="column_x") - ), - sql.SelectAll(sql.StarExpression(parent=cte2.cte_name)), - ], - from_clause_list=[ - sql.FromClause(sql.FromItem(expression=cte1.cte_name)), - sql.FromClause(sql.FromItem(expression=cte2.cte_name)), - ], - distinct=True, - ) - select2_sql = "SELECT\nDISTINCT\n`a`.`column_x`,\n`b`.*\nFROM\n`a`,\n`b`" - assert select2.sql() == select2_sql - - query2 = sql.QueryExpr(select=select2, with_cte_list=with_cte_list) - query2_sql = f"WITH {cte1_sql},\n{cte2_sql}\n{select2_sql}" - assert query2.sql() == query2_sql - - -def test_identifier(): - assert sql.identifier("\aa") == r"`\aa`" - - -def test_escape_chars(): - assert 
sql._escape_chars("\a\b\f\n\r\t\v\\?'\"`") == r"\a\b\f\n\r\t\v\\\?\'\"\`" From a396f684dd7924de8b803afadb5fb948fc8a936a Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 2 Mar 2026 13:03:32 -0800 Subject: [PATCH 25/29] chore: clean up remaining python 3.9 support codes (#2486) This change is cleaning up remaining Python 3.9 support codes. --- bigframes/series.py | 1 - ...q_dataframes_ml_drug_name_generation.ipynb | 24 ++++++------ noxfile.py | 4 +- samples/polars/noxfile.py | 3 +- samples/snippets/noxfile.py | 3 +- testing/constraints-3.10.txt | 3 +- testing/constraints-3.9.txt | 37 ------------------- 7 files changed, 16 insertions(+), 59 deletions(-) delete mode 100644 testing/constraints-3.9.txt diff --git a/bigframes/series.py b/bigframes/series.py index cd564e5c911..2d0b13b4700 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -448,7 +448,6 @@ def rename( # rename the Series name if isinstance(index, typing.Hashable): - # Python 3.9 doesn't allow isinstance of Optional index = typing.cast(Optional[str], index) block = self._block.with_column_labels([index]) diff --git a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb index 3220bbf6cdb..e51338c2e8f 100644 --- a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb @@ -58,17 +58,6 @@ "" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "24743cf4a1e1" - }, - "source": [ - "**_NOTE_**: This notebook has been tested in the following environment:\n", - "\n", - "* Python version = 3.9" - ] - }, { "cell_type": "markdown", "metadata": { @@ -1579,12 +1568,21 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", + "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", "name": "python", - "version": "3.10.9" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" } }, "nbformat": 4, diff --git a/noxfile.py b/noxfile.py index 671aae13d22..76400671af3 100644 --- a/noxfile.py +++ b/noxfile.py @@ -56,7 +56,7 @@ DEFAULT_PYTHON_VERSION = "3.14" -UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] +UNIT_TEST_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.14"] UNIT_TEST_STANDARD_DEPENDENCIES = [ "mock", PYTEST_VERSION, @@ -78,7 +78,7 @@ # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. For more information, search # bigframes/windows-docker, internally. -SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.14"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", diff --git a/samples/polars/noxfile.py b/samples/polars/noxfile.py index 782da043299..63e742993f9 100644 --- a/samples/polars/noxfile.py +++ b/samples/polars/noxfile.py @@ -86,9 +86,8 @@ def get_pytest_env_vars() -> Dict[str, str]: return ret -# DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] +ALL_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index 782da043299..63e742993f9 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -86,9 +86,8 @@ def get_pytest_env_vars() -> Dict[str, str]: return ret -# DO NOT EDIT - automatically generated. # All versions used to test samples. 
-ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] +ALL_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.14"] # Any default versions that should be ignored. IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index 2414bc546b5..137710df329 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -1,5 +1,4 @@ -# When we drop Python 3.9, -# please keep these in sync with the minimum versions in setup.py +# Please keep these in sync with the minimum versions in setup.py cloudpickle==2.0.0 fsspec==2023.3.0 gcsfs==2023.3.0 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt deleted file mode 100644 index 8e4ade29c74..00000000000 --- a/testing/constraints-3.9.txt +++ /dev/null @@ -1,37 +0,0 @@ -# please keep these in sync with the minimum versions in setup.py -cloudpickle==2.0.0 -fsspec==2023.3.0 -gcsfs==2023.3.0 -geopandas==0.12.2 -google-auth==2.15.0 -google-cloud-bigtable==2.24.0 -google-cloud-pubsub==2.21.4 -google-cloud-bigquery==3.36.0 -google-cloud-functions==1.12.0 -google-cloud-bigquery-connection==1.12.0 -google-cloud-iam==2.12.1 -google-cloud-resource-manager==1.10.3 -google-cloud-storage==2.0.0 -grpc-google-iam-v1==0.14.2 -numpy==1.24.0 -pandas==1.5.3 -pandas-gbq==0.26.1 -pyarrow==15.0.2 -pydata-google-auth==1.8.2 -requests==2.27.1 -scikit-learn==1.2.2 -shapely==1.8.5 -tabulate==0.9 -humanize==4.6.0 -matplotlib==3.7.1 -db-dtypes==1.4.2 -# For vendored ibis-framework. 
-atpublic==2.3 -python-dateutil==2.8.2 -pytz==2022.7 -toolz==0.11 -typing-extensions==4.5.0 -rich==12.4.4 -# For anywidget mode -anywidget>=0.9.18 -traitlets==5.0.0 From cb00daabce49f067be8e16627166dda00d5d8134 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 2 Mar 2026 13:04:28 -0800 Subject: [PATCH 26/29] feat: Support pd.col simple aggregates (#2480) --- bigframes/core/col.py | 29 ++++++++++++++++++++++++ tests/unit/test_col.py | 51 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/bigframes/core/col.py b/bigframes/core/col.py index 60b24d5e837..d00d61365a9 100644 --- a/bigframes/core/col.py +++ b/bigframes/core/col.py @@ -18,8 +18,10 @@ import bigframes_vendored.pandas.core.col as pd_col +from bigframes.core import agg_expressions, window_spec import bigframes.core.expression as bf_expression import bigframes.operations as bf_ops +import bigframes.operations.aggregations as agg_ops # Not to be confused with the Expression class in `bigframes.core.expressions` @@ -33,6 +35,15 @@ class Expression: def _apply_unary(self, op: bf_ops.UnaryOp) -> Expression: return Expression(op.as_expr(self._value)) + def _apply_unary_agg(self, op: agg_ops.UnaryAggregateOp) -> Expression: + # We probably shouldn't need to windowize here, but block apis expect pre-windowized expressions + # Later on, we will probably have col expressions in windowed context, so will need to defer windowization + # instead of automatically applying the default unbound window + agg_expr = op.as_expr(self._value) + return Expression( + agg_expressions.WindowExpression(agg_expr, window_spec.unbound()) + ) + def _apply_binary(self, other: Any, op: bf_ops.BinaryOp, reverse: bool = False): if isinstance(other, Expression): other_value = other._value @@ -118,6 +129,24 @@ def __rxor__(self, other: Any) -> Expression: def __invert__(self) -> Expression: return self._apply_unary(bf_ops.invert_op) + def sum(self) -> Expression: + return 
self._apply_unary_agg(agg_ops.sum_op) + + def mean(self) -> Expression: + return self._apply_unary_agg(agg_ops.mean_op) + + def var(self) -> Expression: + return self._apply_unary_agg(agg_ops.var_op) + + def std(self) -> Expression: + return self._apply_unary_agg(agg_ops.std_op) + + def min(self) -> Expression: + return self._apply_unary_agg(agg_ops.min_op) + + def max(self) -> Expression: + return self._apply_unary_agg(agg_ops.max_op) + def col(col_name: Hashable) -> Expression: return Expression(bf_expression.free_var(col_name)) diff --git a/tests/unit/test_col.py b/tests/unit/test_col.py index 9c9088e037c..c7a7eaa326c 100644 --- a/tests/unit/test_col.py +++ b/tests/unit/test_col.py @@ -100,6 +100,57 @@ def test_pd_col_unary_operators(scalars_dfs, op): assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.sum()), + (lambda x: x.mean()), + (lambda x: x.min()), + (lambda x: x.max()), + (lambda x: x.std()), + (lambda x: x.var()), + ], + ids=[ + "sum", + "mean", + "min", + "max", + "std", + "var", + ], +) +def test_pd_col_aggregate_op(scalars_dfs, op): + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "result": op(bpd.col("float64_col")), + } + pd_kwargs = { + "result": op(pd.col("float64_col")), # type: ignore + } + df = scalars_df.assign(**bf_kwargs) + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) + + +def test_pd_col_aggregate_of_aggregate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "result": (bpd.col("int64_col") - bpd.col("int64_col").mean()).mean(), + } + pd_kwargs = { + "result": (pd.col("int64_col") - pd.col("int64_col").mean()).mean(), # type: ignore + } + df = scalars_df.assign(**bf_kwargs) + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("op",), [ From 
4caf74ccaeb9608d91da864bb80eddf1148a1502 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 2 Mar 2026 14:59:30 -0800 Subject: [PATCH 27/29] feat: Add cloud_function_cpus option to remote_function (#2475) --- bigframes/functions/_function_client.py | 68 ++++++++++++++++++- bigframes/functions/_function_session.py | 8 ++- bigframes/pandas/__init__.py | 4 +- bigframes/session/__init__.py | 8 ++- .../large/functions/test_remote_function.py | 49 +++++++++---- 5 files changed, 120 insertions(+), 17 deletions(-) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index a82217da035..be9ff0956ef 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -65,6 +65,8 @@ # BQ managed functions (@udf) currently only support Python 3.11. _MANAGED_FUNC_PYTHON_VERSION = "python-3.11" +_DEFAULT_FUNCTION_MEMORY_MIB = 1024 + class FunctionClient: # Wait time (in seconds) for an IAM binding to take effect after creation. 
@@ -402,8 +404,12 @@ def create_cloud_function( is_row_processor=False, vpc_connector=None, vpc_connector_egress_settings="private-ranges-only", - memory_mib=1024, + memory_mib=None, + cpus=None, ingress_settings="internal-only", + workers=None, + threads=None, + concurrency=None, ): """Create a cloud function from the given user defined function.""" @@ -486,6 +492,8 @@ def create_cloud_function( function.service_config = functions_v2.ServiceConfig() if memory_mib is not None: function.service_config.available_memory = f"{memory_mib}Mi" + if cpus is not None: + function.service_config.available_cpu = str(cpus) if timeout_seconds is not None: if timeout_seconds > 1200: raise bf_formatting.create_exception_with_feedback_link( @@ -517,6 +525,20 @@ def create_cloud_function( function.service_config.service_account_email = ( self._cloud_function_service_account ) + if concurrency: + function.service_config.max_instance_request_concurrency = concurrency + + # Functions framework use environment variables to pass config to gunicorn + # See https://github.com/GoogleCloudPlatform/functions-framework-python/issues/241 + # Code: https://github.com/GoogleCloudPlatform/functions-framework-python/blob/v3.10.1/src/functions_framework/_http/gunicorn.py#L37-L43 + env_vars = {} + if workers: + env_vars["WORKERS"] = str(workers) + if threads: + env_vars["THREADS"] = str(threads) + if env_vars: + function.service_config.environment_variables = env_vars + if ingress_settings not in _INGRESS_SETTINGS_MAP: raise bf_formatting.create_exception_with_feedback_link( ValueError, @@ -581,6 +603,7 @@ def provision_bq_remote_function( cloud_function_vpc_connector, cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib, + cloud_function_cpus, cloud_function_ingress_settings, bq_metadata, ): @@ -616,6 +639,21 @@ def provision_bq_remote_function( ) cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) + if cloud_function_memory_mib is None: + cloud_function_memory_mib 
= _DEFAULT_FUNCTION_MEMORY_MIB + + # assumption is most bigframes functions are cpu bound, single-threaded and many won't release GIL + # therefore, want to allocate a worker for each cpu, and allow a concurrent request per worker + expected_milli_cpus = ( + int(cloud_function_cpus * 1000) + if (cloud_function_cpus is not None) + else _infer_milli_cpus_from_memory(cloud_function_memory_mib) + ) + workers = -(expected_milli_cpus // -1000) # ceil(cpus) without invoking floats + threads = 4 # (per worker) + # max concurrency==1 for vcpus < 1 hard limit from cloud run + concurrency = (workers * threads) if (expected_milli_cpus >= 1000) else 1 + # Create the cloud function if it does not exist if not cf_endpoint: cf_endpoint = self.create_cloud_function( @@ -630,7 +668,11 @@ def provision_bq_remote_function( vpc_connector=cloud_function_vpc_connector, vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, memory_mib=cloud_function_memory_mib, + cpus=cloud_function_cpus, ingress_settings=cloud_function_ingress_settings, + workers=workers, + threads=threads, + concurrency=concurrency, ) else: logger.info(f"Cloud function {cloud_function_name} already exists.") @@ -696,3 +738,27 @@ def get_remote_function_specs(self, remote_function_name): # Note: list_routines doesn't make an API request until we iterate on the response object. 
pass return (http_endpoint, bq_connection) + + +def _infer_milli_cpus_from_memory(memory_mib: int) -> int: + # observed values, not formally documented by cloud run functions + if memory_mib < 128: + raise ValueError("Cloud run supports at minimum 128MiB per instance") + elif memory_mib == 128: + return 83 + elif memory_mib <= 256: + return 167 + elif memory_mib <= 512: + return 333 + elif memory_mib <= 1024: + return 583 + elif memory_mib <= 2048: + return 1000 + elif memory_mib <= 8192: + return 2000 + elif memory_mib <= 16384: + return 4000 + elif memory_mib <= 32768: + return 8000 + else: + raise ValueError("Cloud run supports at most 32768MiB per instance") diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index b0fc25219af..7541936ede3 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -248,7 +248,8 @@ def remote_function( cloud_function_vpc_connector_egress_settings: Optional[ Literal["all", "private-ranges-only", "unspecified"] ] = None, - cloud_function_memory_mib: Optional[int] = 1024, + cloud_function_memory_mib: Optional[int] = None, + cloud_function_cpus: Optional[float] = None, cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", @@ -444,6 +445,10 @@ def remote_function( default memory of cloud functions be allocated, pass `None`. See for more details https://cloud.google.com/functions/docs/configuring/memory. + cloud_function_cpus (float, Optional): + The number of cpus to allocate for the cloud + function (2nd gen) created. + https://docs.cloud.google.com/run/docs/configuring/services/cpu. cloud_function_ingress_settings (str, Optional): Ingress settings controls dictating what traffic can reach the function. Options are: `all`, `internal-only`, or `internal-and-gclb`. 
@@ -638,6 +643,7 @@ def wrapper(func): cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib=cloud_function_memory_mib, + cloud_function_cpus=cloud_function_cpus, cloud_function_ingress_settings=cloud_function_ingress_settings, bq_metadata=bqrf_metadata, ) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index a70e319747a..fcb60bf7782 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -88,7 +88,8 @@ def remote_function( cloud_function_vpc_connector_egress_settings: Optional[ Literal["all", "private-ranges-only", "unspecified"] ] = None, - cloud_function_memory_mib: Optional[int] = 1024, + cloud_function_memory_mib: Optional[int] = None, + cloud_function_cpus: Optional[float] = None, cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", @@ -112,6 +113,7 @@ def remote_function( cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib=cloud_function_memory_mib, + cloud_function_cpus=cloud_function_cpus, cloud_function_ingress_settings=cloud_function_ingress_settings, cloud_build_service_account=cloud_build_service_account, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c42270c4ddc..7ea6e999545 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1536,7 +1536,8 @@ def remote_function( cloud_function_vpc_connector_egress_settings: Optional[ Literal["all", "private-ranges-only", "unspecified"] ] = None, - cloud_function_memory_mib: Optional[int] = 1024, + cloud_function_memory_mib: Optional[int] = None, + cloud_function_cpus: Optional[float] = None, cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", @@ -1717,6 
+1718,10 @@ def remote_function( default memory of cloud functions be allocated, pass `None`. See for more details https://cloud.google.com/functions/docs/configuring/memory. + cloud_function_cpus (float, Optional): + The number of cpus to allocate for the cloud + function (2nd gen) created. + https://docs.cloud.google.com/run/docs/configuring/services/cpu. cloud_function_ingress_settings (str, Optional): Ingress settings controls dictating what traffic can reach the function. Options are: `all`, `internal-only`, or `internal-and-gclb`. @@ -1767,6 +1772,7 @@ def remote_function( cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib=cloud_function_memory_mib, + cloud_function_cpus=cloud_function_cpus, cloud_function_ingress_settings=cloud_function_ingress_settings, cloud_build_service_account=cloud_build_service_account, ) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 4b5d143c157..079b909e7aa 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2089,19 +2089,40 @@ def foo_list(x: pandas.Series, y0: float, y1, y2) -> list[str]: @pytest.mark.parametrize( - ("memory_mib_args", "expected_memory"), + ( + "memory_mib_args", + "expected_memory", + "expected_cpus", + ), [ - pytest.param({}, "1024Mi", id="no-set"), - pytest.param({"cloud_function_memory_mib": None}, "256M", id="set-None"), - pytest.param({"cloud_function_memory_mib": 128}, "128Mi", id="set-128"), - pytest.param({"cloud_function_memory_mib": 1024}, "1024Mi", id="set-1024"), - pytest.param({"cloud_function_memory_mib": 4096}, "4096Mi", id="set-4096"), - pytest.param({"cloud_function_memory_mib": 32768}, "32768Mi", id="set-32768"), + pytest.param({}, "1024Mi", None, id="no-set"), + pytest.param( + {"cloud_function_memory_mib": None}, 
"1024Mi", None, id="set-None" + ), + pytest.param({"cloud_function_memory_mib": 128}, "128Mi", None, id="set-128"), + pytest.param( + {"cloud_function_memory_mib": 512, "cloud_function_cpus": 0.6}, + "512Mi", + "0.6", + id="set-512", + ), + pytest.param( + {"cloud_function_memory_mib": 1024}, "1024Mi", None, id="set-1024" + ), + pytest.param( + {"cloud_function_memory_mib": 4096, "cloud_function_cpus": 4}, + "4096Mi", + "4", + id="set-4096", + ), + pytest.param( + {"cloud_function_memory_mib": 32768}, "32768Mi", None, id="set-32768" + ), ], ) @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_gcf_memory( - session, scalars_dfs, memory_mib_args, expected_memory + session, scalars_dfs, memory_mib_args, expected_memory, expected_cpus ): try: @@ -2117,6 +2138,12 @@ def square(x: int) -> int: name=square_remote.bigframes_cloud_function ) assert gcf.service_config.available_memory == expected_memory + if expected_cpus is not None: + assert gcf.service_config.available_cpu == expected_cpus + if float(gcf.service_config.available_cpu) >= 1.0: + assert gcf.service_config.max_instance_request_concurrency >= float( + gcf.service_config.available_cpu + ) scalars_df, scalars_pandas_df = scalars_dfs @@ -2138,12 +2165,8 @@ def square(x: int) -> int: pytest.param(32769, id="set-32769-too-high"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_remote_function_gcf_memory_unsupported(session, memory_mib): - with pytest.raises( - google.api_core.exceptions.InvalidArgument, - match="Invalid value specified for container memory", - ): + with pytest.raises(ValueError, match="Cloud run supports"): @session.remote_function( reuse=False, From 8a10baebe8e50bb98a1ee953de82af41c356e491 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 2 Mar 2026 16:46:30 -0800 Subject: [PATCH 28/29] chore: librarian update image pull request: 20260303T000950Z (#2487) feat: update image to 
us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:160860d189ff1c2f7515638478823712fa5b243e27ccc33a2728669fa1e2ed0c --- .librarian/state.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 73bd5d0bc6c..a19ad329e8e 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,4 +1,4 @@ -image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:1a2a85ab507aea26d787c06cc7979decb117164c81dd78a745982dfda80d4f68 +image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:160860d189ff1c2f7515638478823712fa5b243e27ccc33a2728669fa1e2ed0c libraries: - id: bigframes version: 2.36.0 From a8ba8791c4fd1b35c52d1af963d8c80691582173 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 3 Mar 2026 16:05:34 -0800 Subject: [PATCH 29/29] chore: librarian release pull request: 20260303T181914Z (#2489) PR created by the Librarian CLI to initialize a release. Merging this PR will auto trigger a release. Librarian Version: v0.8.3 Language Image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:160860d189ff1c2f7515638478823712fa5b243e27ccc33a2728669fa1e2ed0c
bigframes: v2.37.0 ## [v2.37.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.36.0...v2.37.0) (2026-03-03) ### Features * Add cloud_function_cpus option to remote_function (#2475) ([4caf74cc](https://github.com/googleapis/python-bigquery-dataframes/commit/4caf74cc)) * add display.render_mode to control DataFrame/Series visualization (#2413) ([7813eaa6](https://github.com/googleapis/python-bigquery-dataframes/commit/7813eaa6)) * Support pd.col expressions with .loc and getitem (#2473) ([ae5c8b32](https://github.com/googleapis/python-bigquery-dataframes/commit/ae5c8b32)) * add support for Python 3.14 (#2232) ([c25a6d01](https://github.com/googleapis/python-bigquery-dataframes/commit/c25a6d01)) * Support pd.col simple aggregates (#2480) ([cb00daab](https://github.com/googleapis/python-bigquery-dataframes/commit/cb00daab)) * add dt.tz_localize() (#2469) ([f70f93a1](https://github.com/googleapis/python-bigquery-dataframes/commit/f70f93a1)) * Update bigquery.ai.generate_table output_schema to allow Mapping type (#2463) ([f7fd1895](https://github.com/googleapis/python-bigquery-dataframes/commit/f7fd1895)) ### Bug Fixes * upload local data through write API if nested JSONs detected (#2478) ([01dc5a34](https://github.com/googleapis/python-bigquery-dataframes/commit/01dc5a34)) * allow IsInOp with same dtypes regardless nullable (#2466) ([1d81b414](https://github.com/googleapis/python-bigquery-dataframes/commit/1d81b414)) ### Documentation * Add code examples to configuration docstrings (#2352) ([3c21993e](https://github.com/googleapis/python-bigquery-dataframes/commit/3c21993e)) * Move readme content to new User Guide section (#2464) ([61a94845](https://github.com/googleapis/python-bigquery-dataframes/commit/61a94845)) * add code sample and docstring for bpd.options.experiments.sql_compiler (#2474) ([867951bc](https://github.com/googleapis/python-bigquery-dataframes/commit/867951bc)) * use direct API for image (#2465) 
([8a1a82f7](https://github.com/googleapis/python-bigquery-dataframes/commit/8a1a82f7)) * Fix recall_score doc example (#2477) ([a6f499c1](https://github.com/googleapis/python-bigquery-dataframes/commit/a6f499c1)) * Skip inherited methods, use autosummary only for big classes (#2470) ([a9512498](https://github.com/googleapis/python-bigquery-dataframes/commit/a9512498)) * add bigframes default connection warning (#2471) ([f1bbba23](https://github.com/googleapis/python-bigquery-dataframes/commit/f1bbba23))
--- .librarian/state.yaml | 2 +- CHANGELOG.md | 30 +++++++++++++++++++++++ bigframes/version.py | 4 +-- third_party/bigframes_vendored/version.py | 4 +-- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/.librarian/state.yaml b/.librarian/state.yaml index a19ad329e8e..dc6c05b541d 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:160860d189ff1c2f7515638478823712fa5b243e27ccc33a2728669fa1e2ed0c libraries: - id: bigframes - version: 2.36.0 + version: 2.37.0 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index ebc8c5598ec..b69b87bd451 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,36 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.37.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.36.0...v2.37.0) (2026-03-03) + + +### Documentation + +* Fix recall_score doc example (#2477) ([a6f499c1e225a962b53621158f9d4a19ca220ccd](https://github.com/googleapis/python-bigquery-dataframes/commit/a6f499c1e225a962b53621158f9d4a19ca220ccd)) +* add code sample and docstring for bpd.options.experiments.sql_compiler (#2474) ([867951bcabcff12e2fce88143b45d929d3237088](https://github.com/googleapis/python-bigquery-dataframes/commit/867951bcabcff12e2fce88143b45d929d3237088)) +* use direct API for image (#2465) ([8a1a82f7a0fd224f2b075c68ab116d1f580d1d82](https://github.com/googleapis/python-bigquery-dataframes/commit/8a1a82f7a0fd224f2b075c68ab116d1f580d1d82)) +* add bigframes default connection warning (#2471) ([f1bbba23667f01d3b8e7c51b18fe64641a4b135f](https://github.com/googleapis/python-bigquery-dataframes/commit/f1bbba23667f01d3b8e7c51b18fe64641a4b135f)) +* Move readme content to new User Guide section (#2464) ([61a948451baeb1caa323e721ad88b31c7cd0b3cb](https://github.com/googleapis/python-bigquery-dataframes/commit/61a948451baeb1caa323e721ad88b31c7cd0b3cb)) +* 
Skip inherited methods, use autosummary only for big classes (#2470) ([a9512498ef39b9d5260cad2ca0513c701a6d3592](https://github.com/googleapis/python-bigquery-dataframes/commit/a9512498ef39b9d5260cad2ca0513c701a6d3592)) +* Add code examples to configuration docstrings (#2352) ([3c21993e6fca474c32f3c2371c41ef2be146267e](https://github.com/googleapis/python-bigquery-dataframes/commit/3c21993e6fca474c32f3c2371c41ef2be146267e)) + + +### Features + +* Add cloud_function_cpus option to remote_function (#2475) ([4caf74ccaeb9608d91da864bb80eddf1148a1502](https://github.com/googleapis/python-bigquery-dataframes/commit/4caf74ccaeb9608d91da864bb80eddf1148a1502)) +* Support pd.col simple aggregates (#2480) ([cb00daabce49f067be8e16627166dda00d5d8134](https://github.com/googleapis/python-bigquery-dataframes/commit/cb00daabce49f067be8e16627166dda00d5d8134)) +* add display.render_mode to control DataFrame/Series visualization (#2413) ([7813eaa6fa2ae42943b90583e600c95beaf5d75e](https://github.com/googleapis/python-bigquery-dataframes/commit/7813eaa6fa2ae42943b90583e600c95beaf5d75e)) +* add support for Python 3.14 (#2232) ([c25a6d0151380dde74368a35e13deb7a930b494f](https://github.com/googleapis/python-bigquery-dataframes/commit/c25a6d0151380dde74368a35e13deb7a930b494f)) +* Support pd.col expressions with .loc and getitem (#2473) ([ae5c8b322765aef51eed016bfacaff5a7a917a7b](https://github.com/googleapis/python-bigquery-dataframes/commit/ae5c8b322765aef51eed016bfacaff5a7a917a7b)) +* add dt.tz_localize() (#2469) ([f70f93a1227add1627d522d7e55a37f42fc3549e](https://github.com/googleapis/python-bigquery-dataframes/commit/f70f93a1227add1627d522d7e55a37f42fc3549e)) +* Update bigquery.ai.generate_table output_schema to allow Mapping type (#2463) ([f7fd1895e64a133fe63eddeb90f57a42a35c29b2](https://github.com/googleapis/python-bigquery-dataframes/commit/f7fd1895e64a133fe63eddeb90f57a42a35c29b2)) + + +### Bug Fixes + +* upload local data through write API if nested JSONs detected (#2478) 
([01dc5a34e09171351575d5cbdc9f301e505e1567](https://github.com/googleapis/python-bigquery-dataframes/commit/01dc5a34e09171351575d5cbdc9f301e505e1567)) +* allow IsInOp with same dtypes regardless nullable (#2466) ([1d81b414acbc964502ca624eae72cdb8c14e1576](https://github.com/googleapis/python-bigquery-dataframes/commit/1d81b414acbc964502ca624eae72cdb8c14e1576)) + ## [2.36.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.35.0...v2.36.0) (2026-02-17) diff --git a/bigframes/version.py b/bigframes/version.py index eaddff2fcbb..012a4502914 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.36.0" +__version__ = "2.37.0" # {x-release-please-start-date} -__release_date__ = "2026-02-17" +__release_date__ = "2026-03-03" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index eaddff2fcbb..012a4502914 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.36.0" +__version__ = "2.37.0" # {x-release-please-start-date} -__release_date__ = "2026-02-17" +__release_date__ = "2026-03-03" # {x-release-please-end}