diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 4dba64808eb..21903a51248 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677 libraries: - id: bigframes - version: 2.33.0 + version: 2.34.0 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index 090cf2ee57b..f54231f5401 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,22 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.33.0...v2.34.0) (2026-02-02) + + +### Features + +* add `bigframes.pandas.options.experiments.sql_compiler` for switching the backend compiler (#2417) ([7eba6ee03f07938315d99e2aeaf72368c02074cf](https://github.com/googleapis/python-bigquery-dataframes/commit/7eba6ee03f07938315d99e2aeaf72368c02074cf)) +* add bigquery.ml.generate_embedding function (#2422) ([35f3f5e6f8c64b47e6e7214034f96f047785e647](https://github.com/googleapis/python-bigquery-dataframes/commit/35f3f5e6f8c64b47e6e7214034f96f047785e647)) +* add bigquery.create_external_table method (#2415) ([76db2956e505aec4f1055118ac7ca523facc10ff](https://github.com/googleapis/python-bigquery-dataframes/commit/76db2956e505aec4f1055118ac7ca523facc10ff)) +* add deprecation warnings for .blob accessor and read_gbq_object_table (#2408) ([7261a4ea5cdab6b30f5bc333501648c60e70be59](https://github.com/googleapis/python-bigquery-dataframes/commit/7261a4ea5cdab6b30f5bc333501648c60e70be59)) +* add bigquery.ml.generate_text function (#2403) ([5ac681028624de15e31f0c2ae360b47b2dcf1e8d](https://github.com/googleapis/python-bigquery-dataframes/commit/5ac681028624de15e31f0c2ae360b47b2dcf1e8d)) + + +### Bug Fixes + +* broken job url (#2411) 
([fcb5bc1761c656e1aec61dbcf96a36d436833b7a](https://github.com/googleapis/python-bigquery-dataframes/commit/fcb5bc1761c656e1aec61dbcf96a36d436833b7a)) + ## [2.33.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.32.0...v2.33.0) (2026-01-22) diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index 024de392c06..ee54e017fe3 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Literal, Optional import warnings import bigframes @@ -27,6 +27,7 @@ class ExperimentOptions: def __init__(self): self._semantic_operators: bool = False self._ai_operators: bool = False + self._sql_compiler: Literal["legacy", "stable", "experimental"] = "stable" @property def semantic_operators(self) -> bool: @@ -55,6 +56,24 @@ def ai_operators(self, value: bool): warnings.warn(msg, category=bfe.PreviewWarning) self._ai_operators = value + @property + def sql_compiler(self) -> Literal["legacy", "stable", "experimental"]: + return self._sql_compiler + + @sql_compiler.setter + def sql_compiler(self, value: Literal["legacy", "stable", "experimental"]): + if value not in ["legacy", "stable", "experimental"]: + raise ValueError( + "sql_compiler must be one of 'legacy', 'stable', or 'experimental'" + ) + if value == "experimental": + msg = bfe.format_message( + "The experimental SQL compiler is still under experiments, and is subject " + "to change in the future." 
+ ) + warnings.warn(msg, category=FutureWarning) + self._sql_compiler = value + @property def blob(self) -> bool: msg = bfe.format_message( diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 0bbbc418e6d..150fe5efc0c 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -60,6 +60,7 @@ from bigframes.bigquery._operations.search import create_vector_index, vector_search from bigframes.bigquery._operations.sql import sql_scalar from bigframes.bigquery._operations.struct import struct +from bigframes.bigquery._operations.table import create_external_table from bigframes.core.logging import log_adapter _functions = [ @@ -104,6 +105,8 @@ sql_scalar, # struct ops struct, + # table ops + create_external_table, ] _module = sys.modules[__name__] @@ -155,6 +158,8 @@ "sql_scalar", # struct ops "struct", + # table ops + "create_external_table", # Modules / SQL namespaces "ai", "ml", diff --git a/bigframes/bigquery/_operations/ml.py b/bigframes/bigquery/_operations/ml.py index e5a5c5dfb68..cc5a961af74 100644 --- a/bigframes/bigquery/_operations/ml.py +++ b/bigframes/bigquery/_operations/ml.py @@ -14,7 +14,7 @@ from __future__ import annotations -from typing import cast, Mapping, Optional, Union +from typing import cast, List, Mapping, Optional, Union import bigframes_vendored.constants import google.cloud.bigquery @@ -431,3 +431,152 @@ def transform( return bpd.read_gbq_query(sql) else: return session.read_gbq_query(sql) + + +@log_adapter.method_logger(custom_base_name="bigquery_ml") +def generate_text( + model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], + input_: Union[pd.DataFrame, dataframe.DataFrame, str], + *, + temperature: Optional[float] = None, + max_output_tokens: Optional[int] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + flatten_json_output: Optional[bool] = None, + stop_sequences: Optional[List[str]] = None, + ground_with_google_search: Optional[bool] = None, + 
request_type: Optional[str] = None, +) -> dataframe.DataFrame: + """ + Generates text using a BigQuery ML model. + + See the `BigQuery ML GENERATE_TEXT function syntax + `_ + for additional reference. + + Args: + model (bigframes.ml.base.BaseEstimator or str): + The model to use for text generation. + input_ (Union[bigframes.pandas.DataFrame, str]): + The DataFrame or query to use for text generation. + temperature (float, optional): + A FLOAT64 value that is used for sampling promiscuity. The value + must be in the range ``[0.0, 1.0]``. A lower temperature works well + for prompts that expect a more deterministic and less open-ended + or creative response, while a higher temperature can lead to more + diverse or creative results. A temperature of ``0`` is + deterministic, meaning that the highest probability response is + always selected. + max_output_tokens (int, optional): + An INT64 value that sets the maximum number of tokens in the + generated text. + top_k (int, optional): + An INT64 value that changes how the model selects tokens for + output. A ``top_k`` of ``1`` means the next selected token is the + most probable among all tokens in the model's vocabulary. A + ``top_k`` of ``3`` means that the next token is selected from + among the three most probable tokens by using temperature. The + default value is ``40``. + top_p (float, optional): + A FLOAT64 value that changes how the model selects tokens for + output. Tokens are selected from most probable to least probable + until the sum of their probabilities equals the ``top_p`` value. + For example, if tokens A, B, and C have a probability of 0.3, 0.2, + and 0.1 and the ``top_p`` value is ``0.5``, then the model will + select either A or B as the next token by using temperature. The + default value is ``0.95``. + flatten_json_output (bool, optional): + A BOOL value that determines the content of the generated JSON column. 
+ stop_sequences (List[str], optional): + An ARRAY value that contains the stop sequences for the model. + ground_with_google_search (bool, optional): + A BOOL value that determines whether to ground the model with Google Search. + request_type (str, optional): + A STRING value that contains the request type for the model. + + Returns: + bigframes.pandas.DataFrame: + The generated text. + """ + import bigframes.pandas as bpd + + model_name, session = _get_model_name_and_session(model, input_) + table_sql = _to_sql(input_) + + sql = bigframes.core.sql.ml.generate_text( + model_name=model_name, + table=table_sql, + temperature=temperature, + max_output_tokens=max_output_tokens, + top_k=top_k, + top_p=top_p, + flatten_json_output=flatten_json_output, + stop_sequences=stop_sequences, + ground_with_google_search=ground_with_google_search, + request_type=request_type, + ) + + if session is None: + return bpd.read_gbq_query(sql) + else: + return session.read_gbq_query(sql) + + +@log_adapter.method_logger(custom_base_name="bigquery_ml") +def generate_embedding( + model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], + input_: Union[pd.DataFrame, dataframe.DataFrame, str], + *, + flatten_json_output: Optional[bool] = None, + task_type: Optional[str] = None, + output_dimensionality: Optional[int] = None, +) -> dataframe.DataFrame: + """ + Generates text embedding using a BigQuery ML model. + + See the `BigQuery ML GENERATE_EMBEDDING function syntax + `_ + for additional reference. + + Args: + model (bigframes.ml.base.BaseEstimator or str): + The model to use for text embedding. + input_ (Union[bigframes.pandas.DataFrame, str]): + The DataFrame or query to use for text embedding. + flatten_json_output (bool, optional): + A BOOL value that determines the content of the generated JSON column. + task_type (str, optional): + A STRING value that specifies the intended downstream application task. 
+ Supported values are: + - `RETRIEVAL_QUERY` + - `RETRIEVAL_DOCUMENT` + - `SEMANTIC_SIMILARITY` + - `CLASSIFICATION` + - `CLUSTERING` + - `QUESTION_ANSWERING` + - `FACT_VERIFICATION` + - `CODE_RETRIEVAL_QUERY` + output_dimensionality (int, optional): + An INT64 value that specifies the size of the output embedding. + + Returns: + bigframes.pandas.DataFrame: + The generated text embedding. + """ + import bigframes.pandas as bpd + + model_name, session = _get_model_name_and_session(model, input_) + table_sql = _to_sql(input_) + + sql = bigframes.core.sql.ml.generate_embedding( + model_name=model_name, + table=table_sql, + flatten_json_output=flatten_json_output, + task_type=task_type, + output_dimensionality=output_dimensionality, + ) + + if session is None: + return bpd.read_gbq_query(sql) + else: + return session.read_gbq_query(sql) diff --git a/bigframes/bigquery/_operations/table.py b/bigframes/bigquery/_operations/table.py new file mode 100644 index 00000000000..c90f88dcd6f --- /dev/null +++ b/bigframes/bigquery/_operations/table.py @@ -0,0 +1,99 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import Mapping, Optional, Union + +import google.cloud.bigquery +import pandas as pd + +import bigframes.core.logging.log_adapter as log_adapter +import bigframes.core.sql.table +import bigframes.session + + +def _get_table_metadata( + *, + bqclient: google.cloud.bigquery.Client, + table_name: str, +) -> pd.Series: + table_metadata = bqclient.get_table(table_name) + table_dict = table_metadata.to_api_repr() + return pd.Series(table_dict) + + +@log_adapter.method_logger(custom_base_name="bigquery_table") +def create_external_table( + table_name: str, + *, + replace: bool = False, + if_not_exists: bool = False, + columns: Optional[Mapping[str, str]] = None, + partition_columns: Optional[Mapping[str, str]] = None, + connection_name: Optional[str] = None, + options: Mapping[str, Union[str, int, float, bool, list]], + session: Optional[bigframes.session.Session] = None, +) -> pd.Series: + """ + Creates a BigQuery external table. + + See the `BigQuery CREATE EXTERNAL TABLE DDL syntax + `_ + for additional reference. + + Args: + table_name (str): + The name of the table in BigQuery. + replace (bool, default False): + Whether to replace the table if it already exists. + if_not_exists (bool, default False): + Whether to ignore the error if the table already exists. + columns (Mapping[str, str], optional): + The table's schema. + partition_columns (Mapping[str, str], optional): + The table's partition columns. + connection_name (str, optional): + The connection to use for the table. + options (Mapping[str, Union[str, int, float, bool, list]]): + The OPTIONS clause, which specifies the table options. + session (bigframes.session.Session, optional): + The session to use. If not provided, the default session is used. + + Returns: + pandas.Series: + A Series with object dtype containing the table metadata. Reference + the `BigQuery Table REST API reference + `_ + for available fields. 
+ """ + import bigframes.pandas as bpd + + sql = bigframes.core.sql.table.create_external_table_ddl( + table_name=table_name, + replace=replace, + if_not_exists=if_not_exists, + columns=columns, + partition_columns=partition_columns, + connection_name=connection_name, + options=options, + ) + + if session is None: + bpd.read_gbq_query(sql) + session = bpd.get_global_session() + else: + session.read_gbq_query(sql) + + return _get_table_metadata(bqclient=session.bqclient, table_name=table_name) diff --git a/bigframes/bigquery/ml.py b/bigframes/bigquery/ml.py index 6ceadb324d5..b1b33d0dbd4 100644 --- a/bigframes/bigquery/ml.py +++ b/bigframes/bigquery/ml.py @@ -23,6 +23,8 @@ create_model, evaluate, explain_predict, + generate_embedding, + generate_text, global_explain, predict, transform, @@ -35,4 +37,6 @@ "explain_predict", "global_explain", "transform", + "generate_text", + "generate_embedding", ] diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index 68c36df2889..15d2d0e52c1 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -13,13 +13,28 @@ # limitations under the License. 
from __future__ import annotations +from typing import Any + +from bigframes import options from bigframes.core.compile.api import test_only_ibis_inferred_schema from bigframes.core.compile.configs import CompileRequest, CompileResult -from bigframes.core.compile.ibis_compiler.ibis_compiler import compile_sql + + +def compiler() -> Any: + """Returns the appropriate compiler module based on session options.""" + if options.experiments.sql_compiler == "experimental": + import bigframes.core.compile.sqlglot.compiler as sqlglot_compiler + + return sqlglot_compiler + else: + import bigframes.core.compile.ibis_compiler.ibis_compiler as ibis_compiler + + return ibis_compiler + __all__ = [ "test_only_ibis_inferred_schema", - "compile_sql", "CompileRequest", "CompileResult", + "compiler", ] diff --git a/bigframes/core/compile/configs.py b/bigframes/core/compile/configs.py index 5ffca0cf43b..62c28f87cae 100644 --- a/bigframes/core/compile/configs.py +++ b/bigframes/core/compile/configs.py @@ -34,3 +34,4 @@ class CompileResult: sql: str sql_schema: typing.Sequence[google.cloud.bigquery.SchemaField] row_order: typing.Optional[ordering.RowOrdering] + encoded_type_refs: str diff --git a/bigframes/core/compile/ibis_compiler/ibis_compiler.py b/bigframes/core/compile/ibis_compiler/ibis_compiler.py index 31cd9a0456b..9e209ea3b34 100644 --- a/bigframes/core/compile/ibis_compiler/ibis_compiler.py +++ b/bigframes/core/compile/ibis_compiler/ibis_compiler.py @@ -29,6 +29,7 @@ import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.configs as configs import bigframes.core.compile.explode +from bigframes.core.logging import data_types as data_type_logger import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering import bigframes.core.rewrite as rewrites @@ -56,15 +57,20 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: ) if request.sort_rows: result_node = cast(nodes.ResultNode, 
rewrites.column_pruning(result_node)) + encoded_type_refs = data_type_logger.encode_type_refs(result_node) sql = compile_result_node(result_node) return configs.CompileResult( - sql, result_node.schema.to_bigquery(), result_node.order_by + sql, + result_node.schema.to_bigquery(), + result_node.order_by, + encoded_type_refs, ) ordering: Optional[bf_ordering.RowOrdering] = result_node.order_by result_node = dataclasses.replace(result_node, order_by=None) result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node)) result_node = cast(nodes.ResultNode, rewrites.defer_selection(result_node)) + encoded_type_refs = data_type_logger.encode_type_refs(result_node) sql = compile_result_node(result_node) # Return the ordering iff no extra columns are needed to define the row order if ordering is not None: @@ -72,7 +78,9 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: ordering if ordering.referenced_columns.issubset(result_node.ids) else None ) assert (not request.materialize_all_order_keys) or (output_order is not None) - return configs.CompileResult(sql, result_node.schema.to_bigquery(), output_order) + return configs.CompileResult( + sql, result_node.schema.to_bigquery(), output_order, encoded_type_refs + ) def _replace_unsupported_ops(node: nodes.BigFrameNode): diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index 647e86d28ac..3b851390049 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -527,7 +527,7 @@ def _( else: result = apply_window_if_present(result, window) - if op.should_floor_result: + if op.should_floor_result or column.dtype == dtypes.TIMEDELTA_DTYPE: result = sge.Cast(this=sge.func("FLOOR", result), to="INT64") return result diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 
e77370892c0..f2c94f98c7a 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -34,6 +34,7 @@ from bigframes.core.compile.sqlglot.expressions import typed_expr import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler import bigframes.core.compile.sqlglot.sqlglot_ir as ir +from bigframes.core.logging import data_types as data_type_logger import bigframes.core.ordering as bf_ordering from bigframes.core.rewrite import schema_binding @@ -41,8 +42,6 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: """Compiles a BigFrameNode according to the request into SQL using SQLGlot.""" - # Generator for unique identifiers. - uid_gen = guid.SequentialUIDGenerator() output_names = tuple((expression.DerefOp(id), id.sql) for id in request.node.ids) result_node = nodes.ResultNode( request.node, @@ -61,29 +60,29 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: ) if request.sort_rows: result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node)) - result_node = _remap_variables(result_node, uid_gen) - result_node = typing.cast( - nodes.ResultNode, rewrite.defer_selection(result_node) - ) - sql = _compile_result_node(result_node, uid_gen) + encoded_type_refs = data_type_logger.encode_type_refs(result_node) + sql = _compile_result_node(result_node) return configs.CompileResult( - sql, result_node.schema.to_bigquery(), result_node.order_by + sql, + result_node.schema.to_bigquery(), + result_node.order_by, + encoded_type_refs, ) ordering: typing.Optional[bf_ordering.RowOrdering] = result_node.order_by result_node = dataclasses.replace(result_node, order_by=None) result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node)) - - result_node = _remap_variables(result_node, uid_gen) - result_node = typing.cast(nodes.ResultNode, rewrite.defer_selection(result_node)) - sql = _compile_result_node(result_node, uid_gen) + 
encoded_type_refs = data_type_logger.encode_type_refs(result_node) + sql = _compile_result_node(result_node) # Return the ordering iff no extra columns are needed to define the row order if ordering is not None: output_order = ( ordering if ordering.referenced_columns.issubset(result_node.ids) else None ) assert (not request.materialize_all_order_keys) or (output_order is not None) - return configs.CompileResult(sql, result_node.schema.to_bigquery(), output_order) + return configs.CompileResult( + sql, result_node.schema.to_bigquery(), output_order, encoded_type_refs + ) def _remap_variables( @@ -97,11 +96,16 @@ def _remap_variables( return typing.cast(nodes.ResultNode, result_node) -def _compile_result_node( - root: nodes.ResultNode, uid_gen: guid.SequentialUIDGenerator -) -> str: +def _compile_result_node(root: nodes.ResultNode) -> str: + # Create UIDs to standardize variable names and ensure consistent compilation + # of nodes using the same generator. + uid_gen = guid.SequentialUIDGenerator() + root = _remap_variables(root, uid_gen) + root = typing.cast(nodes.ResultNode, rewrite.defer_selection(root)) + # Have to bind schema as the final step before compilation. root = typing.cast(nodes.ResultNode, schema_binding.bind_schema_to_tree(root)) + selected_cols: tuple[tuple[str, sge.Expression], ...] = tuple( (name, scalar_compiler.scalar_op_compiler.compile_expression(ref)) for ref, name in root.output_cols @@ -127,7 +131,6 @@ def _compile_result_node( return sqlglot_ir.sql -@functools.lru_cache(maxsize=5000) def compile_node( node: nodes.BigFrameNode, uid_gen: guid.SequentialUIDGenerator ) -> ir.SQLGlotIR: @@ -266,10 +269,16 @@ def compile_concat(node: nodes.ConcatNode, *children: ir.SQLGlotIR) -> ir.SQLGlo assert len(children) >= 1 uid_gen = children[0].uid_gen - output_ids = [id.sql for id in node.output_ids] + # BigQuery `UNION` query takes the column names from the first `SELECT` clause. 
+ default_output_ids = [field.id.sql for field in node.child_nodes[0].fields] + output_aliases = [ + (default_output_id, output_id.sql) + for default_output_id, output_id in zip(default_output_ids, node.output_ids) + ] + return ir.SQLGlotIR.from_union( [child.expr for child in children], - output_ids=output_ids, + output_aliases=output_aliases, uid_gen=uid_gen, ) diff --git a/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/bigframes/core/compile/sqlglot/expressions/generic_ops.py index 2f486fc9d51..003a7296fcb 100644 --- a/bigframes/core/compile/sqlglot/expressions/generic_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/generic_ops.py @@ -252,7 +252,7 @@ def _cast_to_json(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression: sg_expr = expr.expr if from_type == dtypes.STRING_DTYPE: - func_name = "PARSE_JSON_IN_SAFE" if op.safe else "PARSE_JSON" + func_name = "SAFE.PARSE_JSON" if op.safe else "PARSE_JSON" return sge.func(func_name, sg_expr) if from_type in (dtypes.INT_DTYPE, dtypes.BOOL_DTYPE, dtypes.FLOAT_DTYPE): sg_expr = sge.Cast(this=sg_expr, to="STRING") diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index d4dc4ecc064..ad5da31206a 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -170,7 +170,9 @@ def from_query_string( cls, query_string: str, ) -> SQLGlotIR: - """Builds a SQLGlot expression from a query string""" + """Builds a SQLGlot expression from a query string. 
Wrapping the query + in a CTE can avoid the query parsing issue for unsupported syntax in + SQLGlot.""" uid_gen: guid.SequentialUIDGenerator = guid.SequentialUIDGenerator() cte_name = sge.to_identifier( next(uid_gen.get_uid_stream("bfcte_")), quoted=cls.quoted @@ -187,7 +189,7 @@ def from_query_string( def from_union( cls, selects: typing.Sequence[sge.Select], - output_ids: typing.Sequence[str], + output_aliases: typing.Sequence[typing.Tuple[str, str]], uid_gen: guid.SequentialUIDGenerator, ) -> SQLGlotIR: """Builds a SQLGlot expression by unioning of multiple select expressions.""" @@ -196,7 +198,7 @@ def from_union( ), f"At least two select expressions must be provided, but got {selects}." existing_ctes: list[sge.CTE] = [] - union_selects: list[sge.Expression] = [] + union_selects: list[sge.Select] = [] for select in selects: assert isinstance( select, sge.Select @@ -204,38 +206,28 @@ def from_union( select_expr = select.copy() select_expr, select_ctes = _pop_query_ctes(select_expr) - existing_ctes = [*existing_ctes, *select_ctes] - - new_cte_name = sge.to_identifier( - next(uid_gen.get_uid_stream("bfcte_")), quoted=cls.quoted - ) - new_cte = sge.CTE( - this=select_expr, - alias=new_cte_name, + existing_ctes = _merge_ctes(existing_ctes, select_ctes) + union_selects.append(select_expr) + + union_expr: sge.Query = union_selects[0].subquery() + for select in union_selects[1:]: + union_expr = sge.Union( + this=union_expr, + expression=select.subquery(), + distinct=False, + copy=False, ) - existing_ctes = [*existing_ctes, new_cte] - selections = [ - sge.Alias( - this=sge.to_identifier(expr.alias_or_name, quoted=cls.quoted), - alias=sge.to_identifier(output_id, quoted=cls.quoted), - ) - for expr, output_id in zip(select_expr.expressions, output_ids) - ] - union_selects.append( - sge.Select().select(*selections).from_(sge.Table(this=new_cte_name)) + selections = [ + sge.Alias( + this=sge.to_identifier(old_name, quoted=cls.quoted), + alias=sge.to_identifier(new_name, 
quoted=cls.quoted), ) - - union_expr = typing.cast( - sge.Select, - functools.reduce( - lambda x, y: sge.Union( - this=x, expression=y, distinct=False, copy=False - ), - union_selects, - ), + for old_name, new_name in output_aliases + ] + final_select_expr = ( + sge.Select().select(*selections).from_(union_expr.subquery()) ) - final_select_expr = sge.Select().select(sge.Star()).from_(union_expr.subquery()) final_select_expr = _set_query_ctes(final_select_expr, existing_ctes) return cls(expr=final_select_expr, uid_gen=uid_gen) @@ -345,7 +337,7 @@ def join( left_select, left_ctes = _pop_query_ctes(left_select) right_select, right_ctes = _pop_query_ctes(right_select) - merged_ctes = [*left_ctes, *right_ctes] + merged_ctes = _merge_ctes(left_ctes, right_ctes) join_on = _and( tuple( @@ -382,7 +374,7 @@ def isin_join( left_select, left_ctes = _pop_query_ctes(left_select) right_select, right_ctes = _pop_query_ctes(right_select) - merged_ctes = [*left_ctes, *right_ctes] + merged_ctes = _merge_ctes(left_ctes, right_ctes) left_condition = typed_expr.TypedExpr( sge.Column(this=conditions[0].expr, table=left_cte_name), @@ -835,6 +827,15 @@ def _set_query_ctes( return new_expr +def _merge_ctes(ctes1: list[sge.CTE], ctes2: list[sge.CTE]) -> list[sge.CTE]: + """Merges two lists of CTEs, de-duplicating by alias name.""" + seen = {cte.alias: cte for cte in ctes1} + for cte in ctes2: + if cte.alias not in seen: + seen[cte.alias] = cte + return list(seen.values()) + + def _pop_query_ctes( expr: sge.Select, ) -> tuple[sge.Select, list[sge.CTE]]: diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index 17493159250..d77c5aa4a0b 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -14,7 +14,9 @@ from __future__ import annotations -from typing import Dict, Mapping, Optional, Union +import collections.abc +import json +from typing import Any, Dict, List, Mapping, Optional, Union import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql 
@@ -100,14 +102,41 @@ def create_model_ddl( def _build_struct_sql( - struct_options: Mapping[str, Union[str, int, float, bool]] + struct_options: Mapping[ + str, + Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], + ] ) -> str: if not struct_options: return "" rendered_options = [] for option_name, option_value in struct_options.items(): - rendered_val = bigframes.core.sql.simple_literal(option_value) + if option_name == "model_params": + json_str = json.dumps(option_value) + # Escape single quotes for SQL string literal + sql_json_str = json_str.replace("'", "''") + rendered_val = f"JSON'{sql_json_str}'" + elif isinstance(option_value, collections.abc.Mapping): + struct_body = ", ".join( + [ + f"{bigframes.core.sql.simple_literal(v)} AS {k}" + for k, v in option_value.items() + ] + ) + rendered_val = f"STRUCT({struct_body})" + elif isinstance(option_value, list): + rendered_val = ( + "[" + + ", ".join( + [bigframes.core.sql.simple_literal(v) for v in option_value] + ) + + "]" + ) + elif isinstance(option_value, bool): + rendered_val = str(option_value).lower() + else: + rendered_val = bigframes.core.sql.simple_literal(option_value) rendered_options.append(f"{rendered_val} AS {option_name}") return f", STRUCT({', '.join(rendered_options)})" @@ -151,7 +180,7 @@ def predict( """Encode the ML.PREDICT statement. See https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-predict for reference. """ - struct_options = {} + struct_options: Dict[str, Union[str, int, float, bool]] = {} if threshold is not None: struct_options["threshold"] = threshold if keep_original_columns is not None: @@ -205,7 +234,7 @@ def global_explain( """Encode the ML.GLOBAL_EXPLAIN statement. See https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-global-explain for reference. 
""" - struct_options = {} + struct_options: Dict[str, Union[str, int, float, bool]] = {} if class_level_explain is not None: struct_options["class_level_explain"] = class_level_explain @@ -224,3 +253,74 @@ def transform( """ sql = f"SELECT * FROM ML.TRANSFORM(MODEL {googlesql.identifier(model_name)}, ({table}))\n" return sql + + +def generate_text( + model_name: str, + table: str, + *, + temperature: Optional[float] = None, + max_output_tokens: Optional[int] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + flatten_json_output: Optional[bool] = None, + stop_sequences: Optional[List[str]] = None, + ground_with_google_search: Optional[bool] = None, + request_type: Optional[str] = None, +) -> str: + """Encode the ML.GENERATE_TEXT statement. + See https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-text for reference. + """ + struct_options: Dict[ + str, + Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], + ] = {} + if temperature is not None: + struct_options["temperature"] = temperature + if max_output_tokens is not None: + struct_options["max_output_tokens"] = max_output_tokens + if top_k is not None: + struct_options["top_k"] = top_k + if top_p is not None: + struct_options["top_p"] = top_p + if flatten_json_output is not None: + struct_options["flatten_json_output"] = flatten_json_output + if stop_sequences is not None: + struct_options["stop_sequences"] = stop_sequences + if ground_with_google_search is not None: + struct_options["ground_with_google_search"] = ground_with_google_search + if request_type is not None: + struct_options["request_type"] = request_type + + sql = f"SELECT * FROM ML.GENERATE_TEXT(MODEL {googlesql.identifier(model_name)}, ({table})" + sql += _build_struct_sql(struct_options) + sql += ")\n" + return sql + + +def generate_embedding( + model_name: str, + table: str, + *, + flatten_json_output: Optional[bool] = None, + task_type: Optional[str] = 
None, + output_dimensionality: Optional[int] = None, +) -> str: + """Encode the ML.GENERATE_EMBEDDING statement. + See https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-embedding for reference. + """ + struct_options: Dict[ + str, + Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], + ] = {} + if flatten_json_output is not None: + struct_options["flatten_json_output"] = flatten_json_output + if task_type is not None: + struct_options["task_type"] = task_type + if output_dimensionality is not None: + struct_options["output_dimensionality"] = output_dimensionality + + sql = f"SELECT * FROM ML.GENERATE_EMBEDDING(MODEL {googlesql.identifier(model_name)}, ({table})" + sql += _build_struct_sql(struct_options) + sql += ")\n" + return sql diff --git a/bigframes/core/sql/table.py b/bigframes/core/sql/table.py new file mode 100644 index 00000000000..24a97ed1598 --- /dev/null +++ b/bigframes/core/sql/table.py @@ -0,0 +1,68 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import Mapping, Optional, Union + + +def create_external_table_ddl( + table_name: str, + *, + replace: bool = False, + if_not_exists: bool = False, + columns: Optional[Mapping[str, str]] = None, + partition_columns: Optional[Mapping[str, str]] = None, + connection_name: Optional[str] = None, + options: Mapping[str, Union[str, int, float, bool, list]], +) -> str: + """Generates the CREATE EXTERNAL TABLE DDL statement.""" + statement = ["CREATE"] + if replace: + statement.append("OR REPLACE") + statement.append("EXTERNAL TABLE") + if if_not_exists: + statement.append("IF NOT EXISTS") + statement.append(table_name) + + if columns: + column_defs = ", ".join([f"{name} {typ}" for name, typ in columns.items()]) + statement.append(f"({column_defs})") + + if connection_name: + statement.append(f"WITH CONNECTION `{connection_name}`") + + if partition_columns: + part_defs = ", ".join( + [f"{name} {typ}" for name, typ in partition_columns.items()] + ) + statement.append(f"WITH PARTITION COLUMNS ({part_defs})") + + if options: + opts = [] + for key, value in options.items(): + if isinstance(value, str): + value_sql = repr(value) + opts.append(f"{key} = {value_sql}") + elif isinstance(value, bool): + opts.append(f"{key} = {str(value).upper()}") + elif isinstance(value, list): + list_str = ", ".join([repr(v) for v in value]) + opts.append(f"{key} = [{list_str}]") + else: + opts.append(f"{key} = {value}") + options_str = ", ".join(opts) + statement.append(f"OPTIONS ({options_str})") + + return " ".join(statement) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 3c37a3470d5..1e3cdabdaf6 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -25,10 +25,10 @@ import google.api_core.exceptions as api_core_exceptions import google.cloud.bigquery as bigquery import humanize -import IPython -import IPython.display as display if TYPE_CHECKING: + from IPython import 
display + import bigframes.core.events GenericJob = Union[ @@ -160,6 +160,8 @@ def progress_callback( progress_bar = "notebook" if in_ipython() else "terminal" if progress_bar == "notebook": + import IPython.display as display + if ( isinstance(event, bigframes.core.events.ExecutionStarted) or current_display is None @@ -198,10 +200,12 @@ def progress_callback( display_id=current_display_id, ) elif isinstance(event, bigframes.core.events.ExecutionFinished): - display.update_display( - display.HTML(f"✅ Completed. {previous_display_html}"), - display_id=current_display_id, - ) + if previous_display_html: + display.update_display( + display.HTML(f"✅ Completed. {previous_display_html}"), + display_id=current_display_id, + ) + elif isinstance(event, bigframes.core.events.SessionClosed): display.update_display( display.HTML(f"Session {event.session_id} closed."), @@ -239,6 +243,8 @@ def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): try: if progress_bar == "notebook": + import IPython.display as display + display_id = str(random.random()) loading_bar = display.HTML(get_base_job_loading_html(job)) display.display(loading_bar, display_id=display_id) @@ -336,7 +342,7 @@ def get_job_url( """ if project_id is None or location is None or job_id is None: return None - return f"""https://console.cloud. 
google.com/bigquery?project={project_id}&j=bq:{location}:{job_id}&page=queryresults""" + return f"""https://console.cloud.google.com/bigquery?project={project_id}&j=bq:{location}:{job_id}&page=queryresults""" def render_bqquery_sent_event_html( @@ -607,4 +613,8 @@ def get_bytes_processed_string(val: Any): def in_ipython(): """Return True iff we're in a colab-like IPython.""" + try: + import IPython + except (ImportError, NameError): + return False return hasattr(IPython.get_ipython(), "kernel") diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 29f720b3ebc..9210addaa81 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -18,7 +18,6 @@ from typing import cast, Literal, Optional, Union import warnings -import IPython.display as ipy_display import pandas as pd import requests @@ -241,6 +240,8 @@ def display( width (int or None, default None): width in pixels that the image/video are constrained to. If unset, use the global setting in bigframes.options.display.blob_display_width, otherwise image/video's original size or ratio is used. No-op for other content types. height (int or None, default None): height in pixels that the image/video are constrained to. If unset, use the global setting in bigframes.options.display.blob_display_height, otherwise image/video's original size or ratio is used. No-op for other content types. """ + import IPython.display as ipy_display + width = width or bigframes.options.display.blob_display_width height = height or bigframes.options.display.blob_display_height diff --git a/bigframes/series.py b/bigframes/series.py index 814d59befff..0c74a0dd19c 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -317,6 +317,14 @@ def list(self) -> lists.ListAccessor: @property def blob(self) -> blob.BlobAccessor: + """ + Accessor for Blob operations. + """ + warnings.warn( + "The blob accessor is deprecated and will be removed in a future release. 
Use bigframes.bigquery.obj functions instead.", + category=bfe.ApiDeprecationWarning, + stacklevel=2, + ) return blob.BlobAccessor(self) @property diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index ca8fbf29196..757bb50a940 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -2291,6 +2291,11 @@ def read_gbq_object_table( bigframes.pandas.DataFrame: Result BigFrames DataFrame. """ + warnings.warn( + "read_gbq_object_table is deprecated and will be removed in a future release. Use read_gbq with 'ref' column instead.", + category=bfe.ApiDeprecationWarning, + stacklevel=2, + ) # TODO(garrettwu): switch to pseudocolumn when b/374988109 is done. table = self.bqclient.get_table(object_table) connection = table._properties["externalDataConfiguration"]["connectionId"] diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index ca19d1be86f..2f5ec035dc6 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -174,7 +174,9 @@ def to_sql( else array_value.node ) node = self._substitute_large_local_sources(node) - compiled = compile.compile_sql(compile.CompileRequest(node, sort_rows=ordered)) + compiled = compile.compiler().compile_sql( + compile.CompileRequest(node, sort_rows=ordered) + ) return compiled.sql def execute( @@ -290,7 +292,9 @@ def _export_gbq( # validate destination table existing_table = self._maybe_find_existing_table(spec) - compiled = compile.compile_sql(compile.CompileRequest(plan, sort_rows=False)) + compiled = compile.compiler().compile_sql( + compile.CompileRequest(plan, sort_rows=False) + ) sql = compiled.sql if (existing_table is not None) and _if_schema_match( @@ -318,6 +322,8 @@ def _export_gbq( clustering_fields=spec.cluster_cols if spec.cluster_cols else None, ) + # Attach data type usage to the job labels + job_config.labels["bigframes-dtypes"] = compiled.encoded_type_refs # TODO(swast): plumb through 
the api_name of the user-facing api that # caused this query. iterator, job = self._run_execute_query( @@ -641,7 +647,7 @@ def _execute_plan_gbq( ] cluster_cols = cluster_cols[:_MAX_CLUSTER_COLUMNS] - compiled = compile.compile_sql( + compiled = compile.compiler().compile_sql( compile.CompileRequest( plan, sort_rows=ordered, @@ -661,6 +667,8 @@ def _execute_plan_gbq( ) job_config.destination = destination_table + # Attach data type usage to the job labels + job_config.labels["bigframes-dtypes"] = compiled.encoded_type_refs iterator, query_job = self._run_execute_query( sql=compiled.sql, job_config=job_config, diff --git a/bigframes/session/direct_gbq_execution.py b/bigframes/session/direct_gbq_execution.py index 3ec10bf20f2..c60670b5425 100644 --- a/bigframes/session/direct_gbq_execution.py +++ b/bigframes/session/direct_gbq_execution.py @@ -20,7 +20,8 @@ import google.cloud.bigquery.table as bq_table from bigframes.core import compile, nodes -from bigframes.core.compile import sqlglot +import bigframes.core.compile.ibis_compiler.ibis_compiler as ibis_compiler +import bigframes.core.compile.sqlglot.compiler as sqlglot_compiler import bigframes.core.events from bigframes.session import executor, semi_executor import bigframes.session._io.bigquery as bq_io @@ -40,7 +41,9 @@ def __init__( ): self.bqclient = bqclient self._compile_fn = ( - compile.compile_sql if compiler == "ibis" else sqlglot.compile_sql + ibis_compiler.compile_sql + if compiler == "ibis" + else sqlglot_compiler.compile_sql ) self._publisher = publisher diff --git a/bigframes/version.py b/bigframes/version.py index 1e9ed79f825..a6862ee201c 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.33.0" +__version__ = "2.34.0" # {x-release-please-start-date} -__release_date__ = "2026-01-22" +__release_date__ = "2026-02-02" # {x-release-please-end} diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 5dd8af1c5f1..e9491610acf 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -91,9 +91,7 @@ "outputs": [ { "data": { - "text/html": [ - "✅ Completed. " - ], + "text/html": [], "text/plain": [ "" ] @@ -119,17 +117,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Annie 482\n", - " AL F 1910 Myrtle 104\n", - " AR F 1910 Lillian 56\n", - " CT F 1910 Anne 38\n", - " CT F 1910 Frances 45\n", - " FL F 1910 Margaret 53\n", - " GA F 1910 Mae 73\n", - " GA F 1910 Beatrice 96\n", - " GA F 1910 Lola 47\n", - " IA F 1910 Viola 49\n", + "state gender year name number\n", + " AL F 1910 Lillian 99\n", + " AL F 1910 Ruby 204\n", + " AL F 1910 Helen 76\n", + " AL F 1910 Eunice 41\n", + " AR F 1910 Dora 42\n", + " CA F 1910 Edna 62\n", + " CA F 1910 Helen 239\n", + " CO F 1910 Alice 46\n", + " FL F 1910 Willie 71\n", + " FL F 1910 Thelma 65\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -143,32 +141,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "220340b0", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "\n", - " Query started with request ID bigframes-dev:US.161c75bd-f9f8-4b21-8a45-1d7dfc659034.
SQL
SELECT\n",
-       "`state` AS `state`,\n",
-       "`gender` AS `gender`,\n",
-       "`year` AS `year`,\n",
-       "`name` AS `name`,\n",
-       "`number` AS `number`\n",
-       "FROM\n",
-       "(SELECT\n",
-       "  `t0`.`state`,\n",
-       "  `t0`.`gender`,\n",
-       "  `t0`.`year`,\n",
-       "  `t0`.`name`,\n",
-       "  `t0`.`number`,\n",
-       "  `t0`.`bfuid_col_2` AS `bfuid_col_15`\n",
-       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._c58be946_1477_4c00_b699_0ae022f13563_bqdf_8e323719-899f-4da2-89cd-2dbb53ab1dfc` AS `t0`)\n",
-       "ORDER BY `bfuid_col_15` ASC NULLS LAST
\n", - " " - ], + "text/html": [], "text/plain": [ "" ] @@ -178,11 +157,7 @@ }, { "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 7 seconds of slot time. [Job bigframes-dev:US.job_IuiJsjhfPtOrKuTIOqPIjnVLX820 details]\n", - " " - ], + "text/html": [], "text/plain": [ "" ] @@ -193,7 +168,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e68fbb9eb4d24bab837c77730d31c8a1", + "model_id": "6fb22be7f21f4d1dacd76dc62a1a7818", "version_major": 2, "version_minor": 1 }, @@ -229,80 +204,80 @@ " AL\n", " F\n", " 1910\n", - " Hazel\n", - " 51\n", + " Lillian\n", + " 99\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Lucy\n", - " 76\n", + " Ruby\n", + " 204\n", " \n", " \n", " 2\n", - " AR\n", + " AL\n", " F\n", " 1910\n", - " Nellie\n", - " 39\n", + " Helen\n", + " 76\n", " \n", " \n", " 3\n", - " AR\n", + " AL\n", " F\n", " 1910\n", - " Lena\n", - " 40\n", + " Eunice\n", + " 41\n", " \n", " \n", " 4\n", - " CO\n", + " AR\n", " F\n", " 1910\n", - " Thelma\n", - " 36\n", + " Dora\n", + " 42\n", " \n", " \n", " 5\n", - " CO\n", + " CA\n", " F\n", " 1910\n", - " Ruth\n", - " 68\n", + " Edna\n", + " 62\n", " \n", " \n", " 6\n", - " CT\n", + " CA\n", " F\n", " 1910\n", - " Elizabeth\n", - " 86\n", + " Helen\n", + " 239\n", " \n", " \n", " 7\n", - " DC\n", + " CO\n", " F\n", " 1910\n", - " Mary\n", - " 80\n", + " Alice\n", + " 46\n", " \n", " \n", " 8\n", " FL\n", " F\n", " 1910\n", - " Annie\n", - " 101\n", + " Willie\n", + " 71\n", " \n", " \n", " 9\n", " FL\n", " F\n", " 1910\n", - " Alma\n", - " 39\n", + " Thelma\n", + " 65\n", " \n", " \n", "\n", @@ -310,67 +285,25 @@ "[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Hazel 51\n", - " AL F 1910 Lucy 76\n", - " AR F 1910 Nellie 39\n", - " AR F 1910 Lena 40\n", - " CO F 1910 Thelma 36\n", - " CO F 1910 Ruth 68\n", - " CT F 1910 Elizabeth 86\n", - " DC F 1910 Mary 80\n", - " FL F 1910 Annie 101\n", - " FL F 
1910 Alma 39\n", + "state gender year name number\n", + " AL F 1910 Lillian 99\n", + " AL F 1910 Ruby 204\n", + " AL F 1910 Helen 76\n", + " AL F 1910 Eunice 41\n", + " AR F 1910 Dora 42\n", + " CA F 1910 Edna 62\n", + " CA F 1910 Helen 239\n", + " CO F 1910 Alice 46\n", + " FL F 1910 Willie 71\n", + " FL F 1910 Thelma 65\n", "...\n", "\n", "[5552452 rows x 5 columns]" ] }, - "execution_count": 13, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 9 seconds of slot time. [Job bigframes-dev:US.job_IEjIRaqt2w-_pAttPw1VAVuRPxA7 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 5 seconds of slot time. [Job bigframes-dev:US.job_Mi-3m2AkEC1iPgWi7hmcWa1M1oIA details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 6 seconds of slot time. [Job bigframes-dev:US.job_j8pvY385WwIY7tGvhI7Yxc62aBwd details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ @@ -396,7 +329,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in 30 seconds of slot time. [Job bigframes-dev:US.ff90d507-bec8-4d24-abc3-0209ac28e21f details]\n", + " Query processed 171.4 MB in 41 seconds of slot time. [Job bigframes-dev:US.492b5260-9f44-495c-be09-2ae1324a986c details]\n", " " ], "text/plain": [ @@ -422,9 +355,7 @@ }, { "data": { - "text/html": [ - "✅ Completed. " - ], + "text/html": [], "text/plain": [ "" ] @@ -477,7 +408,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 3 seconds of slot time. 
[Job bigframes-dev:US.job_517TdI--FMoURkV7QQNMltY_-dZ7 details]\n", + " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_gsx0h2jHoOSYwqGKUS3lAYLf_qi3 details]\n", " " ], "text/plain": [ @@ -491,7 +422,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_rCeYkeBPqmTKNFWFgwXjz5Ed8uWI details]\n", + " Query processed 88.8 MB in 3 seconds of slot time. [Job bigframes-dev:US.job_1VivAJ2InPdg5RXjWfvAJ1B0oxO3 details]\n", " " ], "text/plain": [ @@ -504,7 +435,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3e630b1a56c740e781772ca5f5c7267a", + "model_id": "7d82208e7e5e40dd9dbf64c4c561cab3", "version_major": 2, "version_minor": 1 }, @@ -606,7 +537,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 11 seconds of slot time. [Job bigframes-dev:US.job_XwXTDb6gWVkuyIFMeWA0waE33bSg details]\n", + " Query processed 215.9 MB in 10 seconds of slot time. [Job bigframes-dev:US.job_cmNyG5sJ1IDCyFINx7teExQOZ6UQ details]\n", " " ], "text/plain": [ @@ -620,7 +551,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 7 seconds of slot time. [Job bigframes-dev:US.job_bCW0LYK5_PzyyGPf9OAg4YfNMG1C details]\n", + " Query processed 215.9 MB in 8 seconds of slot time. 
[Job bigframes-dev:US.job_aQvP3Sn04Ss4flSLaLhm0sKzFvrd details]\n", " " ], "text/plain": [ @@ -640,12 +571,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a6a2b19314b04283a5a66ca9d66eb771", + "model_id": "52d11291ba1d42e6b544acbd86eef6cf", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -755,12 +686,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "beb362548a6b4fd4a163569edd6f1a90", + "model_id": "32c61c84740d45a0ac37202a76c7c14e", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -804,7 +735,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 19 seconds of slot time.\n", + " Query processed 85.9 kB in 21 seconds of slot time.\n", " " ], "text/plain": [ @@ -826,9 +757,7 @@ }, { "data": { - "text/html": [ - "✅ Completed. " - ], + "text/html": [], "text/plain": [ "" ] @@ -838,9 +767,7 @@ }, { "data": { - "text/html": [ - "✅ Completed. " - ], + "text/html": [], "text/plain": [ "" ] @@ -865,7 +792,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "02a46cf499b442d4bfe03934195e67df", + "model_id": "9d60a47296214553bb10c434b5ee8330", "version_major": 2, "version_minor": 1 }, @@ -912,17 +839,17 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", - " 03.10.2018\n", - " H01L 21/20\n", - " <NA>\n", - " 18166536.5\n", - " 16.02.2016\n", + " 29.08.018\n", + " E04H 6/12\n", " <NA>\n", - " Scheider, Sascha et al\n", - " EV Group E. 
Thallner GmbH\n", - " Kurz, Florian\n", - " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", - " EP 3 382 744 A1\n", + " 18157874.1\n", + " 21.02.2018\n", + " 22.02.2017\n", + " Liedtke & Partner Patentanw√§lte\n", + " SHB Hebezeugbau GmbH\n", + " VOLGER, Alexander\n", + " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", + " EP 3 366 869 A1\n", " \n", " \n", " 1\n", @@ -931,16 +858,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " A01K 31/00\n", + " H05B 6/12\n", " <NA>\n", - " 18171005.4\n", - " 05.02.2015\n", - " 05.02.2014\n", - " Stork Bamberger Patentanw√§lte\n", - " Linco Food Systems A/S\n", - " Thrane, Uffe\n", - " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", - " EP 3 381 276 A1\n", + " 18165514.3\n", + " 03.04.2018\n", + " 30.03.2017\n", + " <NA>\n", + " BSH Hausger√§te GmbH\n", + " Acero Acero, Jesus\n", + " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", + " EP 3 383 141 A2\n", " \n", " \n", " 2\n", @@ -949,16 +876,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " G06F 11/30\n", + " H01L 21/20\n", " <NA>\n", - " 18157347.8\n", - " 19.02.2018\n", - " 31.03.2017\n", - " Hoffmann Eitle\n", - " FUJITSU LIMITED\n", - " Kukihara, Kensuke\n", - " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", - " EP 3 382 553 A1\n", + " 18166536.5\n", + " 16.02.2016\n", + " <NA>\n", + " Scheider, Sascha et al\n", + " EV Group E. 
Thallner GmbH\n", + " Kurz, Florian\n", + " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", + " EP 3 382 744 A1\n", " \n", " \n", " 3\n", @@ -967,16 +894,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " H05B 6/12\n", - " <NA>\n", - " 18165514.3\n", - " 03.04.2018\n", - " 30.03.2017\n", + " G06F 11/30\n", " <NA>\n", - " BSH Hausger√§te GmbH\n", - " Acero Acero, Jesus\n", - " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", - " EP 3 383 141 A2\n", + " 18157347.8\n", + " 19.02.2018\n", + " 31.03.2017\n", + " Hoffmann Eitle\n", + " FUJITSU LIMITED\n", + " Kukihara, Kensuke\n", + " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", + " EP 3 382 553 A1\n", " \n", " \n", " 4\n", @@ -984,17 +911,17 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", - " 29.08.018\n", - " E04H 6/12\n", + " 03.10.2018\n", + " A01K 31/00\n", " <NA>\n", - " 18157874.1\n", - " 21.02.2018\n", - " 22.02.2017\n", - " Liedtke & Partner Patentanw√§lte\n", - " SHB Hebezeugbau GmbH\n", - " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", - " EP 3 366 869 A1\n", + " 18171005.4\n", + " 05.02.2015\n", + " 05.02.2014\n", + " Stork Bamberger Patentanw√§lte\n", + " Linco Food Systems A/S\n", + " Thrane, Uffe\n", + " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", + " EP 3 381 276 A1\n", " \n", " \n", "\n", @@ -1017,32 +944,32 @@ "4 gs://gcs-public-data--labeled-patents/espacene... 
EU DE \n", "\n", " publication_date class_international class_us application_number \\\n", - "0 03.10.2018 H01L 21/20 18166536.5 \n", - "1 03.10.2018 A01K 31/00 18171005.4 \n", - "2 03.10.2018 G06F 11/30 18157347.8 \n", - "3 03.10.2018 H05B 6/12 18165514.3 \n", - "4 29.08.018 E04H 6/12 18157874.1 \n", + "0 29.08.018 E04H 6/12 18157874.1 \n", + "1 03.10.2018 H05B 6/12 18165514.3 \n", + "2 03.10.2018 H01L 21/20 18166536.5 \n", + "3 03.10.2018 G06F 11/30 18157347.8 \n", + "4 03.10.2018 A01K 31/00 18171005.4 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 16.02.2016 Scheider, Sascha et al \n", - "1 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", - "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "3 03.04.2018 30.03.2017 \n", - "4 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", + "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", + "1 03.04.2018 30.03.2017 \n", + "2 16.02.2016 Scheider, Sascha et al \n", + "3 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "4 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", "\n", " applicant_line_1 inventor_line_1 \\\n", - "0 EV Group E. Thallner GmbH Kurz, Florian \n", - "1 Linco Food Systems A/S Thrane, Uffe \n", - "2 FUJITSU LIMITED Kukihara, Kensuke \n", - "3 BSH Hausger√§te GmbH Acero Acero, Jesus \n", - "4 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "1 BSH Hausger√§te GmbH Acero Acero, Jesus \n", + "2 EV Group E. Thallner GmbH Kurz, Florian \n", + "3 FUJITSU LIMITED Kukihara, Kensuke \n", + "4 Linco Food Systems A/S Thrane, Uffe \n", "\n", " title_line_1 number \n", - "0 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "1 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", - "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... 
EP 3 382 553 A1 \n", - "3 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", - "4 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", + "0 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", + "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", + "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "3 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "4 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", "\n", "[5 rows x 15 columns]" ] diff --git a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb index 501bfc88d31..3dc0eabf5a1 100644 --- a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb +++ b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb @@ -991,7 +991,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv", + "display_name": "venv (3.10.14)", "language": "python", "name": "python3" }, @@ -1005,7 +1005,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/noxfile.py b/noxfile.py index 7626a49adfe..00ada18a469 100644 --- a/noxfile.py +++ b/noxfile.py @@ -67,14 +67,10 @@ UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"] UNIT_TEST_STANDARD_DEPENDENCIES = [ "mock", - "asyncmock", PYTEST_VERSION, - "pytest-asyncio", "pytest-cov", - "pytest-mock", "pytest-timeout", ] -UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = ["tests"] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { @@ -106,8 +102,6 @@ SYSTEM_TEST_EXTERNAL_DEPENDENCIES = [ "google-cloud-bigquery", ] -SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_DEPENDENCIES: List[str] = [] SYSTEM_TEST_EXTRAS: List[str] = ["tests"] SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { # Make sure we leave some versions without 
"extras" so we know those @@ -206,20 +200,20 @@ def lint_setup_py(session): def install_unittest_dependencies(session, install_test_extra, *constraints): - standard_deps = UNIT_TEST_STANDARD_DEPENDENCIES + UNIT_TEST_DEPENDENCIES - session.install(*standard_deps, *constraints) - - if UNIT_TEST_LOCAL_DEPENDENCIES: - session.install(*UNIT_TEST_LOCAL_DEPENDENCIES, *constraints) - + extras = [] if install_test_extra: if session.python in UNIT_TEST_EXTRAS_BY_PYTHON: extras = UNIT_TEST_EXTRAS_BY_PYTHON[session.python] else: extras = UNIT_TEST_EXTRAS - session.install("-e", f".[{','.join(extras)}]", *constraints) - else: - session.install("-e", ".", *constraints) + + session.install( + *UNIT_TEST_STANDARD_DEPENDENCIES, + *UNIT_TEST_DEPENDENCIES, + "-e", + f".[{','.join(extras)}]" if extras else ".", + *constraints, + ) def run_unit(session, install_test_extra): @@ -308,22 +302,6 @@ def mypy(session): def install_systemtest_dependencies(session, install_test_extra, *constraints): - # Use pre-release gRPC for system tests. - # Exclude version 1.49.0rc1 which has a known issue. 
- # See https://github.com/grpc/grpc/pull/30642 - session.install("--pre", "grpcio!=1.49.0rc1") - - session.install(*SYSTEM_TEST_STANDARD_DEPENDENCIES, *constraints) - - if SYSTEM_TEST_EXTERNAL_DEPENDENCIES: - session.install(*SYSTEM_TEST_EXTERNAL_DEPENDENCIES, *constraints) - - if SYSTEM_TEST_LOCAL_DEPENDENCIES: - session.install("-e", *SYSTEM_TEST_LOCAL_DEPENDENCIES, *constraints) - - if SYSTEM_TEST_DEPENDENCIES: - session.install("-e", *SYSTEM_TEST_DEPENDENCIES, *constraints) - if install_test_extra and SYSTEM_TEST_EXTRAS_BY_PYTHON: extras = SYSTEM_TEST_EXTRAS_BY_PYTHON.get(session.python, []) elif install_test_extra and SYSTEM_TEST_EXTRAS: @@ -331,10 +309,19 @@ def install_systemtest_dependencies(session, install_test_extra, *constraints): else: extras = [] - if extras: - session.install("-e", f".[{','.join(extras)}]", *constraints) - else: - session.install("-e", ".", *constraints) + # Use pre-release gRPC for system tests. + # Exclude version 1.49.0rc1 which has a known issue. + # See https://github.com/grpc/grpc/pull/30642 + + session.install( + "--pre", + "grpcio!=1.49.0rc1", + *SYSTEM_TEST_STANDARD_DEPENDENCIES, + *SYSTEM_TEST_EXTERNAL_DEPENDENCIES, + "-e", + f".[{','.join(extras)}]" if extras else ".", + *constraints, + ) def run_system( @@ -444,6 +431,8 @@ def doctest(session: nox.sessions.Session): "bigframes/testing", "--ignore", "bigframes/display/anywidget.py", + "--ignore", + "bigframes/bigquery/_operations/ai.py", ), test_folder="bigframes", check_cov=True, diff --git a/setup.py b/setup.py index 720687952c4..090b1035364 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,6 @@ "requests >=2.27.1", "shapely >=1.8.5", "tabulate >=0.9", - "ipywidgets >=7.7.1", "humanize >=4.6.0", "matplotlib >=3.7.1", "db-dtypes >=1.4.2", diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index 1695a4806b8..4810d6461b8 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -11,7 +11,6 @@ requests==2.32.3 tornado==6.3.3 
absl-py==1.4.0 debugpy==1.6.6 -ipywidgets==7.7.1 matplotlib==3.7.1 psutil==5.9.5 seaborn==0.13.1 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 9865d3b364a..8e4ade29c74 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -22,7 +22,6 @@ requests==2.27.1 scikit-learn==1.2.2 shapely==1.8.5 tabulate==0.9 -ipywidgets==7.7.1 humanize==4.6.0 matplotlib==3.7.1 db-dtypes==1.4.2 diff --git a/tests/system/large/bigquery/test_ml.py b/tests/system/large/bigquery/test_ml.py new file mode 100644 index 00000000000..22011199feb --- /dev/null +++ b/tests/system/large/bigquery/test_ml.py @@ -0,0 +1,64 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes.bigquery.ml as ml +import bigframes.pandas as bpd + + +@pytest.fixture(scope="session") +def embedding_model(bq_connection, dataset_id): + model_name = f"{dataset_id}.embedding_model" + return ml.create_model( + model_name=model_name, + options={"endpoint": "gemini-embedding-001"}, + connection_name=bq_connection, + ) + + +def test_generate_embedding(embedding_model): + df = bpd.DataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + ] + } + ) + + result = ml.generate_embedding(embedding_model, df) + assert len(result) == 2 + assert "ml_generate_embedding_result" in result.columns + assert "ml_generate_embedding_status" in result.columns + + +def test_generate_embedding_with_options(embedding_model): + df = bpd.DataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + ] + } + ) + + result = ml.generate_embedding( + embedding_model, df, task_type="RETRIEVAL_DOCUMENT", output_dimensionality=256 + ) + assert len(result) == 2 + assert "ml_generate_embedding_result" in result.columns + assert "ml_generate_embedding_status" in result.columns + embedding = result["ml_generate_embedding_result"].to_pandas() + assert len(embedding[0]) == 256 diff --git a/tests/system/large/bigquery/test_table.py b/tests/system/large/bigquery/test_table.py new file mode 100644 index 00000000000..dd956b3a040 --- /dev/null +++ b/tests/system/large/bigquery/test_table.py @@ -0,0 +1,36 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.bigquery as bbq + + +def test_create_external_table(session, dataset_id, bq_connection): + table_name = f"{dataset_id}.test_object_table" + uri = "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*" + + # Create the external table + table = bbq.create_external_table( + table_name, + connection_name=bq_connection, + options={"object_metadata": "SIMPLE", "uris": [uri]}, + session=session, + ) + assert table is not None + + # Read the table to verify + import bigframes.pandas as bpd + + bf_df = bpd.read_gbq(table_name) + pd_df = bf_df.to_pandas() + assert len(pd_df) > 0 diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index 5ada4fabb0e..102d6083822 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -14,12 +14,14 @@ from unittest import mock -import IPython.display import pandas as pd +import pytest import bigframes import bigframes.pandas as bpd +idisplay = pytest.importorskip("IPython.display") + def test_blob_create_from_uri_str( bq_connection: str, session: bigframes.Session, images_uris @@ -99,14 +101,14 @@ def test_blob_create_read_gbq_object_table( def test_display_images(monkeypatch, images_mm_df: bpd.DataFrame): mock_display = mock.Mock() - monkeypatch.setattr(IPython.display, "display", mock_display) + monkeypatch.setattr(idisplay, "display", mock_display) images_mm_df["blob_col"].blob.display() for call in mock_display.call_args_list: args, _ = call arg = args[0] - assert isinstance(arg, IPython.display.Image) + assert isinstance(arg, idisplay.Image) def test_display_nulls( @@ -117,7 +119,7 @@ def test_display_nulls( uri_series = bpd.Series([None, None, None], dtype="string", session=session) blob_series = uri_series.str.to_blob(connection=bq_connection) mock_display = mock.Mock() - monkeypatch.setattr(IPython.display, "display", mock_display) + 
monkeypatch.setattr(idisplay, "display", mock_display) blob_series.blob.display() diff --git a/tests/system/small/session/test_session_logging.py b/tests/system/small/session/test_session_logging.py new file mode 100644 index 00000000000..b9515823093 --- /dev/null +++ b/tests/system/small/session/test_session_logging.py @@ -0,0 +1,40 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +from bigframes.core.logging import data_types +import bigframes.session._io.bigquery as bq_io + + +def test_data_type_logging(scalars_df_index): + s = scalars_df_index["int64_col"] + 1.5 + + # We want to check the job_config passed to _query_and_wait_bigframes + with mock.patch( + "bigframes.session._io.bigquery.start_query_with_client", + wraps=bq_io.start_query_with_client, + ) as mock_query: + s.to_pandas() + + # Fetch job labels sent to the BQ client and verify their values + assert mock_query.called + call_args = mock_query.call_args + job_config = call_args.kwargs.get("job_config") + assert job_config is not None + job_labels = job_config.labels + assert "bigframes-dtypes" in job_labels + assert job_labels["bigframes-dtypes"] == data_types.encode_type_refs( + s._block._expr.node + ) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 698f531d57b..0501df3f8c9 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -352,7 +352,7 @@ def 
test_read_gbq_w_primary_keys_table( pd.testing.assert_frame_equal(result, sorted_result) # Verify that we're working from a snapshot rather than a copy of the table. - assert "FOR SYSTEM_TIME AS OF TIMESTAMP" in df.sql + assert "FOR SYSTEM_TIME AS OF" in df.sql def test_read_gbq_w_primary_keys_table_and_filters( diff --git a/tests/unit/_config/test_experiment_options.py b/tests/unit/_config/test_experiment_options.py index deeee2e46a7..0e69dfe36d7 100644 --- a/tests/unit/_config/test_experiment_options.py +++ b/tests/unit/_config/test_experiment_options.py @@ -46,3 +46,18 @@ def test_ai_operators_set_true_shows_warning(): options.ai_operators = True assert options.ai_operators is True + + +def test_sql_compiler_default_stable(): + options = experiment_options.ExperimentOptions() + + assert options.sql_compiler == "stable" + + +def test_sql_compiler_set_experimental_shows_warning(): + options = experiment_options.ExperimentOptions() + + with pytest.warns(FutureWarning): + options.sql_compiler = "experimental" + + assert options.sql_compiler == "experimental" diff --git a/tests/unit/bigquery/test_ml.py b/tests/unit/bigquery/test_ml.py index 96b97d68fe3..fd774691528 100644 --- a/tests/unit/bigquery/test_ml.py +++ b/tests/unit/bigquery/test_ml.py @@ -163,3 +163,69 @@ def test_transform_with_pandas_dataframe(read_pandas_mock, read_gbq_query_mock): assert "ML.TRANSFORM" in generated_sql assert f"MODEL `{MODEL_NAME}`" in generated_sql assert "(SELECT * FROM `pandas_df`)" in generated_sql + + +@mock.patch("bigframes.pandas.read_gbq_query") +@mock.patch("bigframes.pandas.read_pandas") +def test_generate_text_with_pandas_dataframe(read_pandas_mock, read_gbq_query_mock): + df = pd.DataFrame({"col1": [1, 2, 3]}) + read_pandas_mock.return_value._to_sql_query.return_value = ( + "SELECT * FROM `pandas_df`", + [], + [], + ) + ml_ops.generate_text( + MODEL_SERIES, + input_=df, + temperature=0.5, + max_output_tokens=128, + top_k=20, + top_p=0.9, + flatten_json_output=True, + 
stop_sequences=["a", "b"], + ground_with_google_search=True, + request_type="TYPE", + ) + read_pandas_mock.assert_called_once() + read_gbq_query_mock.assert_called_once() + generated_sql = read_gbq_query_mock.call_args[0][0] + assert "ML.GENERATE_TEXT" in generated_sql + assert f"MODEL `{MODEL_NAME}`" in generated_sql + assert "(SELECT * FROM `pandas_df`)" in generated_sql + assert "STRUCT(0.5 AS temperature" in generated_sql + assert "128 AS max_output_tokens" in generated_sql + assert "20 AS top_k" in generated_sql + assert "0.9 AS top_p" in generated_sql + assert "true AS flatten_json_output" in generated_sql + assert "['a', 'b'] AS stop_sequences" in generated_sql + assert "true AS ground_with_google_search" in generated_sql + assert "'TYPE' AS request_type" in generated_sql + + +@mock.patch("bigframes.pandas.read_gbq_query") +@mock.patch("bigframes.pandas.read_pandas") +def test_generate_embedding_with_pandas_dataframe( + read_pandas_mock, read_gbq_query_mock +): + df = pd.DataFrame({"col1": [1, 2, 3]}) + read_pandas_mock.return_value._to_sql_query.return_value = ( + "SELECT * FROM `pandas_df`", + [], + [], + ) + ml_ops.generate_embedding( + MODEL_SERIES, + input_=df, + flatten_json_output=True, + task_type="RETRIEVAL_DOCUMENT", + output_dimensionality=256, + ) + read_pandas_mock.assert_called_once() + read_gbq_query_mock.assert_called_once() + generated_sql = read_gbq_query_mock.call_args[0][0] + assert "ML.GENERATE_EMBEDDING" in generated_sql + assert f"MODEL `{MODEL_NAME}`" in generated_sql + assert "(SELECT * FROM `pandas_df`)" in generated_sql + assert "true AS flatten_json_output" in generated_sql + assert "'RETRIEVAL_DOCUMENT' AS task_type" in generated_sql + assert "256 AS output_dimensionality" in generated_sql diff --git a/tests/unit/bigquery/test_table.py b/tests/unit/bigquery/test_table.py new file mode 100644 index 00000000000..badce5e5e23 --- /dev/null +++ b/tests/unit/bigquery/test_table.py @@ -0,0 +1,95 @@ +# Copyright 2026 Google LLC +# +# 
Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import pytest + +import bigframes.bigquery +import bigframes.core.sql.table +import bigframes.session + + +@pytest.fixture +def mock_session(): +    return mock.create_autospec(spec=bigframes.session.Session) + + +def test_create_external_table_ddl(): +    sql = bigframes.core.sql.table.create_external_table_ddl( +        "my-project.my_dataset.my_table", +        columns={"col1": "INT64", "col2": "STRING"}, +        options={"format": "CSV", "uris": ["gs://bucket/path*"]}, +    ) +    expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" +    assert sql == expected + + +def test_create_external_table_ddl_replace(): +    sql = bigframes.core.sql.table.create_external_table_ddl( +        "my-project.my_dataset.my_table", +        replace=True, +        columns={"col1": "INT64", "col2": "STRING"}, +        options={"format": "CSV", "uris": ["gs://bucket/path*"]}, +    ) +    expected = "CREATE OR REPLACE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" +    assert sql == expected + + +def test_create_external_table_ddl_if_not_exists(): +    sql = bigframes.core.sql.table.create_external_table_ddl( +        "my-project.my_dataset.my_table", +        if_not_exists=True, +        columns={"col1": "INT64", "col2": "STRING"}, +        options={"format": "CSV", "uris": ["gs://bucket/path*"]}, +    ) +    expected = "CREATE EXTERNAL TABLE IF NOT
EXISTS my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_create_external_table_ddl_partition_columns(): + sql = bigframes.core.sql.table.create_external_table_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + partition_columns={"part1": "DATE", "part2": "STRING"}, + options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) WITH PARTITION COLUMNS (part1 DATE, part2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_create_external_table_ddl_connection(): + sql = bigframes.core.sql.table.create_external_table_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + connection_name="my-connection", + options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) WITH CONNECTION `my-connection` OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +@mock.patch("bigframes.bigquery._operations.table._get_table_metadata") +def test_create_external_table(get_table_metadata_mock, mock_session): + bigframes.bigquery.create_external_table( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + session=mock_session, + ) + mock_session.read_gbq_query.assert_called_once() + generated_sql = mock_session.read_gbq_query.call_args[0][0] + expected = "CREATE EXTERNAL TABLE my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (format = 'CSV', uris = ['gs://bucket/path*'])" + assert generated_sql == expected + get_table_metadata_mock.assert_called_once() diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_json/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_json/out.sql index 2defc2e72b0..4ffaf7256a1 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_json/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_json/out.sql @@ -13,7 +13,7 @@ WITH `bfcte_0` AS ( PARSE_JSON(CAST(`bool_col` AS STRING)) AS `bfcol_6`, PARSE_JSON(`string_col`) AS `bfcol_7`, PARSE_JSON(CAST(`bool_col` AS STRING)) AS `bfcol_8`, - PARSE_JSON_IN_SAFE(`string_col`) AS `bfcol_9` + SAFE.PARSE_JSON(`string_col`) AS `bfcol_9` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql index a0d7db2b1a2..db1da10086f 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql @@ -14,15 +14,6 @@ WITH `bfcte_1` AS ( *, 0 AS `bfcol_8` FROM `bfcte_3` -), `bfcte_6` AS ( - SELECT - `rowindex` AS `bfcol_9`, - `rowindex` AS `bfcol_10`, - `int64_col` AS `bfcol_11`, - `string_col` AS `bfcol_12`, - `bfcol_8` AS `bfcol_13`, - `bfcol_7` AS `bfcol_14` - FROM `bfcte_5` ), `bfcte_0` AS ( SELECT `int64_col`, @@ -39,36 +30,36 @@ WITH `bfcte_1` AS ( *, 1 AS `bfcol_23` FROM `bfcte_2` -), `bfcte_7` AS ( - SELECT - `rowindex` AS `bfcol_24`, - `rowindex` AS `bfcol_25`, - `int64_col` AS `bfcol_26`, - `string_col` AS `bfcol_27`, - `bfcol_23` AS `bfcol_28`, - `bfcol_22` AS `bfcol_29` - FROM `bfcte_4` -), `bfcte_8` AS ( +), `bfcte_6` AS ( SELECT - * + `bfcol_9` AS `bfcol_30`, + `bfcol_10` AS `bfcol_31`, + `bfcol_11` AS `bfcol_32`, + `bfcol_12` AS `bfcol_33`, + `bfcol_13` AS `bfcol_34`, + `bfcol_14` AS `bfcol_35` FROM ( - 
SELECT - `bfcol_9` AS `bfcol_30`, - `bfcol_10` AS `bfcol_31`, - `bfcol_11` AS `bfcol_32`, - `bfcol_12` AS `bfcol_33`, - `bfcol_13` AS `bfcol_34`, - `bfcol_14` AS `bfcol_35` - FROM `bfcte_6` + ( + SELECT + `rowindex` AS `bfcol_9`, + `rowindex` AS `bfcol_10`, + `int64_col` AS `bfcol_11`, + `string_col` AS `bfcol_12`, + `bfcol_8` AS `bfcol_13`, + `bfcol_7` AS `bfcol_14` + FROM `bfcte_5` + ) UNION ALL - SELECT - `bfcol_24` AS `bfcol_30`, - `bfcol_25` AS `bfcol_31`, - `bfcol_26` AS `bfcol_32`, - `bfcol_27` AS `bfcol_33`, - `bfcol_28` AS `bfcol_34`, - `bfcol_29` AS `bfcol_35` - FROM `bfcte_7` + ( + SELECT + `rowindex` AS `bfcol_24`, + `rowindex` AS `bfcol_25`, + `int64_col` AS `bfcol_26`, + `string_col` AS `bfcol_27`, + `bfcol_23` AS `bfcol_28`, + `bfcol_22` AS `bfcol_29` + FROM `bfcte_4` + ) ) ) SELECT @@ -76,7 +67,7 @@ SELECT `bfcol_31` AS `rowindex_1`, `bfcol_32` AS `int64_col`, `bfcol_33` AS `string_col` -FROM `bfcte_8` +FROM `bfcte_6` ORDER BY `bfcol_34` ASC NULLS LAST, `bfcol_35` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat_filter_sorted/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat_filter_sorted/out.sql index 8e65381fef1..65b0f9abd5e 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat_filter_sorted/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat_filter_sorted/out.sql @@ -13,13 +13,6 @@ WITH `bfcte_2` AS ( *, 0 AS `bfcol_5` FROM `bfcte_6` -), `bfcte_13` AS ( - SELECT - `float64_col` AS `bfcol_6`, - `int64_col` AS `bfcol_7`, - `bfcol_5` AS `bfcol_8`, - `bfcol_4` AS `bfcol_9` - FROM `bfcte_10` ), `bfcte_0` AS ( SELECT `bool_col`, @@ -42,13 +35,6 @@ WITH `bfcte_2` AS ( *, 1 AS `bfcol_16` FROM `bfcte_8` -), `bfcte_14` AS ( - SELECT - `float64_col` AS `bfcol_17`, - `int64_too` AS `bfcol_18`, - `bfcol_16` AS `bfcol_19`, - `bfcol_15` AS 
`bfcol_20` - FROM `bfcte_12` ), `bfcte_1` AS ( SELECT `float64_col`, @@ -64,19 +50,6 @@ WITH `bfcte_2` AS ( *, 2 AS `bfcol_26` FROM `bfcte_5` -), `bfcte_15` AS ( - SELECT - `float64_col` AS `bfcol_27`, - `int64_col` AS `bfcol_28`, - `bfcol_26` AS `bfcol_29`, - `bfcol_25` AS `bfcol_30` - FROM `bfcte_9` -), `bfcte_0` AS ( - SELECT - `bool_col`, - `float64_col`, - `int64_too` - FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_3` AS ( SELECT * @@ -93,50 +66,54 @@ WITH `bfcte_2` AS ( *, 3 AS `bfcol_37` FROM `bfcte_7` -), `bfcte_16` AS ( - SELECT - `float64_col` AS `bfcol_38`, - `int64_too` AS `bfcol_39`, - `bfcol_37` AS `bfcol_40`, - `bfcol_36` AS `bfcol_41` - FROM `bfcte_11` -), `bfcte_17` AS ( +), `bfcte_13` AS ( SELECT - * + `bfcol_6` AS `bfcol_42`, + `bfcol_7` AS `bfcol_43`, + `bfcol_8` AS `bfcol_44`, + `bfcol_9` AS `bfcol_45` FROM ( - SELECT - `bfcol_6` AS `bfcol_42`, - `bfcol_7` AS `bfcol_43`, - `bfcol_8` AS `bfcol_44`, - `bfcol_9` AS `bfcol_45` - FROM `bfcte_13` + ( + SELECT + `float64_col` AS `bfcol_6`, + `int64_col` AS `bfcol_7`, + `bfcol_5` AS `bfcol_8`, + `bfcol_4` AS `bfcol_9` + FROM `bfcte_10` + ) UNION ALL - SELECT - `bfcol_17` AS `bfcol_42`, - `bfcol_18` AS `bfcol_43`, - `bfcol_19` AS `bfcol_44`, - `bfcol_20` AS `bfcol_45` - FROM `bfcte_14` + ( + SELECT + `float64_col` AS `bfcol_17`, + `int64_too` AS `bfcol_18`, + `bfcol_16` AS `bfcol_19`, + `bfcol_15` AS `bfcol_20` + FROM `bfcte_12` + ) UNION ALL - SELECT - `bfcol_27` AS `bfcol_42`, - `bfcol_28` AS `bfcol_43`, - `bfcol_29` AS `bfcol_44`, - `bfcol_30` AS `bfcol_45` - FROM `bfcte_15` + ( + SELECT + `float64_col` AS `bfcol_27`, + `int64_col` AS `bfcol_28`, + `bfcol_26` AS `bfcol_29`, + `bfcol_25` AS `bfcol_30` + FROM `bfcte_9` + ) UNION ALL - SELECT - `bfcol_38` AS `bfcol_42`, - `bfcol_39` AS `bfcol_43`, - `bfcol_40` AS `bfcol_44`, - `bfcol_41` AS `bfcol_45` - FROM `bfcte_16` + ( + SELECT + `float64_col` AS `bfcol_38`, + `int64_too` AS `bfcol_39`, + `bfcol_37` AS `bfcol_40`, + `bfcol_36` AS 
`bfcol_41` + FROM `bfcte_11` + ) ) ) SELECT `bfcol_42` AS `float64_col`, `bfcol_43` AS `int64_col` -FROM `bfcte_17` +FROM `bfcte_13` ORDER BY `bfcol_44` ASC NULLS LAST, `bfcol_45` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py index c5fabd99e6f..03a8b39d9a0 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - import numpy as np import pandas as pd import pytest @@ -36,7 +34,6 @@ def test_compile_readlocal_w_structs_df( compiler_session_w_nested_structs_types: bigframes.Session, snapshot, ): - # TODO(b/427306734): Check why the output is different from the expected output. bf_df = bpd.DataFrame( nested_structs_pandas_df, session=compiler_session_w_nested_structs_types ) @@ -66,8 +63,6 @@ def test_compile_readlocal_w_json_df( def test_compile_readlocal_w_special_values( compiler_session: bigframes.Session, snapshot ): - if sys.version_info < (3, 12): - pytest.skip("Skipping test due to inconsistent SQL formatting") df = pd.DataFrame( { "col_none": [None, 1, 2], diff --git a/tests/unit/core/sql/snapshots/test_ml/test_evaluate_model_with_options/evaluate_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_evaluate_model_with_options/evaluate_model_with_options.sql index 01eb4d37819..848c36907b9 100644 --- a/tests/unit/core/sql/snapshots/test_ml/test_evaluate_model_with_options/evaluate_model_with_options.sql +++ b/tests/unit/core/sql/snapshots/test_ml/test_evaluate_model_with_options/evaluate_model_with_options.sql @@ -1 +1 @@ -SELECT * FROM ML.EVALUATE(MODEL `my_model`, STRUCT(False AS perform_aggregation, 10 AS horizon, 0.95 AS confidence_level)) +SELECT * FROM ML.EVALUATE(MODEL `my_model`, STRUCT(false AS 
perform_aggregation, 10 AS horizon, 0.95 AS confidence_level)) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_generate_embedding_model_basic/generate_embedding_model_basic.sql b/tests/unit/core/sql/snapshots/test_ml/test_generate_embedding_model_basic/generate_embedding_model_basic.sql new file mode 100644 index 00000000000..7294f1655f7 --- /dev/null +++ b/tests/unit/core/sql/snapshots/test_ml/test_generate_embedding_model_basic/generate_embedding_model_basic.sql @@ -0,0 +1 @@ +SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `my_project.my_dataset.my_model`, (SELECT * FROM new_data)) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_generate_embedding_model_with_options/generate_embedding_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_generate_embedding_model_with_options/generate_embedding_model_with_options.sql new file mode 100644 index 00000000000..d07e1c1e15e --- /dev/null +++ b/tests/unit/core/sql/snapshots/test_ml/test_generate_embedding_model_with_options/generate_embedding_model_with_options.sql @@ -0,0 +1 @@ +SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `my_project.my_dataset.my_model`, (SELECT * FROM new_data), STRUCT(true AS flatten_json_output, 'RETRIEVAL_DOCUMENT' AS task_type, 256 AS output_dimensionality)) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_generate_text_model_basic/generate_text_model_basic.sql b/tests/unit/core/sql/snapshots/test_ml/test_generate_text_model_basic/generate_text_model_basic.sql new file mode 100644 index 00000000000..9d986876448 --- /dev/null +++ b/tests/unit/core/sql/snapshots/test_ml/test_generate_text_model_basic/generate_text_model_basic.sql @@ -0,0 +1 @@ +SELECT * FROM ML.GENERATE_TEXT(MODEL `my_project.my_dataset.my_model`, (SELECT * FROM new_data)) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_generate_text_model_with_options/generate_text_model_with_options.sql 
b/tests/unit/core/sql/snapshots/test_ml/test_generate_text_model_with_options/generate_text_model_with_options.sql new file mode 100644 index 00000000000..7839ff3fbdd --- /dev/null +++ b/tests/unit/core/sql/snapshots/test_ml/test_generate_text_model_with_options/generate_text_model_with_options.sql @@ -0,0 +1 @@ +SELECT * FROM ML.GENERATE_TEXT(MODEL `my_project.my_dataset.my_model`, (SELECT * FROM new_data), STRUCT(0.5 AS temperature, 128 AS max_output_tokens, 20 AS top_k, 0.9 AS top_p, true AS flatten_json_output, ['a', 'b'] AS stop_sequences, true AS ground_with_google_search, 'TYPE' AS request_type)) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_global_explain_model_with_options/global_explain_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_global_explain_model_with_options/global_explain_model_with_options.sql index 1a3baa0c13b..b8d158acfc7 100644 --- a/tests/unit/core/sql/snapshots/test_ml/test_global_explain_model_with_options/global_explain_model_with_options.sql +++ b/tests/unit/core/sql/snapshots/test_ml/test_global_explain_model_with_options/global_explain_model_with_options.sql @@ -1 +1 @@ -SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL `my_model`, STRUCT(True AS class_level_explain)) +SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL `my_model`, STRUCT(true AS class_level_explain)) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_predict_model_with_options/predict_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_predict_model_with_options/predict_model_with_options.sql index 96c8074e4c1..f320d47fcf4 100644 --- a/tests/unit/core/sql/snapshots/test_ml/test_predict_model_with_options/predict_model_with_options.sql +++ b/tests/unit/core/sql/snapshots/test_ml/test_predict_model_with_options/predict_model_with_options.sql @@ -1 +1 @@ -SELECT * FROM ML.PREDICT(MODEL `my_model`, (SELECT * FROM new_data), STRUCT(True AS keep_original_columns)) +SELECT * FROM ML.PREDICT(MODEL `my_model`, (SELECT * FROM new_data), STRUCT(true AS 
keep_original_columns)) diff --git a/tests/unit/core/sql/test_ml.py b/tests/unit/core/sql/test_ml.py index 9721f42fee1..27b7a00ac21 100644 --- a/tests/unit/core/sql/test_ml.py +++ b/tests/unit/core/sql/test_ml.py @@ -177,3 +177,46 @@ def test_transform_model_basic(snapshot): table="SELECT * FROM new_data", ) snapshot.assert_match(sql, "transform_model_basic.sql") + + +def test_generate_text_model_basic(snapshot): + sql = bigframes.core.sql.ml.generate_text( + model_name="my_project.my_dataset.my_model", + table="SELECT * FROM new_data", + ) + snapshot.assert_match(sql, "generate_text_model_basic.sql") + + +def test_generate_text_model_with_options(snapshot): + sql = bigframes.core.sql.ml.generate_text( + model_name="my_project.my_dataset.my_model", + table="SELECT * FROM new_data", + temperature=0.5, + max_output_tokens=128, + top_k=20, + top_p=0.9, + flatten_json_output=True, + stop_sequences=["a", "b"], + ground_with_google_search=True, + request_type="TYPE", + ) + snapshot.assert_match(sql, "generate_text_model_with_options.sql") + + +def test_generate_embedding_model_basic(snapshot): + sql = bigframes.core.sql.ml.generate_embedding( + model_name="my_project.my_dataset.my_model", + table="SELECT * FROM new_data", + ) + snapshot.assert_match(sql, "generate_embedding_model_basic.sql") + + +def test_generate_embedding_model_with_options(snapshot): + sql = bigframes.core.sql.ml.generate_embedding( + model_name="my_project.my_dataset.my_model", + table="SELECT * FROM new_data", + flatten_json_output=True, + task_type="RETRIEVAL_DOCUMENT", + output_dimensionality=256, + ) + snapshot.assert_match(sql, "generate_embedding_model_with_options.sql") diff --git a/tests/unit/test_formatting_helpers.py b/tests/unit/test_formatting_helpers.py index 7a1cf1ab13a..ec681b36ab0 100644 --- a/tests/unit/test_formatting_helpers.py +++ b/tests/unit/test_formatting_helpers.py @@ -197,3 +197,18 @@ def test_render_bqquery_finished_event_plaintext(): assert "finished" in text assert "1.0 
kB processed" in text assert "Slot time: 2 seconds" in text + + +def test_get_job_url(): + job_id = "my-job-id" + location = "us-central1" + project_id = "my-project" + expected_url = ( + f"https://console.cloud.google.com/bigquery?project={project_id}" + f"&j=bq:{location}:{job_id}&page=queryresults" + ) + + actual_url = formatting_helpers.get_job_url( + job_id=job_id, location=location, project_id=project_id + ) + assert actual_url == expected_url diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 1e9ed79f825..a6862ee201c 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.33.0" +__version__ = "2.34.0" # {x-release-please-start-date} -__release_date__ = "2026-01-22" +__release_date__ = "2026-02-02" # {x-release-please-end}