diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index b20c6561ea..5a573694ae 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -212,11 +212,17 @@ def filter(self, predicate: ex.Expression): return arr.drop_columns(filter_ids) def order_by( - self, by: Sequence[OrderingExpression], is_total_order: bool = False + self, + by: Sequence[OrderingExpression], + is_total_order: bool = False, + stable: bool = True, ) -> ArrayValue: return ArrayValue( nodes.OrderByNode( - child=self.node, by=tuple(by), is_total_order=is_total_order + child=self.node, + by=tuple(by), + is_total_order=is_total_order, + stable=stable, ) ) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 239eedf6d3..c86e4e2baf 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -395,9 +395,10 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]: def order_by( self, by: typing.Sequence[ordering.OrderingExpression], + stable: bool = True, ) -> Block: return Block( - self._expr.order_by(by), + self._expr.order_by(by, stable=stable), index_columns=self.index_columns, column_labels=self.column_labels, index_labels=self.index.names, diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index f7438307f7..894f3bbb89 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -436,6 +436,7 @@ def sort_values( *, inplace: bool = False, ascending: bool = True, + kind: __builtins__.str | None = None, na_position: __builtins__.str = "last", ) -> Index: if na_position not in ["first", "last"]: @@ -448,7 +449,8 @@ def sort_values( else order.descending_over(column, na_last) for column in index_columns ] - return Index(self._block.order_by(ordering)) + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] + return Index(self._block.order_by(ordering, stable=is_stable)) def astype( self, diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 6071eaeaea..470c766c74 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -992,7 +992,8 @@ def remap_refs( @dataclasses.dataclass(frozen=True, eq=False) class OrderByNode(UnaryNode): by: Tuple[OrderingExpression, ...] - # This is an optimization, if true, can discard previous orderings. + stable: bool = True + # This is an optimization, if true, can discard previous orderings, even if doing a stable sort # might be a total ordering even if false is_total_order: bool = False diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 6741dfddad..382af92d66 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -71,7 +71,8 @@ def pull_up_order_inner( child_result, child_order = pull_up_order_inner(node.child) return child_result, child_order.with_reverse() elif isinstance(node, bigframes.core.nodes.OrderByNode): - if node.is_total_order: + # unstable sorts don't care about previous order, total orders override previous order + if (not node.stable) or node.is_total_order: new_node = remove_order(node.child) else: new_node, child_order = pull_up_order_inner(node.child) @@ -106,6 +107,10 @@ def pull_up_order_inner( ), ) ) + elif not node.stable: + new_order = bigframes.core.ordering.RowOrdering( + ordering_value_columns=tuple(new_by), + ) else: assert child_order new_order = child_order.with_ordering_columns(new_by) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 25cedda8f4..79412c0732 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2390,6 +2390,7 @@ def sort_index( *, ascending: bool = ..., inplace: Literal[False] = ..., + kind: str = ..., na_position: Literal["first", "last"] = ..., ) -> DataFrame: ... @@ -2400,6 +2401,7 @@ def sort_index( *, ascending: bool = ..., inplace: Literal[True] = ..., + kind: str = ..., na_position: Literal["first", "last"] = ..., ) -> None: ... @@ -2410,6 +2412,7 @@ def sort_index( axis: Union[int, str] = 0, ascending: bool = True, inplace: bool = False, + kind: str | None = None, na_position: Literal["first", "last"] = "last", ) -> Optional[DataFrame]: if utils.get_axis_number(axis) == 0: @@ -2423,7 +2426,8 @@ def sort_index( else order.descending_over(column, na_last) for column in index_columns ] - block = self._block.order_by(ordering) + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] + block = self._block.order_by(ordering, stable=is_stable) else: # axis=1 _, indexer = self.columns.sort_values( return_indexer=True, ascending=ascending, na_position=na_position # type: ignore @@ -2467,7 +2471,7 @@ def sort_values( *, inplace: bool = False, ascending: bool | typing.Sequence[bool] = True, - kind: str = "quicksort", + kind: str | None = None, na_position: typing.Literal["first", "last"] = "last", ) -> Optional[DataFrame]: if isinstance(by, (bigframes.series.Series, indexes.Index, DataFrame)): @@ -2499,7 +2503,8 @@ def sort_values( if is_ascending else order.descending_over(column_id, na_last) ) - block = self._block.order_by(ordering) + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] + block = self._block.order_by(ordering, stable=is_stable) if inplace: self._set_block(block) return None diff --git a/bigframes/series.py b/bigframes/series.py index 23799a0a43..4c37ba5ae5 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1801,19 +1801,21 @@ def sort_values( axis=0, inplace: bool = False, ascending=True, - kind: str = "quicksort", + kind: str | None = None, na_position: typing.Literal["first", "last"] = "last", ) -> Optional[Series]: if axis != 0 and axis != "index": raise ValueError(f"No axis named {axis} for object type Series") if na_position not in ["first", "last"]: raise ValueError("Param na_position must be one of 'first' or 'last'") + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] block = self._block.order_by( [ order.ascending_over(self._value_column, (na_position == "last")) if ascending else order.descending_over(self._value_column, (na_position == "last")) ], + stable=is_stable, ) if inplace: self._set_block(block) @@ -1823,19 +1825,37 @@ def sort_values( @typing.overload # type: ignore[override] def sort_index( - self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=... + self, + *, + axis=..., + inplace: Literal[False] = ..., + ascending=..., + kind: str = ..., + na_position=..., ) -> Series: ... @typing.overload def sort_index( - self, *, axis=0, inplace: Literal[True] = ..., ascending=..., na_position=... + self, + *, + axis=0, + inplace: Literal[True] = ..., + ascending=..., + kind: str = ..., + na_position=..., ) -> None: ... @validations.requires_index def sort_index( - self, *, axis=0, inplace: bool = False, ascending=True, na_position="last" + self, + *, + axis=0, + inplace: bool = False, + ascending=True, + kind: str | None = None, + na_position="last", ) -> Optional[Series]: # TODO(tbergeron): Support level parameter once multi-index introduced. if axis != 0 and axis != "index": @@ -1850,7 +1870,8 @@ def sort_index( else order.descending_over(column, na_last) for column in block.index_columns ] - block = block.order_by(ordering) + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] + block = block.order_by(ordering, stable=is_stable) if inplace: self._set_block(block) return None diff --git a/third_party/bigframes_vendored/constants.py b/third_party/bigframes_vendored/constants.py index 9705b19c90..fc698ea188 100644 --- a/third_party/bigframes_vendored/constants.py +++ b/third_party/bigframes_vendored/constants.py @@ -55,3 +55,5 @@ "_deferred", ] VALID_WRITE_ENGINES = typing.get_args(WriteEngineType) + +DEFAULT_SORT_KIND = "stable" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f04d9989dd..2d62add6a8 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2210,7 +2210,7 @@ def sort_values( *, inplace: bool = False, ascending: bool | Sequence[bool] = True, - kind: str = "quicksort", + kind: str | None = None, na_position: Literal["first", "last"] = "last", ): """Sort by the values along row axis. @@ -2296,7 +2296,7 @@ def sort_values( the by. inplace (bool, default False): If True, perform operation in-place. - kind (str, default 'quicksort'): + kind (str, default None): Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', 'heapsort', 'stable'. Ignored except when determining whether to sort stably. 'mergesort' or 'stable' will result in stable reorder. @@ -2320,6 +2320,7 @@ def sort_index( axis: str | int = 0, ascending: bool = True, inplace: bool = False, + kind: str | None = None, na_position: Literal["first", "last"] = "last", ): """Sort object by labels (along an axis). @@ -2332,6 +2333,10 @@ def sort_index( Sort ascending vs. descending. inplace (bool, default False): Whether to modify the DataFrame rather than creating a new one. + kind (str, default None): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to + sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first', 'last'}, default 'last'): Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. Not implemented for MultiIndex. diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index d21056a8cf..9933903b10 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -828,7 +828,11 @@ def nunique(self) -> int: raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sort_values( - self, *, ascending: bool = True, na_position: str = "last" + self, + *, + ascending: bool = True, + kind: str | None = None, + na_position: str = "last", ) -> Index: """ Return a sorted copy of the index. @@ -851,6 +855,10 @@ def sort_values( Args: ascending (bool, default True): Should the index values be sorted in an ascending order. + kind (str, default None): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to + sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first' or 'last'}, default 'last'): Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 775971ab35..fee1a42a37 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1458,7 +1458,7 @@ def sort_values( axis: Axis = 0, inplace: bool = False, ascending: bool | int | Sequence[bool] | Sequence[int] = True, - kind: str = "quicksort", + kind: str | None = None, na_position: str = "last", ): """ @@ -1535,7 +1535,7 @@ def sort_values( Whether to modify the Series rather than creating a new one. ascending (bool or list of bools, default True): If True, sort values in ascending order, otherwise descending. - kind (str, default to 'quicksort'): + kind (str, default to None): Choice of sorting algorithm. Accepts quicksort', 'mergesort', 'heapsort', 'stable'. Ignored except when determining whether to sort stably. 'mergesort' or 'stable' will result in stable reorder @@ -1555,6 +1555,7 @@ def sort_index( axis: Axis = 0, inplace: bool = False, ascending: bool | Sequence[bool] = True, + kind: str | None = None, na_position: NaPosition = "last", ): """ @@ -1602,6 +1603,10 @@ def sort_index( ascending (bool or list-like of bools, default True): Sort ascending vs. descending. When the index is a MultiIndex the sort direction can be controlled for each level individually. + kind (str, default None): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to + sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first', 'last'}, default 'last'): If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. Not implemented for MultiIndex.