diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst index 8f218abd8..b1e43a32f 100644 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ b/docs/source/user-guide/common-operations/aggregations.rst @@ -41,8 +41,8 @@ to form a single summary value. For performing an aggregation, DataFusion provid f.approx_median(col_speed).alias("Median Speed"), f.approx_percentile_cont(col_speed, 0.9).alias("90% Speed")]) -When the :code:`group_by` list is empty the aggregation is done over the whole :class:`.DataFrame`. -For grouping the :code:`group_by` list must contain at least one column. +When :code:`group_by` is :code:`None` or an empty list, the aggregation is done over the whole +:class:`.DataFrame`. For grouping the :code:`group_by` list must contain at least one column. .. ipython:: python diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 9ac8293d6..de00ff474 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -798,7 +798,7 @@ def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame: def aggregate( self, - group_by: Sequence[Expr | str] | Expr | str, + group_by: Sequence[Expr | str] | Expr | str | None, aggs: Sequence[Expr] | Expr, ) -> DataFrame: """Aggregates the rows of the current DataFrame. @@ -816,23 +816,24 @@ def aggregate( Args: group_by: Sequence of expressions or column names to group - by. A :py:class:`~datafusion.expr.GroupingSet` - expression may be included to produce multiple grouping - levels (rollup, cube, or explicit grouping sets). + by, or ``None`` for aggregation over the whole DataFrame. + A :py:class:`~datafusion.expr.GroupingSet` expression may + be included to produce multiple grouping levels (rollup, + cube, or explicit grouping sets). aggs: Sequence of expressions to aggregate. Returns: DataFrame after aggregation. Examples: - Aggregate without grouping — an empty ``group_by`` produces a - single row: + Aggregate without grouping — ``None`` or an empty ``group_by`` + produces a single row: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict( ... {"team": ["x", "x", "y"], "score": [1, 2, 5]} ... ) - >>> df.aggregate([], [F.sum(col("score")).alias("total")]).to_pydict() + >>> df.aggregate(None, [F.sum(col("score")).alias("total")]).to_pydict() {'total': [8]} Group by a column and produce one row per group: @@ -842,11 +843,15 @@ def aggregate( ... ).sort("team").to_pydict() {'team': ['x', 'y'], 'total': [3, 5]} """ - group_by_list = ( - list(group_by) - if isinstance(group_by, Sequence) and not isinstance(group_by, Expr | str) - else [group_by] - ) + if group_by is None: + group_by_list = [] + else: + group_by_list = ( + list(group_by) + if isinstance(group_by, Sequence) + and not isinstance(group_by, Expr | str) + else [group_by] + ) aggs_list = ( list(aggs) if isinstance(aggs, Sequence) and not isinstance(aggs, Expr) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index ab3992a79..bb21a3974 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -475,6 +475,12 @@ def test_aggregate_tuple_group_by(df): assert result_tuple == result_list +def test_aggregate_none_group_by_equivalent_to_empty_list(df): + result_none = df.aggregate(None, [f.count()]).to_pydict() + result_empty = df.aggregate([], [f.count()]).to_pydict() + assert result_none == result_empty + + def test_aggregate_tuple_aggs(df): result_list = df.aggregate("a", [f.count()]).sort("a").to_pydict() result_tuple = df.aggregate("a", (f.count(),)).sort("a").to_pydict()