Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: enforce that group-by aggregations actually aggregate #1844

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions narwhals/_duckdb/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,8 @@ def agg(
*self._keys,
*(x for expr in exprs for x in expr(self._compliant_frame)),
]
try:
return self._compliant_frame._from_native_frame(
self._compliant_frame._native_frame.aggregate(
agg_columns, group_expr=",".join(f'"{key}"' for key in self._keys)
)
return self._compliant_frame._from_native_frame(
self._compliant_frame._native_frame.aggregate(
agg_columns, group_expr=",".join(f'"{key}"' for key in self._keys)
)
except ValueError as exc: # pragma: no cover
msg = "Failed to aggregated - does your aggregation function return a scalar?"
raise RuntimeError(msg) from exc
)
5 changes: 0 additions & 5 deletions narwhals/_pandas_like/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,11 +344,6 @@ def func(df: Any) -> Any:
out_names = []
for expr in exprs:
results_keys = expr(from_dataframe(df))
if not all(len(x) == 1 for x in results_keys):
msg = f"Aggregation '{expr._function_name}' failed to aggregate - does your aggregation function return a scalar? \
\n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation/"

raise ValueError(msg)
for result_keys in results_keys:
out_group.append(result_keys._native_series.iloc[0])
out_names.append(result_keys.name)
Expand Down
7 changes: 1 addition & 6 deletions narwhals/_spark_like/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,5 @@ def agg_pyspark(
)

agg_columns = [col_.alias(name) for name, col_ in simple_aggregations.items()]
try:
result_simple = grouped.agg(*agg_columns)
except ValueError as exc: # pragma: no cover
msg = "Failed to aggregated - does your aggregation function return a scalar? \
\n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation/"
raise RuntimeError(msg) from exc
result_simple = grouped.agg(*agg_columns)
return from_dataframe(result_simple)
21 changes: 21 additions & 0 deletions narwhals/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from narwhals.dataframe import DataFrame
from narwhals.dataframe import LazyFrame
from narwhals.exceptions import InvalidOperationError
from narwhals.utils import tupleify

if TYPE_CHECKING:
Expand Down Expand Up @@ -109,6 +110,16 @@ def agg(
β”‚ c ┆ 3 ┆ 1 β”‚
β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜
"""
if not all(getattr(x, "_aggregates", True) for x in aggs) and all(
getattr(x, "_aggregates", True) for x in named_aggs.values()
):
msg = (
"Found expression which does not aggregate.\n\n"
"All expressions passed to GroupBy.agg must aggregate.\n"
"For example, `df.group_by('a').agg(nw.col('b').sum())` is valid,\n"
"but `df.group_by('a').agg(nw.col('b'))` is not."
)
raise InvalidOperationError(msg)
aggs, named_aggs = self._df._flatten_and_extract(*aggs, **named_aggs)
return self._df._from_compliant_dataframe( # type: ignore[return-value]
self._grouped.agg(*aggs, **named_aggs),
Expand Down Expand Up @@ -195,6 +206,16 @@ def agg(
β”‚ c ┆ 3 ┆ 1 β”‚
β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜
"""
if not all(getattr(x, "_aggregates", True) for x in aggs) and all(
getattr(x, "_aggregates", True) for x in named_aggs.values()
):
msg = (
"Found expression which does not aggregate.\n\n"
"All expressions passed to GroupBy.agg must aggregate.\n"
"For example, `df.group_by('a').agg(nw.col('b').sum())` is valid,\n"
"but `df.group_by('a').agg(nw.col('b'))` is not."
)
raise InvalidOperationError(msg)
aggs, named_aggs = self._df._flatten_and_extract(*aggs, **named_aggs)
return self._df._from_compliant_dataframe( # type: ignore[return-value]
self._grouped.agg(*aggs, **named_aggs),
Expand Down
24 changes: 5 additions & 19 deletions tests/group_by_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import narwhals.stable.v1 as nw
from narwhals.exceptions import AnonymousExprError
from narwhals.exceptions import InvalidOperationError
from tests.utils import PANDAS_VERSION
from tests.utils import PYARROW_VERSION
from tests.utils import Constructor
Expand Down Expand Up @@ -45,7 +46,7 @@ def test_invalid_group_by_dask() -> None:
with pytest.raises(ValueError, match=r"Non-trivial complex aggregation found"):
nw.from_native(df_dask).group_by("a").agg(nw.col("b").mean().min())

with pytest.raises(ValueError, match="Non-trivial complex aggregation"):
with pytest.raises(InvalidOperationError, match="does not aggregate"):
nw.from_native(df_dask).group_by("a").agg(nw.col("b"))

with pytest.raises(
Expand All @@ -58,7 +59,7 @@ def test_invalid_group_by_dask() -> None:
@pytest.mark.filterwarnings("ignore:Found complex group-by expression:UserWarning")
def test_invalid_group_by() -> None:
df = nw.from_native(df_pandas)
with pytest.raises(ValueError, match="does your"):
with pytest.raises(InvalidOperationError, match="does not aggregate"):
df.group_by("a").agg(nw.col("b"))
with pytest.raises(
AnonymousExprError,
Expand Down Expand Up @@ -366,25 +367,10 @@ def test_group_by_categorical(
assert_equal_data(result, data)


@pytest.mark.filterwarnings("ignore:Found complex group-by expression:UserWarning")
def test_group_by_shift_raises(
constructor: Constructor, request: pytest.FixtureRequest
) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
if "polars" in str(constructor):
# Polars supports all kinds of crazy group-by aggregations, so
# we don't check that it errors here.
request.applymarker(pytest.mark.xfail)
if "cudf" in str(constructor):
# This operation fails completely in cuDF anyway, we just let raise its own
# error.
request.applymarker(pytest.mark.xfail)
def test_group_by_shift_raises(constructor: Constructor) -> None:
df_native = {"a": [1, 2, 3], "b": [1, 1, 2]}
df = nw.from_native(constructor(df_native))
with pytest.raises(
ValueError, match=".*(failed to aggregate|Non-trivial complex aggregation found)"
):
with pytest.raises(InvalidOperationError, match="does not aggregate"):
df.group_by("b").agg(nw.col("a").shift(1))


Expand Down
Loading