Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: validate duplicate column names in pyarrow and duckdb #1815

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/extremes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-minimum-versions
run: uv pip install pipdeptree tox virtualenv setuptools pandas==0.25.3 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" scipy==1.5.0 scikit-learn==1.1.0 tzdata --system
run: uv pip install pipdeptree tox virtualenv setuptools pandas==0.25.3 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata --system
- name: install-reqs
run: |
uv pip install -e ".[dev]" --system
Expand All @@ -40,6 +40,7 @@ jobs:
echo "$DEPS" | grep 'pyarrow==11.0.0'
echo "$DEPS" | grep 'scipy==1.5.0'
echo "$DEPS" | grep 'scikit-learn==1.1.0'
echo "$DEPS" | grep 'duckdb==1.0'
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy]

Expand All @@ -61,7 +62,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-pretty-old-versions
run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system
run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata --system
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: show-deps
Expand All @@ -78,6 +79,7 @@ jobs:
echo "$DEPS" | grep 'pyspark==3.5.0'
echo "$DEPS" | grep 'scipy==1.5.0'
echo "$DEPS" | grep 'scikit-learn==1.1.0'
echo "$DEPS" | grep 'duckdb==1.0'
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy]

Expand All @@ -99,7 +101,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-not-so-old-versions
run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.10 tzdata --system
run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.8.0 scikit-learn==1.3.0 duckdb==1.0 dask[dataframe]==2024.10 tzdata --system
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: show-deps
Expand All @@ -115,6 +117,7 @@ jobs:
echo "$DEPS" | grep 'scipy==1.8.0'
echo "$DEPS" | grep 'scikit-learn==1.3.0'
echo "$DEPS" | grep 'dask==2024.10'
echo "$DEPS" | grep 'duckdb==1.0'
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy],dask

Expand Down
1 change: 1 addition & 0 deletions docs/how_it_works.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ df = PandasLikeDataFrame(
implementation=Implementation.PANDAS,
backend_version=parse_version(pd.__version__),
version=Version.MAIN,
validate_column_names=True,
)
expression = pn.col("a") + 1
result = expression._call(df)
Expand Down
73 changes: 55 additions & 18 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from narwhals.utils import Implementation
from narwhals.utils import Version
from narwhals.utils import check_column_exists
from narwhals.utils import check_column_names_are_unique
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
Expand Down Expand Up @@ -58,7 +59,10 @@ def __init__(
*,
backend_version: tuple[int, ...],
version: Version,
validate_column_names: bool,
) -> None:
if validate_column_names:
check_column_names_are_unique(native_dataframe.column_names)
self._native_frame = native_dataframe
self._implementation = Implementation.PYARROW
self._backend_version = backend_version
Expand Down Expand Up @@ -87,12 +91,20 @@ def __narwhals_lazyframe__(self: Self) -> Self:

def _change_version(self: Self, version: Version) -> Self:
return self.__class__(
self._native_frame, backend_version=self._backend_version, version=version
self._native_frame,
backend_version=self._backend_version,
version=version,
validate_column_names=False,
)

def _from_native_frame(self: Self, df: pa.Table) -> Self:
def _from_native_frame(
self: Self, df: pa.Table, *, validate_column_names: bool = True
) -> Self:
return self.__class__(
df, backend_version=self._backend_version, version=self._version
df,
backend_version=self._backend_version,
version=self._version,
validate_column_names=validate_column_names,
)

@property
Expand Down Expand Up @@ -293,16 +305,20 @@ def columns(self: Self) -> list[str]:
return self._native_frame.schema.names # type: ignore[no-any-return]

def simple_select(self, *column_names: str) -> Self:
return self._from_native_frame(self._native_frame.select(list(column_names)))
return self._from_native_frame(
self._native_frame.select(list(column_names)), validate_column_names=False
)

def select(self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr) -> Self:
new_series: list[ArrowSeries] = evaluate_into_exprs(self, *exprs, **named_exprs)
if not new_series:
# return empty dataframe, like Polars does
return self._from_native_frame(self._native_frame.__class__.from_arrays([]))
return self._from_native_frame(
self._native_frame.__class__.from_arrays([]), validate_column_names=False
)
names = [s.name for s in new_series]
df = pa.Table.from_arrays(broadcast_series(new_series), names=names)
return self._from_native_frame(df)
return self._from_native_frame(df, validate_column_names=False)

def with_columns(
self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr
Expand All @@ -328,7 +344,7 @@ def with_columns(
else native_frame.append_column(field_=col_name, column=column)
)

return self._from_native_frame(native_frame)
return self._from_native_frame(native_frame, validate_column_names=False)

def group_by(self: Self, *keys: str, drop_null_keys: bool) -> ArrowGroupBy:
from narwhals._arrow.group_by import ArrowGroupBy
Expand Down Expand Up @@ -397,11 +413,15 @@ def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001
to_drop = parse_columns_to_drop(
compliant_frame=self, columns=columns, strict=strict
)
return self._from_native_frame(self._native_frame.drop(to_drop))
return self._from_native_frame(
self._native_frame.drop(to_drop), validate_column_names=False
)

def drop_nulls(self: Self, subset: list[str] | None) -> Self:
if subset is None:
return self._from_native_frame(self._native_frame.drop_null())
return self._from_native_frame(
self._native_frame.drop_null(), validate_column_names=False
)
plx = self.__narwhals_namespace__()
return self.filter(~plx.any_horizontal(plx.col(*subset).is_null()))

Expand All @@ -424,7 +444,10 @@ def sort(

null_placement = "at_end" if nulls_last else "at_start"

return self._from_native_frame(df.sort_by(sorting, null_placement=null_placement))
return self._from_native_frame(
df.sort_by(sorting, null_placement=null_placement),
validate_column_names=False,
)

def to_pandas(self: Self) -> pd.DataFrame:
return self._native_frame.to_pandas()
Expand Down Expand Up @@ -495,7 +518,9 @@ def filter(self: Self, *predicates: IntoArrowExpr, **constraints: Any) -> Self:
mask_native = broadcast_and_extract_dataframe_comparand(
length=len(self), other=mask, backend_version=self._backend_version
)
return self._from_native_frame(self._native_frame.filter(mask_native))
return self._from_native_frame(
self._native_frame.filter(mask_native), validate_column_names=False
)

def null_count(self: Self) -> Self:
df = self._native_frame
Expand All @@ -508,18 +533,22 @@ def null_count(self: Self) -> Self:
def head(self: Self, n: int) -> Self:
df = self._native_frame
if n >= 0:
return self._from_native_frame(df.slice(0, n))
return self._from_native_frame(df.slice(0, n), validate_column_names=False)
else:
num_rows = df.num_rows
return self._from_native_frame(df.slice(0, max(0, num_rows + n)))
return self._from_native_frame(
df.slice(0, max(0, num_rows + n)), validate_column_names=False
)

def tail(self: Self, n: int) -> Self:
df = self._native_frame
if n >= 0:
num_rows = df.num_rows
return self._from_native_frame(df.slice(max(0, num_rows - n)))
return self._from_native_frame(
df.slice(max(0, num_rows - n)), validate_column_names=False
)
else:
return self._from_native_frame(df.slice(abs(n)))
return self._from_native_frame(df.slice(abs(n)), validate_column_names=False)

def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyFrame:
from narwhals.utils import parse_version
Expand All @@ -536,6 +565,7 @@ def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyF
df=duckdb.table("df"),
backend_version=parse_version(duckdb.__version__),
version=self._version,
validate_column_names=False,
)
elif backend is Implementation.POLARS:
import polars as pl # ignore-banned-import
Expand All @@ -557,6 +587,7 @@ def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyF
native_dataframe=dd.from_pandas(self._native_frame.to_pandas()),
backend_version=parse_version(dask.__version__),
version=self._version,
validate_column_names=False,
)
raise AssertionError # pragma: no cover

Expand All @@ -572,6 +603,7 @@ def collect(
native_dataframe=self._native_frame,
backend_version=self._backend_version,
version=self._version,
validate_column_names=False,
)

if backend is Implementation.PANDAS:
Expand All @@ -584,6 +616,7 @@ def collect(
implementation=Implementation.PANDAS,
backend_version=parse_version(pd.__version__),
version=self._version,
validate_column_names=False,
)

if backend is Implementation.POLARS:
Expand Down Expand Up @@ -730,13 +763,17 @@ def unique(
.column(f"{col_token}_{agg_func}")
)

return self._from_native_frame(pc.take(df, keep_idx))
return self._from_native_frame(
pc.take(df, keep_idx), validate_column_names=False
)

keep_idx = self.simple_select(*subset).is_unique()
return self.filter(keep_idx)

def gather_every(self: Self, n: int, offset: int) -> Self:
return self._from_native_frame(self._native_frame[offset::n])
return self._from_native_frame(
self._native_frame[offset::n], validate_column_names=False
)

def to_arrow(self: Self) -> pa.Table:
return self._native_frame
Expand All @@ -760,7 +797,7 @@ def sample(
idx = np.arange(0, num_rows)
mask = rng.choice(idx, size=n, replace=with_replacement)

return self._from_native_frame(pc.take(frame, mask))
return self._from_native_frame(pc.take(frame, mask), validate_column_names=False)

def unpivot(
self: Self,
Expand Down
5 changes: 4 additions & 1 deletion narwhals/_arrow/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,10 @@ def concat(
raise NotImplementedError

return ArrowDataFrame(
result_table, backend_version=self._backend_version, version=self._version
result_table,
backend_version=self._backend_version,
version=self._version,
validate_column_names=True,
)

@property
Expand Down
11 changes: 9 additions & 2 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,10 @@ def value_counts(
val_count = val_count.sort_by([(value_name_, "descending")])

return ArrowDataFrame(
val_count, backend_version=self._backend_version, version=self._version
val_count,
backend_version=self._backend_version,
version=self._version,
validate_column_names=True,
)

def zip_with(self: Self, mask: Self, other: Self) -> Self:
Expand Down Expand Up @@ -621,7 +624,10 @@ def to_frame(self: Self) -> ArrowDataFrame:

df = pa.Table.from_arrays([self._native_series], names=[self.name])
return ArrowDataFrame(
df, backend_version=self._backend_version, version=self._version
df,
backend_version=self._backend_version,
version=self._version,
validate_column_names=False,
)

def to_pandas(self: Self) -> pd.Series:
Expand Down Expand Up @@ -745,6 +751,7 @@ def to_dummies(self: Self, *, separator: str, drop_first: bool) -> ArrowDataFram
pa.Table.from_arrays(columns, names=cols),
backend_version=self._backend_version,
version=self._version,
validate_column_names=True,
).simple_select(*output_order)

def quantile(
Expand Down
Loading
Loading