From 9d9b73560501e7d2e8c8e5fe0a7f53308ea70fd5 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Fri, 17 Jan 2025 22:47:33 +0100 Subject: [PATCH 01/12] feat: datetime selector --- docs/api-reference/selectors.md | 1 + docs/api-reference/typing.md | 2 + narwhals/_arrow/expr_dt.py | 4 +- narwhals/_arrow/selectors.py | 19 +- narwhals/_arrow/series_dt.py | 4 +- narwhals/_dask/expr_dt.py | 6 +- narwhals/_dask/selectors.py | 20 +- narwhals/_pandas_like/expr_dt.py | 4 +- narwhals/_pandas_like/selectors.py | 33 ++- narwhals/_pandas_like/series_dt.py | 4 +- narwhals/_pandas_like/utils.py | 6 +- narwhals/_polars/namespace.py | 19 ++ narwhals/_polars/utils.py | 14 +- narwhals/dtypes.py | 7 +- narwhals/expr_dt.py | 4 +- narwhals/selectors.py | 341 +++++++++++++++++++++++------ narwhals/series_dt.py | 4 +- narwhals/typing.py | 2 + narwhals/utils.py | 39 ++++ 19 files changed, 435 insertions(+), 98 deletions(-) diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md index 6480a869e..e75c6090d 100644 --- a/docs/api-reference/selectors.md +++ b/docs/api-reference/selectors.md @@ -15,6 +15,7 @@ set operations are supported: - boolean - by_dtype - categorical + - datetime - numeric - string show_root_heading: false diff --git a/docs/api-reference/typing.md b/docs/api-reference/typing.md index 9791b50a4..4097b6378 100644 --- a/docs/api-reference/typing.md +++ b/docs/api-reference/typing.md @@ -17,6 +17,8 @@ Narwhals comes fully statically typed. In addition to `nw.DataFrame`, `nw.Expr`, - IntoFrameT - IntoSeries - IntoSeriesT + - SizeUnit + - TimeUnit show_source: false show_bases: false diff --git a/narwhals/_arrow/expr_dt.py b/narwhals/_arrow/expr_dt.py index 1438eba81..30d2e22c8 100644 --- a/narwhals/_arrow/expr_dt.py +++ b/narwhals/_arrow/expr_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._expression_parsing import reuse_series_namespace_implementation @@ -9,6 +8,7 @@ from typing_extensions import Self from narwhals._arrow.expr import ArrowExpr + from narwhals.typing import TimeUnit class ArrowExprDateTimeNamespace: @@ -30,7 +30,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr: self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowExpr: + def timestamp(self: Self, time_unit: TimeUnit) -> ArrowExpr: return reuse_series_namespace_implementation( self._compliant_expr, "dt", "timestamp", time_unit=time_unit ) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 48e837ec7..7d85ee07d 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -10,11 +10,16 @@ from narwhals.utils import import_dtypes_module if TYPE_CHECKING: + from collections.abc import Collection + from collections.abc import Container + from datetime import timezone + from typing_extensions import Self from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.series import ArrowSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -26,7 +31,7 @@ def __init__( self._implementation = Implementation.PYARROW self._version = version - def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> ArrowSelector: + def by_dtype(self: Self, dtypes: Container[DType | type[DType]]) -> ArrowSelector: def func(df: ArrowDataFrame) -> list[ArrowSeries]: return [df[col] for col in df.columns if df.schema[col] in dtypes] @@ -85,6 +90,18 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + ) -> ArrowSelector: + from narwhals.utils import _parse_datetime_selector_to_datetimes + + datetime_dtypes = _parse_datetime_selector_to_datetimes( + time_unit=time_unit, time_zone=time_zone, version=self._version + ) + return self.by_dtype(datetime_dtypes) + class ArrowSelector(ArrowExpr): def __repr__(self: Self) -> str: # pragma: no cover diff --git a/narwhals/_arrow/series_dt.py b/narwhals/_arrow/series_dt.py index 697cd473b..0e0cfab71 100644 --- a/narwhals/_arrow/series_dt.py +++ b/narwhals/_arrow/series_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._arrow.utils import floordiv_compat from narwhals.utils import import_dtypes_module @@ -10,6 +9,7 @@ from typing_extensions import Self from narwhals._arrow.series import ArrowSeries + from narwhals.typing import TimeUnit class ArrowSeriesDateTimeNamespace: @@ -52,7 +52,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries: return self._compliant_series._from_native_series(result) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowSeries: + def timestamp(self: Self, time_unit: TimeUnit) -> ArrowSeries: import pyarrow as pa import pyarrow.compute as pc diff --git a/narwhals/_dask/expr_dt.py b/narwhals/_dask/expr_dt.py index 177f2c236..1a67fe4dd 100644 --- a/narwhals/_dask/expr_dt.py +++ b/narwhals/_dask/expr_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime @@ -16,6 +15,7 @@ import dask_expr as dx from narwhals._dask.expr import DaskExpr + from narwhals.typing import TimeUnit class DaskExprDateTimeNamespace: @@ -143,8 +143,8 @@ def func(s: dx.Series, time_zone: str) -> dx.Series: returns_scalar=self._compliant_expr._returns_scalar, ) - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: - def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"] = "us") -> dx.Series: + def timestamp(self, time_unit: TimeUnit) -> DaskExpr: + def func(s: dx.Series, time_unit: TimeUnit) -> dx.Series: dtype = native_to_narwhals_dtype( s, self._compliant_expr._version, Implementation.DASK ) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 703e24860..960c919fa 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -12,10 +12,16 @@ import dask.dataframe.dask_expr as dx except ModuleNotFoundError: import dask_expr as dx + + from collections.abc import Collection + from collections.abc import Container + from datetime import timezone + from typing_extensions import Self from narwhals._dask.dataframe import DaskLazyFrame from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -26,7 +32,7 @@ def __init__( self._backend_version = backend_version self._version = version - def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> DaskSelector: + def by_dtype(self: Self, dtypes: Container[DType | type[DType]]) -> DaskSelector: def func(df: DaskLazyFrame) -> list[Any]: return [ df._native_frame[col] for col in df.columns if df.schema[col] in dtypes @@ -89,6 +95,18 @@ def func(df: DaskLazyFrame) -> list[Any]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + ) -> DaskSelector: + from narwhals.utils import _parse_datetime_selector_to_datetimes + + datetime_dtypes = _parse_datetime_selector_to_datetimes( + time_unit=time_unit, time_zone=time_zone, version=self._version + ) + return self.by_dtype(datetime_dtypes) + class DaskSelector(DaskExpr): def __repr__(self: Self) -> str: # pragma: no cover diff --git a/narwhals/_pandas_like/expr_dt.py b/narwhals/_pandas_like/expr_dt.py index 13e94080e..6525d78c3 100644 --- a/narwhals/_pandas_like/expr_dt.py +++ b/narwhals/_pandas_like/expr_dt.py @@ -1,12 +1,12 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._expression_parsing import reuse_series_namespace_implementation if TYPE_CHECKING: from narwhals._pandas_like.expr import PandasLikeExpr + from narwhals.typing import TimeUnit class PandasLikeExprDateTimeNamespace: @@ -99,7 +99,7 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeExpr: self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone ) - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeExpr: + def timestamp(self, time_unit: TimeUnit) -> PandasLikeExpr: return reuse_series_namespace_implementation( self._compliant_expr, "dt", "timestamp", time_unit=time_unit ) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index e7d7fe18d..24b9b0058 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -8,16 +8,23 @@ from narwhals.utils import import_dtypes_module if TYPE_CHECKING: + from collections.abc import Collection + from collections.abc import Container + from datetime import timezone + + from typing_extensions import Self + from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.series import PandasLikeSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Implementation from narwhals.utils import Version class PandasSelectorNamespace: def __init__( - self, + self: Self, *, implementation: Implementation, backend_version: tuple[int, ...], @@ -27,7 +34,7 @@ def __init__( self._backend_version = backend_version self._version = version - def by_dtype(self, dtypes: list[DType | type[DType]]) -> PandasSelector: + def by_dtype(self: Self, dtypes: Container[DType | type[DType]]) -> PandasSelector: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [df[col] for col in df.columns if df.schema[col] in dtypes] @@ -43,7 +50,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: kwargs={"dtypes": dtypes}, ) - def numeric(self) -> PandasSelector: + def numeric(self: Self) -> PandasSelector: dtypes = import_dtypes_module(self._version) return self.by_dtype( [ @@ -60,19 +67,19 @@ def numeric(self) -> PandasSelector: ], ) - def categorical(self) -> PandasSelector: + def categorical(self: Self) -> PandasSelector: dtypes = import_dtypes_module(self._version) return self.by_dtype([dtypes.Categorical]) - def string(self) -> PandasSelector: + def string(self: Self) -> PandasSelector: dtypes = import_dtypes_module(self._version) return self.by_dtype([dtypes.String]) - def boolean(self) -> PandasSelector: + def boolean(self: Self) -> PandasSelector: dtypes = import_dtypes_module(self._version) return self.by_dtype([dtypes.Boolean]) - def all(self) -> PandasSelector: + def all(self: Self) -> PandasSelector: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [df[col] for col in df.columns] @@ -88,6 +95,18 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + ) -> PandasSelector: + from narwhals.utils import _parse_datetime_selector_to_datetimes + + datetime_dtypes = _parse_datetime_selector_to_datetimes( + time_unit=time_unit, time_zone=time_zone, version=self._version + ) + return self.by_dtype(datetime_dtypes) + class PandasSelector(PandasLikeExpr): def __repr__(self) -> str: # pragma: no cover diff --git a/narwhals/_pandas_like/series_dt.py b/narwhals/_pandas_like/series_dt.py index 2bf768203..f05136ea5 100644 --- a/narwhals/_pandas_like/series_dt.py +++ b/narwhals/_pandas_like/series_dt.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING from typing import Any -from typing import Literal from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime @@ -12,6 +11,7 @@ if TYPE_CHECKING: from narwhals._pandas_like.series import PandasLikeSeries + from narwhals.typing import TimeUnit class PandasLikeSeriesDateTimeNamespace: @@ -206,7 +206,7 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeSeries: result = self._compliant_series._native_series.dt.tz_convert(time_zone) return self._compliant_series._from_native_series(result) - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSeries: + def timestamp(self, time_unit: TimeUnit) -> PandasLikeSeries: s = self._compliant_series._native_series dtype = self._compliant_series.dtype is_pyarrow_dtype = "pyarrow" in str(self._compliant_series._native_series.dtype) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 4eec710a4..b4659dc97 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable -from typing import Literal from typing import Sequence from typing import TypeVar @@ -29,6 +28,7 @@ from narwhals._pandas_like.expr import PandasLikeExpr from narwhals._pandas_like.series import PandasLikeSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version ExprT = TypeVar("ExprT", bound=PandasLikeExpr) @@ -455,13 +455,13 @@ def non_object_native_to_narwhals_dtype( if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( match_ := PATTERN_PA_DATETIME.match(dtype) ): - dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] + dt_time_unit: TimeUnit = match_.group("time_unit") # type: ignore[assignment] dt_time_zone: str | None = match_.group("time_zone") return dtypes.Datetime(dt_time_unit, dt_time_zone) if (match_ := PATTERN_PD_DURATION.match(dtype)) or ( match_ := PATTERN_PA_DURATION.match(dtype) ): - du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] + du_time_unit: TimeUnit = match_.group("time_unit") # type: ignore[assignment] return dtypes.Duration(du_time_unit) if dtype == "date32[day][pyarrow]": return dtypes.Date() diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 00e005c33..dfca44148 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -14,6 +14,9 @@ from narwhals.utils import Implementation if TYPE_CHECKING: + from collections.abc import Collection + from datetime import timezone + from typing_extensions import Self from narwhals._polars.dataframe import PolarsDataFrame @@ -21,6 +24,7 @@ from narwhals._polars.expr import PolarsExpr from narwhals._polars.typing import IntoPolarsExpr from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -285,3 +289,18 @@ def all(self: Self) -> PolarsExpr: version=self._version, backend_version=self._backend_version, ) + + def datetime( + self: Self, + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + ) -> PolarsExpr: + import polars as pl + + from narwhals._polars.expr import PolarsExpr + + return PolarsExpr( + pl.selectors.datetime(time_unit=time_unit, time_zone=time_zone), # type: ignore[arg-type] + version=self._version, + backend_version=self._backend_version, + ) diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index e85132f8e..efa0d08f0 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -3,7 +3,6 @@ from functools import lru_cache from typing import TYPE_CHECKING from typing import Any -from typing import Literal from typing import TypeVar from typing import overload @@ -17,6 +16,7 @@ from narwhals._polars.expr import PolarsExpr from narwhals._polars.series import PolarsSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version T = TypeVar("T") @@ -113,11 +113,11 @@ def native_to_narwhals_dtype( if dtype == pl.Date: return dtypes.Date() if dtype == pl.Datetime: - dt_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + dt_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") dt_time_zone = getattr(dtype, "time_zone", None) return dtypes.Datetime(time_unit=dt_time_unit, time_zone=dt_time_zone) if dtype == pl.Duration: - du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + du_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") return dtypes.Duration(time_unit=du_time_unit) if dtype == pl.Struct: return dtypes.Struct( @@ -188,12 +188,12 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> pl if dtype == dtypes.Date: return pl.Date() if dtype == dtypes.Datetime or isinstance(dtype, dtypes.Datetime): - dt_time_unit: Literal["ms", "us", "ns"] = getattr(dtype, "time_unit", "us") + dt_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") dt_time_zone = getattr(dtype, "time_zone", None) - return pl.Datetime(dt_time_unit, dt_time_zone) + return pl.Datetime(dt_time_unit, dt_time_zone) # type: ignore[arg-type] if dtype == dtypes.Duration or isinstance(dtype, dtypes.Duration): - du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") - return pl.Duration(time_unit=du_time_unit) + du_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") + return pl.Duration(time_unit=du_time_unit) # type: ignore[arg-type] if dtype == dtypes.List: return pl.List(narwhals_to_native_dtype(dtype.inner, version)) # type: ignore[union-attr] if dtype == dtypes.Struct: diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 57ee762eb..68ea212ac 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -9,11 +9,12 @@ if TYPE_CHECKING: from typing import Iterator - from typing import Literal from typing import Sequence from typing_extensions import Self + from narwhals.typing import TimeUnit + def _validate_dtype(dtype: DType | type[DType]) -> None: if not isinstance_or_issubclass(dtype, DType): @@ -437,7 +438,7 @@ class Datetime(TemporalType): def __init__( self: Self, - time_unit: Literal["us", "ns", "ms", "s"] = "us", + time_unit: TimeUnit = "us", time_zone: str | timezone | None = None, ) -> None: if time_unit not in {"s", "ms", "us", "ns"}: @@ -500,7 +501,7 @@ class Duration(TemporalType): def __init__( self: Self, - time_unit: Literal["us", "ns", "ms", "s"] = "us", + time_unit: TimeUnit = "us", ) -> None: if time_unit not in ("s", "ms", "us", "ns"): msg = ( diff --git a/narwhals/expr_dt.py b/narwhals/expr_dt.py index d0676dd9b..d16a0c53f 100644 --- a/narwhals/expr_dt.py +++ b/narwhals/expr_dt.py @@ -2,13 +2,13 @@ from typing import TYPE_CHECKING from typing import Generic -from typing import Literal from typing import TypeVar if TYPE_CHECKING: from typing_extensions import Self from narwhals.expr import Expr + from narwhals.typing import TimeUnit ExprT = TypeVar("ExprT", bound="Expr") @@ -1341,7 +1341,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: lambda plx: self._expr._to_compliant_expr(plx).dt.convert_time_zone(time_zone) ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: + def timestamp(self: Self, time_unit: TimeUnit = "us") -> ExprT: """Return a timestamp in the given time unit. Arguments: diff --git a/narwhals/selectors.py b/narwhals/selectors.py index 31a5f80e8..bd0ba40fc 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -1,10 +1,17 @@ from __future__ import annotations +from typing import TYPE_CHECKING from typing import Any from narwhals.expr import Expr from narwhals.utils import flatten +if TYPE_CHECKING: + from collections.abc import Collection + from datetime import timezone + + from narwhals.typing import TimeUnit + class Selector(Expr): ... @@ -19,29 +26,34 @@ def by_dtype(*dtypes: Any) -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [4.1, 2.3]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function to select int64 and float64 dtypes and multiplies each value by 2: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.by_dtype(nw.Int64, nw.Float64) * 2) + >>> def agnostic_select_by_dtype(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... return df_nw.select(ncs.by_dtype(nw.Int64, nw.Float64) * 2).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_by_dtype`: - >>> func(df_pd) + >>> agnostic_select_by_dtype(df_pd) a c 0 2 8.2 1 4 4.6 - >>> func(df_pl) + + >>> agnostic_select_by_dtype(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ c │ @@ -51,6 +63,14 @@ def by_dtype(*dtypes: Any) -> Expr: │ 2 ┆ 8.2 │ │ 4 ┆ 4.6 │ └─────┴─────┘ + + >>> agnostic_select_by_dtype(df_pa) + pyarrow.Table + a: int64 + c: double + ---- + a: [[2,4]] + c: [[8.2,4.6]] """ return Selector(lambda plx: plx.selectors.by_dtype(flatten(dtypes))) @@ -62,29 +82,34 @@ def numeric() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [4.1, 2.3]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function to select numeric dtypes and multiplies each value by 2: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.numeric() * 2) + >>> def agnostic_select_numeric(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... return df_nw.select(ncs.numeric() * 2).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_numeric`: - >>> func(df_pd) + >>> agnostic_select_numeric(df_pd) a c 0 2 8.2 1 4 4.6 - >>> func(df_pl) + + >>> agnostic_select_numeric(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ c │ @@ -94,6 +119,14 @@ def numeric() -> Expr: │ 2 ┆ 8.2 │ │ 4 ┆ 4.6 │ └─────┴─────┘ + + >>> agnostic_select_numeric(df_pa) + pyarrow.Table + a: int64 + c: double + ---- + a: [[2,4]] + c: [[8.2,4.6]] """ return Selector(lambda plx: plx.selectors.numeric()) @@ -105,29 +138,33 @@ def boolean() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [False, True]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function to select boolean - dtypes: + Let's define a dataframe-agnostic function to select boolean dtypes: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.boolean()) + >>> def agnostic_select_boolean(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... return df_nw.select(ncs.boolean()).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_boolean`: - >>> func(df_pd) + >>> agnostic_select_boolean(df_pd) c 0 False 1 True - >>> func(df_pl) + + >>> agnostic_select_boolean(df_pl) shape: (2, 1) ┌───────┐ │ c │ @@ -137,6 +174,12 @@ def boolean() -> Expr: │ false │ │ true │ └───────┘ + + >>> agnostic_select_boolean(df_pa) + pyarrow.Table + c: bool + ---- + c: [[false,true]] """ return Selector(lambda plx: plx.selectors.boolean()) @@ -148,29 +191,33 @@ def string() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [False, True]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function to select string - dtypes: + Let's define a dataframe-agnostic function to select string dtypes: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.string()) + >>> def agnostic_select_string(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... return df_nw.select(ncs.string()).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_string`: - >>> func(df_pd) + >>> agnostic_select_string(df_pd) b 0 x 1 y - >>> func(df_pl) + + >>> agnostic_select_string(df_pl) shape: (2, 1) ┌─────┐ │ b │ @@ -180,6 +227,12 @@ def string() -> Expr: │ x │ │ y │ └─────┘ + + >>> agnostic_select_string(df_pa) + pyarrow.Table + b: string + ---- + b: [["x","y"]] """ return Selector(lambda plx: plx.selectors.string()) @@ -191,29 +244,36 @@ def categorical() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [False, True]} - >>> df_pd = pd.DataFrame(data).astype({"b": "category"}) - >>> df_pl = pl.DataFrame(data, schema_overrides={"b": pl.Categorical}) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function to select string - dtypes: + Let's define a dataframe-agnostic function that first converts column "b" to + categorical, and then selects categorical dtypes: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.categorical()) + >>> def agnostic_select_categorical(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native).with_columns( + ... b=nw.col("b").cast(nw.Categorical()) + ... ) + ... return df_nw.select(ncs.categorical()).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_categorical`: - >>> func(df_pd) + >>> agnostic_select_categorical(df_pd) b 0 x 1 y - >>> func(df_pl) + + >>> agnostic_select_categorical(df_pl) shape: (2, 1) ┌─────┐ │ b │ @@ -223,6 +283,14 @@ def categorical() -> Expr: │ x │ │ y │ └─────┘ + + >>> agnostic_select_categorical(df_pa) + pyarrow.Table + b: dictionary + ---- + b: [ -- dictionary: + ["x","y"] -- indices: + [0,1]] """ return Selector(lambda plx: plx.selectors.categorical()) @@ -234,42 +302,193 @@ def all() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [False, True]} - >>> df_pd = pd.DataFrame(data).astype({"b": "category"}) - >>> df_pl = pl.DataFrame(data, schema_overrides={"b": pl.Categorical}) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function to select string - dtypes: + Let's define a dataframe-agnostic function to select all dtypes: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.all()) + >>> def agnostic_select_all(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... return df_nw.select(ncs.all()).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_all`: - >>> func(df_pd) + >>> agnostic_select_all(df_pd) a b c 0 1 x False 1 2 y True - >>> func(df_pl) + + >>> agnostic_select_all(df_pl) shape: (2, 3) ┌─────┬─────┬───────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ - │ i64 ┆ cat ┆ bool │ + │ i64 ┆ str ┆ bool │ ╞═════╪═════╪═══════╡ │ 1 ┆ x ┆ false │ │ 2 ┆ y ┆ true │ └─────┴─────┴───────┘ + + >>> agnostic_select_all(df_pa) + pyarrow.Table + a: int64 + b: string + c: bool + ---- + a: [[1,2]] + b: [["x","y"]] + c: [[false,true]] """ return Selector(lambda plx: plx.selectors.all()) +def datetime( + time_unit: TimeUnit | Collection[TimeUnit] | None = None, + time_zone: str | timezone | Collection[str | timezone | None] | None = ("*", None), +) -> Expr: + """Select all datetime columns, optionally filtering by time unit/zone. + + Arguments: + time_unit: One (or more) of the allowed timeunit precision strings, "ms", "us", + "ns" and "s". Omit to select columns with any valid timeunit. + time_zone: Specify which timezone(s) to select: + + * One or more timezone strings, as defined in zoneinfo (to see valid options + run `import zoneinfo; zoneinfo.available_timezones()` for a full list). + * Set `None` to select Datetime columns that do not have a timezone. + * Set `"*"` to select Datetime columns that have *any* timezone. + + Returns: + A new expression. + + Examples: + >>> from __future__ import annotations + >>> from datetime import datetime, timezone + >>> from zoneinfo import ZoneInfo + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT + >>> + >>> berlin_tz = ZoneInfo("Europe/Berlin") + >>> utc_tz = timezone.utc + >>> data = { + ... "tstamp_berlin": [ + ... datetime(1999, 7, 21, 5, 20, 16, 987654, tzinfo=berlin_tz), + ... datetime(2000, 5, 16, 6, 21, 21, 123465, tzinfo=berlin_tz), + ... ], + ... "tstamp_utc": [ + ... datetime(2023, 4, 10, 12, 14, 16, 999000, tzinfo=utc_tz), + ... datetime(2025, 8, 25, 14, 18, 22, 666000, tzinfo=utc_tz), + ... ], + ... "tstamp": [ + ... datetime(2000, 11, 20, 18, 12, 16, 600000), + ... datetime(2020, 10, 30, 10, 20, 25, 123000), + ... ], + ... "numeric": [3.14, 6.28], + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function to select datetime dtypes: + + >>> def agnostic_datetime_selector(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = ( + ... nw.from_native(df_native) + ... .with_columns( + ... tstamp_berlin=nw.col("tstamp_berlin").cast( + ... nw.Datetime(time_zone="Europe/Berlin") + ... ) + ... ) + ... .select(ncs.datetime()) + ... ) + ... return df_nw.to_native() + + Select all datetime columns: + + >>> pd.set_option("display.width", 0) + >>> agnostic_datetime_selector(df_pd) + tstamp_berlin tstamp_utc tstamp + 0 1999-07-21 05:20:16.987654+02:00 2023-04-10 12:14:16.999000+00:00 2000-11-20 18:12:16.600 + 1 2000-05-16 06:21:21.123465+02:00 2025-08-25 14:18:22.666000+00:00 2020-10-30 10:20:25.123 + + >>> agnostic_datetime_selector(df_pl) + shape: (2, 3) + ┌─────────────────────────────────┬─────────────────────────────┬─────────────────────────┐ + │ tstamp_berlin ┆ tstamp_utc ┆ tstamp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs, Europe/Berlin] ┆ datetime[μs, UTC] ┆ datetime[μs] │ + ╞═════════════════════════════════╪═════════════════════════════╪═════════════════════════╡ + │ 1999-07-21 05:20:16.987654 CES… ┆ 2023-04-10 12:14:16.999 UTC ┆ 2000-11-20 18:12:16.600 │ + │ 2000-05-16 06:21:21.123465 CES… ┆ 2025-08-25 14:18:22.666 UTC ┆ 2020-10-30 10:20:25.123 │ + └─────────────────────────────────┴─────────────────────────────┴─────────────────────────┘ + + >>> agnostic_datetime_selector(df_pa) + pyarrow.Table + tstamp_berlin: timestamp[us, tz=Europe/Berlin] + tstamp_utc: timestamp[us, tz=UTC] + tstamp: timestamp[us] + ---- + tstamp_berlin: [[1999-07-21 05:20:16.987654Z,2000-05-16 06:21:21.123465Z]] + tstamp_utc: [[2023-04-10 12:14:16.999000Z,2025-08-25 14:18:22.666000Z]] + tstamp: [[2000-11-20 18:12:16.600000,2020-10-30 10:20:25.123000]] + + Select all datetime columns that have any time_zone specification: + + >>> def agnostic_datetime_selector_any_tz(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = ( + ... nw.from_native(df_native) + ... .with_columns( + ... tstamp_berlin=nw.col("tstamp_berlin").cast( + ... nw.Datetime(time_zone="Europe/Berlin") + ... ) + ... ) + ... .select(ncs.datetime(time_zone="*")) + ... ) + ... return df_nw.to_native() + + >>> agnostic_datetime_selector_any_tz(df_pd) + tstamp_berlin tstamp_utc + 0 1999-07-21 05:20:16.987654+02:00 2023-04-10 12:14:16.999000+00:00 + 1 2000-05-16 06:21:21.123465+02:00 2025-08-25 14:18:22.666000+00:00 + + >>> agnostic_datetime_selector_any_tz(df_pl) + shape: (2, 2) + ┌─────────────────────────────────┬─────────────────────────────┐ + │ tstamp_berlin ┆ tstamp_utc │ + │ --- ┆ --- │ + │ datetime[μs, Europe/Berlin] ┆ datetime[μs, UTC] │ + ╞═════════════════════════════════╪═════════════════════════════╡ + │ 1999-07-21 05:20:16.987654 CES… ┆ 2023-04-10 12:14:16.999 UTC │ + │ 2000-05-16 06:21:21.123465 CES… ┆ 2025-08-25 14:18:22.666 UTC │ + └─────────────────────────────────┴─────────────────────────────┘ + + >>> agnostic_datetime_selector_any_tz(df_pa) + pyarrow.Table + tstamp_berlin: timestamp[us, tz=Europe/Berlin] + tstamp_utc: timestamp[us, tz=UTC] + ---- + tstamp_berlin: [[1999-07-21 05:20:16.987654Z,2000-05-16 06:21:21.123465Z]] + tstamp_utc: [[2023-04-10 12:14:16.999000Z,2025-08-25 14:18:22.666000Z]] + """ + return Selector( + lambda plx: plx.selectors.datetime(time_unit=time_unit, time_zone=time_zone) + ) + + __all__ = [ "all", "boolean", diff --git a/narwhals/series_dt.py b/narwhals/series_dt.py index 5fea4ff5c..10f53128c 100644 --- a/narwhals/series_dt.py +++ b/narwhals/series_dt.py @@ -3,13 +3,13 @@ from typing import TYPE_CHECKING from typing import Any from typing import Generic -from typing import Literal from typing import TypeVar if TYPE_CHECKING: from typing_extensions import Self from narwhals.series import Series + from narwhals.typing import TimeUnit SeriesT = TypeVar("SeriesT", bound="Series[Any]") @@ -1212,7 +1212,7 @@ def convert_time_zone(self: Self, time_zone: str) -> SeriesT: self._narwhals_series._compliant_series.dt.convert_time_zone(time_zone) ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> SeriesT: + def timestamp(self: Self, time_unit: TimeUnit) -> SeriesT: """Return a timestamp in the given time unit. Arguments: diff --git a/narwhals/typing.py b/narwhals/typing.py index 859e98dff..808eeb873 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -249,6 +249,8 @@ def lit( "terabytes", ] +TimeUnit: TypeAlias = Literal["ns", "us", "ms", "s"] + class DTypes: Decimal: type[dtypes.Decimal] diff --git a/narwhals/utils.py b/narwhals/utils.py index c913fceba..6068ed433 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -2,6 +2,7 @@ import os import re +from datetime import timezone from enum import Enum from enum import auto from secrets import token_hex @@ -35,6 +36,8 @@ from narwhals.exceptions import InvalidOperationError if TYPE_CHECKING: + from collections.abc import Collection + from collections.abc import Set as AbstractSet from types import ModuleType import pandas as pd @@ -43,10 +46,12 @@ from narwhals.dataframe import DataFrame from narwhals.dataframe import LazyFrame + from narwhals.dtypes import Datetime from narwhals.series import Series from narwhals.typing import DTypes from narwhals.typing import IntoSeriesT from narwhals.typing import SizeUnit + from narwhals.typing import TimeUnit FrameOrSeriesT = TypeVar( "FrameOrSeriesT", bound=Union[LazyFrame[Any], DataFrame[Any], Series[Any]] @@ -1067,3 +1072,37 @@ def check_column_exists(columns: list[str], subset: list[str] | None) -> None: if subset is not None and (missing := set(subset).difference(columns)): msg = f"Column(s) {sorted(missing)} not found in {columns}" raise ColumnNotFoundError(msg) + + +def _parse_datetime_selector_to_datetimes( + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + version: Version, +) -> AbstractSet[Datetime]: + # Adapted from polars: https://github.com/pola-rs/polars/blob/725c96009e4c6cb6b05db7f7e33daf3330a4fa35/py-polars/polars/selectors.py#L1340-L1493 + time_units: list[TimeUnit] + if time_unit is None: + time_units = ["ms", "us", "ns"] + else: + time_units = [time_unit] if isinstance(time_unit, str) else list(time_unit) + + time_zones: list[str | timezone | None] + if time_zone is None: + time_zones = [None] + else: + time_zones = ( + [time_zone] if isinstance(time_zone, (str, timezone)) else list(time_zone) + ) + + if "*" in time_zones: + import zoneinfo + + time_zones.extend(list(zoneinfo.available_timezones())) + time_zones.remove("*") + + dtypes = import_dtypes_module(version=version) + return { + dtypes.Datetime(time_unit=tu, time_zone=tz) + for tu in time_units + for tz in time_zones + } From aedb28c2759225711d522059902c2aaf88fecb45 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 Jan 2025 00:30:18 +0100 Subject: [PATCH 02/12] unit test --- narwhals/_dask/selectors.py | 2 +- narwhals/stable/v1/selectors.py | 2 + tests/selectors_test.py | 153 ++++++++++++++++++++++++++------ 3 files changed, 129 insertions(+), 28 deletions(-) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 960c919fa..b42a610f6 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -99,7 +99,7 @@ def datetime( self: Self, time_unit: TimeUnit | Collection[TimeUnit] | None, time_zone: str | timezone | Collection[str | timezone | None] | None, - ) -> DaskSelector: + ) -> DaskSelector: # pragma: no cover from narwhals.utils import _parse_datetime_selector_to_datetimes datetime_dtypes = _parse_datetime_selector_to_datetimes( diff --git a/narwhals/stable/v1/selectors.py b/narwhals/stable/v1/selectors.py index 0d82484e9..5bd2ac938 100644 --- a/narwhals/stable/v1/selectors.py +++ b/narwhals/stable/v1/selectors.py @@ -4,6 +4,7 @@ from narwhals.selectors import boolean from narwhals.selectors import by_dtype from narwhals.selectors import categorical +from narwhals.selectors import datetime from narwhals.selectors import numeric from narwhals.selectors import string @@ -12,6 +13,7 @@ "boolean", "by_dtype", "categorical", + "datetime", "numeric", "string", ] diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 80aa64803..fb331f0e2 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -1,16 +1,16 @@ from __future__ import annotations +from datetime import datetime +from datetime import timezone +from typing import Literal + import pandas as pd import pyarrow as pa import pytest +from zoneinfo import ZoneInfo import narwhals.stable.v1 as nw -from narwhals.stable.v1.selectors import all -from narwhals.stable.v1.selectors import boolean -from narwhals.stable.v1.selectors import by_dtype -from narwhals.stable.v1.selectors import categorical -from narwhals.stable.v1.selectors import numeric -from narwhals.stable.v1.selectors import string +import narwhals.stable.v1.selectors as ncs from tests.utils import PYARROW_VERSION from tests.utils import Constructor from tests.utils import assert_equal_data @@ -27,34 +27,34 @@ def test_selectors(constructor: Constructor, request: pytest.FixtureRequest) -> if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(by_dtype([nw.Int64, nw.Float64]) + 1) + result = df.select(ncs.by_dtype([nw.Int64, nw.Float64]) + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} assert_equal_data(result, expected) def test_numeric(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(numeric() + 1) + result = df.select(ncs.numeric() + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} assert_equal_data(result, expected) def test_boolean(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(boolean()) + result = df.select(ncs.boolean()) expected = {"d": [True, False, True]} assert_equal_data(result, expected) def test_string(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(string()) + result = df.select(ncs.string()) expected = {"b": ["a", "b", "c"]} assert_equal_data(result, expected) @@ -72,22 +72,121 @@ def test_categorical( expected = {"b": ["a", "b", "c"]} df = nw.from_native(constructor(data)).with_columns(nw.col("b").cast(nw.Categorical)) - result = df.select(categorical()) + result = df.select(ncs.categorical()) assert_equal_data(result, expected) +def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ( + "pyspark" in str(constructor) + or "duckdb" in str(constructor) + or "dask" in str(constructor) + ): + request.applymarker(pytest.mark.xfail) + + ts1 = datetime(2000, 11, 20, 18, 12, 16, 600000) + ts2 = datetime(2020, 10, 30, 10, 20, 25, 123000) + + utc_tz = timezone.utc + berlin_tz = ZoneInfo("Europe/Berlin") + + data = { + "numeric": [3.14, 6.28], + "ts": [ts1, ts2], + "ts_utc": [ts1.astimezone(utc_tz), ts2.astimezone(utc_tz)], + "ts_berlin": [ts1.astimezone(berlin_tz), ts2.astimezone(berlin_tz)], + } + time_units: list[Literal["ns", "us", "ms", "s"]] = ["ms", "us", "ns"] + + df = nw.from_native(constructor(data)).select( + nw.col("numeric"), + *[ + nw.col("ts").cast(nw.Datetime(time_unit=tu)).alias(f"ts_{tu}") + for tu in time_units + ], + *[ + nw.col("ts_utc") + .cast(nw.Datetime(time_zone="UTC", time_unit=tu)) + .alias(f"ts_utc_{tu}") + for tu in time_units + ], + *[ + nw.col("ts_berlin") + .cast(nw.Datetime(time_zone="Europe/Berlin", time_unit=tu)) + .alias(f"ts_berlin_{tu}") + for tu in time_units + ], + ) + + assert df.select(ncs.datetime()).collect_schema().names() == [ + "ts_ms", + "ts_us", + "ts_ns", + "ts_utc_ms", + "ts_utc_us", + "ts_utc_ns", + "ts_berlin_ms", + "ts_berlin_us", + "ts_berlin_ns", + ] + assert df.select(ncs.datetime(time_unit="ms")).collect_schema().names() == [ + "ts_ms", + "ts_utc_ms", + "ts_berlin_ms", + ] + assert df.select(ncs.datetime(time_unit=["us", "ns"])).collect_schema().names() == [ + "ts_us", + "ts_ns", + "ts_utc_us", + "ts_utc_ns", + "ts_berlin_us", + "ts_berlin_ns", + ] + + assert df.select(ncs.datetime(time_zone=None)).collect_schema().names() == [ + "ts_ms", + "ts_us", + "ts_ns", + ] + assert df.select(ncs.datetime(time_zone="*")).collect_schema().names() == [ + "ts_utc_ms", + "ts_utc_us", + "ts_utc_ns", + "ts_berlin_ms", + "ts_berlin_us", + "ts_berlin_ns", + ] + assert df.select( + ncs.datetime(time_zone=[None, "Europe/Berlin"]) + ).collect_schema().names() == [ + "ts_ms", + "ts_us", + "ts_ns", + "ts_berlin_ms", + "ts_berlin_us", + "ts_berlin_ns", + ] + + assert df.select( + ncs.datetime(time_unit="ns", time_zone=[None, "Europe/Berlin"]) + ).collect_schema().names() == ["ts_ns", "ts_berlin_ns"] + assert df.select( + ncs.datetime(time_unit=["ms", "us"], time_zone=[None, "Europe/Berlin"]) + ).collect_schema().names() == ["ts_ms", "ts_us", "ts_berlin_ms", "ts_berlin_us"] + + @pytest.mark.parametrize( ("selector", "expected"), [ - (numeric() | boolean(), ["a", "c", "d"]), - (numeric() & boolean(), []), - (numeric() & by_dtype(nw.Int64), ["a"]), - (numeric() | by_dtype(nw.Int64), ["a", "c"]), - (~numeric(), ["b", "d"]), - (boolean() & True, ["d"]), - (boolean() | True, ["d"]), - (numeric() - 1, ["a", "c"]), - (all(), ["a", "b", "c", "d"]), + (ncs.numeric() | ncs.boolean(), ["a", "c", "d"]), + (ncs.numeric() & ncs.boolean(), []), + (ncs.numeric() & ncs.by_dtype(nw.Int64), ["a"]), + (ncs.numeric() | ncs.by_dtype(nw.Int64), ["a", "c"]), + (~ncs.numeric(), ["b", "d"]), + (ncs.boolean() & True, ["d"]), + (ncs.boolean() | True, ["d"]), + (ncs.numeric() - 1, ["a", "c"]), + (ncs.all(), ["a", "b", "c", "d"]), ], ) def test_set_ops( @@ -96,7 +195,7 @@ def test_set_ops( expected: list[str], request: pytest.FixtureRequest, ) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(selector).collect_schema().names() @@ -111,8 +210,8 @@ def test_set_ops_invalid( request.applymarker(pytest.mark.xfail) df = nw.from_native(invalid_constructor(data)) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 - numeric()) + df.select(1 - ncs.numeric()) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 | numeric()) + df.select(1 | ncs.numeric()) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 & numeric()) + df.select(1 & ncs.numeric()) From 438eb27c74b6eabd1a09aa07a09c98141b889fff Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 Jan 2025 10:37:59 +0100 Subject: [PATCH 03/12] use .dt.convert_time_zone first --- tests/selectors_test.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/selectors_test.py b/tests/selectors_test.py index fb331f0e2..97515c1f6 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -1,13 +1,11 @@ from __future__ import annotations from datetime import datetime -from datetime import timezone from typing import Literal import pandas as pd import pyarrow as pa import pytest -from zoneinfo import ZoneInfo import narwhals.stable.v1 as nw import narwhals.stable.v1.selectors as ncs @@ -76,6 +74,7 @@ def test_categorical( assert_equal_data(result, expected) +# @pytest.mark.filterwarnings("ignore:Found complex group-by expression:UserWarning") def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> None: if ( "pyspark" in str(constructor) @@ -84,17 +83,15 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N ): request.applymarker(pytest.mark.xfail) + if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,): + request.applymarker(pytest.mark.xfail) + ts1 = datetime(2000, 11, 20, 18, 12, 16, 600000) ts2 = datetime(2020, 10, 30, 10, 20, 25, 123000) - utc_tz = timezone.utc - berlin_tz = ZoneInfo("Europe/Berlin") - data = { "numeric": [3.14, 6.28], "ts": [ts1, ts2], - "ts_utc": [ts1.astimezone(utc_tz), ts2.astimezone(utc_tz)], - "ts_berlin": [ts1.astimezone(berlin_tz), ts2.astimezone(berlin_tz)], } time_units: list[Literal["ns", "us", "ms", "s"]] = ["ms", "us", "ns"] @@ -105,13 +102,15 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N for tu in time_units ], *[ - nw.col("ts_utc") + nw.col("ts") + .dt.convert_time_zone("UTC") .cast(nw.Datetime(time_zone="UTC", time_unit=tu)) .alias(f"ts_utc_{tu}") for tu in time_units ], *[ - nw.col("ts_berlin") + nw.col("ts") + .dt.convert_time_zone("Europe/Berlin") .cast(nw.Datetime(time_zone="Europe/Berlin", time_unit=tu)) .alias(f"ts_berlin_{tu}") for tu in time_units From ef1d271daaa58b5f247466d6654d0bc1264b962d Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 Jan 2025 10:58:30 +0100 Subject: [PATCH 04/12] maybe with backport --- narwhals/_dask/group_by.py | 4 ++-- narwhals/_dask/utils.py | 2 +- narwhals/utils.py | 9 ++++++++- tests/selectors_test.py | 25 ++++++++++++------------- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index 2fbb4edb6..b92cf471b 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -51,7 +51,7 @@ def var( try: import dask.dataframe.dask_expr as dx - except ModuleNotFoundError: + except ModuleNotFoundError: # pragma: no cover import dask_expr as dx return partial(dx._groupby.GroupBy.var, ddof=ddof) @@ -66,7 +66,7 @@ def std( try: import dask.dataframe.dask_expr as dx - except ModuleNotFoundError: + except ModuleNotFoundError: # pragma: no cover import dask_expr as dx return partial(dx._groupby.GroupBy.std, ddof=ddof) diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index cd303d8ec..d14383869 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -89,7 +89,7 @@ def add_row_index( def validate_comparand(lhs: dx.Series, rhs: dx.Series) -> None: try: import dask.dataframe.dask_expr as dx - except ModuleNotFoundError: + except ModuleNotFoundError: # pragma: no cover import dask_expr as dx if not dx._expr.are_co_aligned(lhs._expr, rhs._expr): # pragma: no cover diff --git a/narwhals/utils.py b/narwhals/utils.py index 6068ed433..340f02993 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1095,7 +1095,14 @@ def _parse_datetime_selector_to_datetimes( ) if "*" in time_zones: - import zoneinfo + import sys + + if sys.version_info >= (3, 9): + import zoneinfo + else: # pragma: no cover + # This code block is due to a typing issue with backports.zoneinfo package: + # https://github.com/pganssle/zoneinfo/issues/125 + from backports import zoneinfo time_zones.extend(list(zoneinfo.available_timezones())) time_zones.remove("*") diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 97515c1f6..7286ae563 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -74,7 +74,6 @@ def test_categorical( assert_equal_data(result, expected) -# @pytest.mark.filterwarnings("ignore:Found complex group-by expression:UserWarning") def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> None: if ( "pyspark" in str(constructor) @@ -103,9 +102,9 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N ], *[ nw.col("ts") - .dt.convert_time_zone("UTC") - .cast(nw.Datetime(time_zone="UTC", time_unit=tu)) - .alias(f"ts_utc_{tu}") + .dt.convert_time_zone("Europe/Lisbon") + .cast(nw.Datetime(time_zone="Europe/Lisbon", time_unit=tu)) + .alias(f"ts_lisbon_{tu}") for tu in time_units ], *[ @@ -121,23 +120,23 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N "ts_ms", "ts_us", "ts_ns", - "ts_utc_ms", - "ts_utc_us", - "ts_utc_ns", + "ts_lisbon_ms", + "ts_lisbon_us", + "ts_lisbon_ns", "ts_berlin_ms", "ts_berlin_us", "ts_berlin_ns", ] assert df.select(ncs.datetime(time_unit="ms")).collect_schema().names() == [ "ts_ms", - "ts_utc_ms", + "ts_lisbon_ms", "ts_berlin_ms", ] assert df.select(ncs.datetime(time_unit=["us", "ns"])).collect_schema().names() == [ "ts_us", "ts_ns", - "ts_utc_us", - "ts_utc_ns", + "ts_lisbon_us", + "ts_lisbon_ns", "ts_berlin_us", "ts_berlin_ns", ] @@ -148,9 +147,9 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N "ts_ns", ] assert df.select(ncs.datetime(time_zone="*")).collect_schema().names() == [ - "ts_utc_ms", - "ts_utc_us", - "ts_utc_ns", + "ts_lisbon_ms", + "ts_lisbon_us", + "ts_lisbon_ns", "ts_berlin_ms", "ts_berlin_us", "ts_berlin_ns", From 4f05167b49cf76f9e74e3213bf9bbcedfbe3f72f Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 Jan 2025 11:02:23 +0100 Subject: [PATCH 05/12] forgot pyproject ;) --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c461d7031..283a2e400 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,9 @@ build-backend = "hatchling.build" [project] name = "narwhals" version = "1.22.0" -dependencies = [] +dependencies = [ + 'backports.zoneinfo;python_version<"3.9"', +] requires-python = ">=3.8" authors = [ { name = "Marco Gorelli", email = "33491632+MarcoGorelli@users.noreply.github.com" }, From e5493e1018cf2929e03437c3928f8ca1e969c260 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 Jan 2025 11:26:56 +0100 Subject: [PATCH 06/12] fail pyarrow on windows, use replace_time_zone --- narwhals/translate.py | 3 +-- tests/selectors_test.py | 10 +++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/narwhals/translate.py b/narwhals/translate.py index 6ed82326d..0e49eb204 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -687,8 +687,7 @@ def _from_native_impl( # noqa: PLR0915 raise TypeError(msg) return native_object if ( - parse_version(get_dask().__version__) <= (2024, 12, 1) - and get_dask_expr() is None + parse_version(get_dask().__version__) < (2025, 1) and get_dask_expr() is None ): # pragma: no cover msg = "Please install dask-expr" raise ImportError(msg) diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 7286ae563..5066a8bd6 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -12,6 +12,7 @@ from tests.utils import PYARROW_VERSION from tests.utils import Constructor from tests.utils import assert_equal_data +from tests.utils import is_windows data = { "a": [1, 1, 2], @@ -79,12 +80,11 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N "pyspark" in str(constructor) or "duckdb" in str(constructor) or "dask" in str(constructor) + or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) + or ("pyarrow" in str(constructor) and is_windows()) ): request.applymarker(pytest.mark.xfail) - if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,): - request.applymarker(pytest.mark.xfail) - ts1 = datetime(2000, 11, 20, 18, 12, 16, 600000) ts2 = datetime(2020, 10, 30, 10, 20, 25, 123000) @@ -102,14 +102,14 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N ], *[ nw.col("ts") - .dt.convert_time_zone("Europe/Lisbon") + .dt.replace_time_zone("Europe/Lisbon") .cast(nw.Datetime(time_zone="Europe/Lisbon", time_unit=tu)) .alias(f"ts_lisbon_{tu}") for tu in time_units ], *[ nw.col("ts") - .dt.convert_time_zone("Europe/Berlin") + .dt.replace_time_zone("Europe/Berlin") .cast(nw.Datetime(time_zone="Europe/Berlin", time_unit=tu)) .alias(f"ts_berlin_{tu}") for tu in time_units From 78991035e5502a2234895efd3d4e445d378313c3 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 18 Jan 2025 11:43:00 +0100 Subject: [PATCH 07/12] fail for old pandas --- tests/selectors_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 5066a8bd6..11611c1e4 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -9,6 +9,7 @@ import narwhals.stable.v1 as nw import narwhals.stable.v1.selectors as ncs +from tests.utils import PANDAS_VERSION from tests.utils import PYARROW_VERSION from tests.utils import Constructor from tests.utils import assert_equal_data @@ -82,6 +83,7 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N or "dask" in str(constructor) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("pyarrow" in str(constructor) and is_windows()) + or ("pandas" in str(constructor) and PANDAS_VERSION < (2,)) ): request.applymarker(pytest.mark.xfail) From 6b0b006877aed54ce204644859ae6e17afa2f492 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 19 Jan 2025 11:48:40 +0100 Subject: [PATCH 08/12] add is_order_dependent arg --- narwhals/selectors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/narwhals/selectors.py b/narwhals/selectors.py index 33970f9ba..c3533e9ef 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -487,7 +487,8 @@ def datetime( tstamp_utc: [[2023-04-10 12:14:16.999000Z,2025-08-25 14:18:22.666000Z]] """ return Selector( - lambda plx: plx.selectors.datetime(time_unit=time_unit, time_zone=time_zone) + lambda plx: plx.selectors.datetime(time_unit=time_unit, time_zone=time_zone), + is_order_dependent=False, ) From 57672ec9dd9a5fcc7f1741ce2153e185277e36b3 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 20 Jan 2025 09:32:08 +0100 Subject: [PATCH 09/12] it passes :) --- tests/expr_and_series/dt/datetime_attributes_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 3c8a16b7d..ad5f8dc3f 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -118,7 +118,6 @@ def test_to_date(request: pytest.FixtureRequest, constructor: Constructor) -> No "pandas_nullable_constructor", "cudf", "modin_constructor", - "pyspark", ) ): request.applymarker(pytest.mark.xfail) From e5ad0c011f053e6f596ef7efb0a2b6e8f808cca2 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 20 Jan 2025 09:34:16 +0100 Subject: [PATCH 10/12] force pytest to run with utc --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 283a2e400..55451db75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -176,7 +176,8 @@ xfail_strict = true markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"] env = [ "MODIN_ENGINE=python", - "PYARROW_IGNORE_TIMEZONE=1" + "PYARROW_IGNORE_TIMEZONE=1", + "TZ='UTC'", ] [tool.coverage.run] From 319624f7171005a87a37150da8811fade258859c Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 20 Jan 2025 09:45:09 +0100 Subject: [PATCH 11/12] Update pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 55451db75..d83802e8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -177,7 +177,6 @@ markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"] env = [ "MODIN_ENGINE=python", "PYARROW_IGNORE_TIMEZONE=1", - "TZ='UTC'", ] [tool.coverage.run] From cb4823ffcfb192e8ab960ffe2934de1d115a7cbf Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 20 Jan 2025 15:02:05 +0100 Subject: [PATCH 12/12] fix up --- narwhals/selectors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/narwhals/selectors.py b/narwhals/selectors.py index 5b3e0b56a..e58e1e3eb 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -517,6 +517,8 @@ def datetime( return Selector( lambda plx: plx.selectors.datetime(time_unit=time_unit, time_zone=time_zone), is_order_dependent=False, + changes_length=False, + aggregates=False, )