diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md index 6480a869e..e75c6090d 100644 --- a/docs/api-reference/selectors.md +++ b/docs/api-reference/selectors.md @@ -15,6 +15,7 @@ set operations are supported: - boolean - by_dtype - categorical + - datetime - numeric - string show_root_heading: false diff --git a/docs/api-reference/typing.md b/docs/api-reference/typing.md index 9791b50a4..4097b6378 100644 --- a/docs/api-reference/typing.md +++ b/docs/api-reference/typing.md @@ -17,6 +17,8 @@ Narwhals comes fully statically typed. In addition to `nw.DataFrame`, `nw.Expr`, - IntoFrameT - IntoSeries - IntoSeriesT + - SizeUnit + - TimeUnit show_source: false show_bases: false diff --git a/narwhals/_arrow/expr_dt.py b/narwhals/_arrow/expr_dt.py index 1438eba81..30d2e22c8 100644 --- a/narwhals/_arrow/expr_dt.py +++ b/narwhals/_arrow/expr_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._expression_parsing import reuse_series_namespace_implementation @@ -9,6 +8,7 @@ from typing_extensions import Self from narwhals._arrow.expr import ArrowExpr + from narwhals.typing import TimeUnit class ArrowExprDateTimeNamespace: @@ -30,7 +30,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr: self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowExpr: + def timestamp(self: Self, time_unit: TimeUnit) -> ArrowExpr: return reuse_series_namespace_implementation( self._compliant_expr, "dt", "timestamp", time_unit=time_unit ) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 48e837ec7..7d85ee07d 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -10,11 +10,16 @@ from narwhals.utils import import_dtypes_module if TYPE_CHECKING: + from collections.abc import Collection + from collections.abc import Container + from 
datetime import timezone + from typing_extensions import Self from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.series import ArrowSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -26,7 +31,7 @@ def __init__( self._implementation = Implementation.PYARROW self._version = version - def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> ArrowSelector: + def by_dtype(self: Self, dtypes: Container[DType | type[DType]]) -> ArrowSelector: def func(df: ArrowDataFrame) -> list[ArrowSeries]: return [df[col] for col in df.columns if df.schema[col] in dtypes] @@ -85,6 +90,18 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + ) -> ArrowSelector: + from narwhals.utils import _parse_datetime_selector_to_datetimes + + datetime_dtypes = _parse_datetime_selector_to_datetimes( + time_unit=time_unit, time_zone=time_zone, version=self._version + ) + return self.by_dtype(datetime_dtypes) + class ArrowSelector(ArrowExpr): def __repr__(self: Self) -> str: # pragma: no cover diff --git a/narwhals/_arrow/series_dt.py b/narwhals/_arrow/series_dt.py index 6a2f53dc7..7f10324de 100644 --- a/narwhals/_arrow/series_dt.py +++ b/narwhals/_arrow/series_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal import pyarrow as pa import pyarrow.compute as pc @@ -13,6 +12,7 @@ from typing_extensions import Self from narwhals._arrow.series import ArrowSeries + from narwhals.typing import TimeUnit class ArrowSeriesDateTimeNamespace: @@ -49,7 +49,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries: return self._compliant_series._from_native_series(result) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowSeries: + def 
timestamp(self: Self, time_unit: TimeUnit) -> ArrowSeries: s = self._compliant_series._native_series dtype = self._compliant_series.dtype dtypes = import_dtypes_module(self._compliant_series._version) diff --git a/narwhals/_dask/expr_dt.py b/narwhals/_dask/expr_dt.py index 177f2c236..1a67fe4dd 100644 --- a/narwhals/_dask/expr_dt.py +++ b/narwhals/_dask/expr_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime @@ -16,6 +15,7 @@ import dask_expr as dx from narwhals._dask.expr import DaskExpr + from narwhals.typing import TimeUnit class DaskExprDateTimeNamespace: @@ -143,8 +143,8 @@ def func(s: dx.Series, time_zone: str) -> dx.Series: returns_scalar=self._compliant_expr._returns_scalar, ) - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: - def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"] = "us") -> dx.Series: + def timestamp(self, time_unit: TimeUnit) -> DaskExpr: + def func(s: dx.Series, time_unit: TimeUnit) -> dx.Series: dtype = native_to_narwhals_dtype( s, self._compliant_expr._version, Implementation.DASK ) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 703e24860..b42a610f6 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -12,10 +12,16 @@ import dask.dataframe.dask_expr as dx except ModuleNotFoundError: import dask_expr as dx + + from collections.abc import Collection + from collections.abc import Container + from datetime import timezone + from typing_extensions import Self from narwhals._dask.dataframe import DaskLazyFrame from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -26,7 +32,7 @@ def __init__( self._backend_version = backend_version self._version = version - def by_dtype(self: Self, dtypes: list[DType 
| type[DType]]) -> DaskSelector: + def by_dtype(self: Self, dtypes: Container[DType | type[DType]]) -> DaskSelector: def func(df: DaskLazyFrame) -> list[Any]: return [ df._native_frame[col] for col in df.columns if df.schema[col] in dtypes @@ -89,6 +95,18 @@ def func(df: DaskLazyFrame) -> list[Any]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + ) -> DaskSelector: # pragma: no cover + from narwhals.utils import _parse_datetime_selector_to_datetimes + + datetime_dtypes = _parse_datetime_selector_to_datetimes( + time_unit=time_unit, time_zone=time_zone, version=self._version + ) + return self.by_dtype(datetime_dtypes) + class DaskSelector(DaskExpr): def __repr__(self: Self) -> str: # pragma: no cover diff --git a/narwhals/_pandas_like/expr_dt.py b/narwhals/_pandas_like/expr_dt.py index 13e94080e..6525d78c3 100644 --- a/narwhals/_pandas_like/expr_dt.py +++ b/narwhals/_pandas_like/expr_dt.py @@ -1,12 +1,12 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._expression_parsing import reuse_series_namespace_implementation if TYPE_CHECKING: from narwhals._pandas_like.expr import PandasLikeExpr + from narwhals.typing import TimeUnit class PandasLikeExprDateTimeNamespace: @@ -99,7 +99,7 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeExpr: self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone ) - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeExpr: + def timestamp(self, time_unit: TimeUnit) -> PandasLikeExpr: return reuse_series_namespace_implementation( self._compliant_expr, "dt", "timestamp", time_unit=time_unit ) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index e7d7fe18d..24b9b0058 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -8,16 +8,23 @@ from 
narwhals.utils import import_dtypes_module if TYPE_CHECKING: + from collections.abc import Collection + from collections.abc import Container + from datetime import timezone + + from typing_extensions import Self + from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.series import PandasLikeSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Implementation from narwhals.utils import Version class PandasSelectorNamespace: def __init__( - self, + self: Self, *, implementation: Implementation, backend_version: tuple[int, ...], @@ -27,7 +34,7 @@ def __init__( self._backend_version = backend_version self._version = version - def by_dtype(self, dtypes: list[DType | type[DType]]) -> PandasSelector: + def by_dtype(self: Self, dtypes: Container[DType | type[DType]]) -> PandasSelector: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [df[col] for col in df.columns if df.schema[col] in dtypes] @@ -43,7 +50,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: kwargs={"dtypes": dtypes}, ) - def numeric(self) -> PandasSelector: + def numeric(self: Self) -> PandasSelector: dtypes = import_dtypes_module(self._version) return self.by_dtype( [ @@ -60,19 +67,19 @@ def numeric(self) -> PandasSelector: ], ) - def categorical(self) -> PandasSelector: + def categorical(self: Self) -> PandasSelector: dtypes = import_dtypes_module(self._version) return self.by_dtype([dtypes.Categorical]) - def string(self) -> PandasSelector: + def string(self: Self) -> PandasSelector: dtypes = import_dtypes_module(self._version) return self.by_dtype([dtypes.String]) - def boolean(self) -> PandasSelector: + def boolean(self: Self) -> PandasSelector: dtypes = import_dtypes_module(self._version) return self.by_dtype([dtypes.Boolean]) - def all(self) -> PandasSelector: + def all(self: Self) -> PandasSelector: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [df[col] 
for col in df.columns] @@ -88,6 +95,18 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + ) -> PandasSelector: + from narwhals.utils import _parse_datetime_selector_to_datetimes + + datetime_dtypes = _parse_datetime_selector_to_datetimes( + time_unit=time_unit, time_zone=time_zone, version=self._version + ) + return self.by_dtype(datetime_dtypes) + class PandasSelector(PandasLikeExpr): def __repr__(self) -> str: # pragma: no cover diff --git a/narwhals/_pandas_like/series_dt.py b/narwhals/_pandas_like/series_dt.py index 2bf768203..f05136ea5 100644 --- a/narwhals/_pandas_like/series_dt.py +++ b/narwhals/_pandas_like/series_dt.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING from typing import Any -from typing import Literal from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime @@ -12,6 +11,7 @@ if TYPE_CHECKING: from narwhals._pandas_like.series import PandasLikeSeries + from narwhals.typing import TimeUnit class PandasLikeSeriesDateTimeNamespace: @@ -206,7 +206,7 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeSeries: result = self._compliant_series._native_series.dt.tz_convert(time_zone) return self._compliant_series._from_native_series(result) - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSeries: + def timestamp(self, time_unit: TimeUnit) -> PandasLikeSeries: s = self._compliant_series._native_series dtype = self._compliant_series.dtype is_pyarrow_dtype = "pyarrow" in str(self._compliant_series._native_series.dtype) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 6c813127d..37f8cc207 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING from typing 
import Any from typing import Iterable -from typing import Literal from typing import Sequence from typing import TypeVar @@ -23,6 +22,7 @@ from narwhals._pandas_like.expr import PandasLikeExpr from narwhals._pandas_like.series import PandasLikeSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version ExprT = TypeVar("ExprT", bound=PandasLikeExpr) @@ -449,13 +449,13 @@ def non_object_native_to_narwhals_dtype( if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( match_ := PATTERN_PA_DATETIME.match(dtype) ): - dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] + dt_time_unit: TimeUnit = match_.group("time_unit") # type: ignore[assignment] dt_time_zone: str | None = match_.group("time_zone") return dtypes.Datetime(dt_time_unit, dt_time_zone) if (match_ := PATTERN_PD_DURATION.match(dtype)) or ( match_ := PATTERN_PA_DURATION.match(dtype) ): - du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] + du_time_unit: TimeUnit = match_.group("time_unit") # type: ignore[assignment] return dtypes.Duration(du_time_unit) if dtype == "date32[day][pyarrow]": return dtypes.Date() diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index d9d711ea0..5bf6f9255 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -16,6 +16,9 @@ from narwhals.utils import Implementation if TYPE_CHECKING: + from collections.abc import Collection + from datetime import timezone + from typing_extensions import Self from narwhals._polars.dataframe import PolarsDataFrame @@ -23,6 +26,7 @@ from narwhals._polars.expr import PolarsExpr from narwhals._polars.typing import IntoPolarsExpr from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -285,3 +289,18 @@ def all(self: Self) -> PolarsExpr: version=self._version, backend_version=self._backend_version, ) + + 
def datetime( + self: Self, + time_unit: TimeUnit | Collection[TimeUnit] | None, + time_zone: str | timezone | Collection[str | timezone | None] | None, + ) -> PolarsExpr: + import polars as pl + + from narwhals._polars.expr import PolarsExpr + + return PolarsExpr( + pl.selectors.datetime(time_unit=time_unit, time_zone=time_zone), # type: ignore[arg-type] + version=self._version, + backend_version=self._backend_version, + ) diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 94510d2ef..929830316 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -3,7 +3,6 @@ from functools import lru_cache from typing import TYPE_CHECKING from typing import Any -from typing import Literal from typing import TypeVar from typing import overload @@ -17,6 +16,7 @@ from narwhals._polars.expr import PolarsExpr from narwhals._polars.series import PolarsSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version T = TypeVar("T") @@ -111,11 +111,11 @@ def native_to_narwhals_dtype( if dtype == pl.Date: return dtypes.Date() if dtype == pl.Datetime: - dt_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + dt_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") dt_time_zone = getattr(dtype, "time_zone", None) return dtypes.Datetime(time_unit=dt_time_unit, time_zone=dt_time_zone) if dtype == pl.Duration: - du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + du_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") return dtypes.Duration(time_unit=du_time_unit) if dtype == pl.Struct: return dtypes.Struct( @@ -186,12 +186,12 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> pl if dtype == dtypes.Date: return pl.Date() if dtype == dtypes.Datetime or isinstance(dtype, dtypes.Datetime): - dt_time_unit: Literal["ms", "us", "ns"] = getattr(dtype, "time_unit", "us") + dt_time_unit: TimeUnit = getattr(dtype, "time_unit", 
"us") dt_time_zone = getattr(dtype, "time_zone", None) - return pl.Datetime(dt_time_unit, dt_time_zone) + return pl.Datetime(dt_time_unit, dt_time_zone) # type: ignore[arg-type] if dtype == dtypes.Duration or isinstance(dtype, dtypes.Duration): - du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") - return pl.Duration(time_unit=du_time_unit) + du_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") + return pl.Duration(time_unit=du_time_unit) # type: ignore[arg-type] if dtype == dtypes.List: return pl.List(narwhals_to_native_dtype(dtype.inner, version)) # type: ignore[union-attr] if dtype == dtypes.Struct: diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 57ee762eb..68ea212ac 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -9,11 +9,12 @@ if TYPE_CHECKING: from typing import Iterator - from typing import Literal from typing import Sequence from typing_extensions import Self + from narwhals.typing import TimeUnit + def _validate_dtype(dtype: DType | type[DType]) -> None: if not isinstance_or_issubclass(dtype, DType): @@ -437,7 +438,7 @@ class Datetime(TemporalType): def __init__( self: Self, - time_unit: Literal["us", "ns", "ms", "s"] = "us", + time_unit: TimeUnit = "us", time_zone: str | timezone | None = None, ) -> None: if time_unit not in {"s", "ms", "us", "ns"}: @@ -500,7 +501,7 @@ class Duration(TemporalType): def __init__( self: Self, - time_unit: Literal["us", "ns", "ms", "s"] = "us", + time_unit: TimeUnit = "us", ) -> None: if time_unit not in ("s", "ms", "us", "ns"): msg = ( diff --git a/narwhals/expr_dt.py b/narwhals/expr_dt.py index 6ea1fbbdd..cf0ecfad7 100644 --- a/narwhals/expr_dt.py +++ b/narwhals/expr_dt.py @@ -2,13 +2,13 @@ from typing import TYPE_CHECKING from typing import Generic -from typing import Literal from typing import TypeVar if TYPE_CHECKING: from typing_extensions import Self from narwhals.expr import Expr + from narwhals.typing import TimeUnit ExprT = TypeVar("ExprT", bound="Expr") @@ 
-1405,7 +1405,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: aggregates=self._expr._aggregates, ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: + def timestamp(self: Self, time_unit: TimeUnit = "us") -> ExprT: """Return a timestamp in the given time unit. Arguments: diff --git a/narwhals/selectors.py b/narwhals/selectors.py index e67424281..e58e1e3eb 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -1,10 +1,17 @@ from __future__ import annotations +from typing import TYPE_CHECKING from typing import Any from narwhals.expr import Expr from narwhals.utils import flatten +if TYPE_CHECKING: + from collections.abc import Collection + from datetime import timezone + + from narwhals.typing import TimeUnit + class Selector(Expr): ... @@ -19,29 +26,34 @@ def by_dtype(*dtypes: Any) -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [4.1, 2.3]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function to select int64 and float64 dtypes and multiplies each value by 2: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.by_dtype(nw.Int64, nw.Float64) * 2) + >>> def agnostic_select_by_dtype(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... 
return df_nw.select(ncs.by_dtype(nw.Int64, nw.Float64) * 2).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_by_dtype`: - >>> func(df_pd) + >>> agnostic_select_by_dtype(df_pd) a c 0 2 8.2 1 4 4.6 - >>> func(df_pl) + + >>> agnostic_select_by_dtype(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ c │ @@ -51,6 +63,14 @@ def by_dtype(*dtypes: Any) -> Expr: │ 2 ┆ 8.2 │ │ 4 ┆ 4.6 │ └─────┴─────┘ + + >>> agnostic_select_by_dtype(df_pa) + pyarrow.Table + a: int64 + c: double + ---- + a: [[2,4]] + c: [[8.2,4.6]] """ return Selector( lambda plx: plx.selectors.by_dtype(flatten(dtypes)), @@ -67,29 +87,34 @@ def numeric() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [4.1, 2.3]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function to select numeric dtypes and multiplies each value by 2: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.numeric() * 2) + >>> def agnostic_select_numeric(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... 
return df_nw.select(ncs.numeric() * 2).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_numeric`: - >>> func(df_pd) + >>> agnostic_select_numeric(df_pd) a c 0 2 8.2 1 4 4.6 - >>> func(df_pl) + + >>> agnostic_select_numeric(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ c │ @@ -99,6 +124,14 @@ def numeric() -> Expr: │ 2 ┆ 8.2 │ │ 4 ┆ 4.6 │ └─────┴─────┘ + + >>> agnostic_select_numeric(df_pa) + pyarrow.Table + a: int64 + c: double + ---- + a: [[2,4]] + c: [[8.2,4.6]] """ return Selector( lambda plx: plx.selectors.numeric(), @@ -115,29 +148,33 @@ def boolean() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [False, True]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function to select boolean - dtypes: + Let's define a dataframe-agnostic function to select boolean dtypes: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.boolean()) + >>> def agnostic_select_boolean(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... 
return df_nw.select(ncs.boolean()).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_boolean`: - >>> func(df_pd) + >>> agnostic_select_boolean(df_pd) c 0 False 1 True - >>> func(df_pl) + + >>> agnostic_select_boolean(df_pl) shape: (2, 1) ┌───────┐ │ c │ @@ -147,6 +184,12 @@ def boolean() -> Expr: │ false │ │ true │ └───────┘ + + >>> agnostic_select_boolean(df_pa) + pyarrow.Table + c: bool + ---- + c: [[false,true]] """ return Selector( lambda plx: plx.selectors.boolean(), @@ -163,29 +206,33 @@ def string() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [False, True]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function to select string - dtypes: + Let's define a dataframe-agnostic function to select string dtypes: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.string()) + >>> def agnostic_select_string(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... 
return df_nw.select(ncs.string()).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_string`: - >>> func(df_pd) + >>> agnostic_select_string(df_pd) b 0 x 1 y - >>> func(df_pl) + + >>> agnostic_select_string(df_pl) shape: (2, 1) ┌─────┐ │ b │ @@ -195,6 +242,12 @@ def string() -> Expr: │ x │ │ y │ └─────┘ + + >>> agnostic_select_string(df_pa) + pyarrow.Table + b: string + ---- + b: [["x","y"]] """ return Selector( lambda plx: plx.selectors.string(), @@ -211,29 +264,36 @@ def categorical() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [False, True]} - >>> df_pd = pd.DataFrame(data).astype({"b": "category"}) - >>> df_pl = pl.DataFrame(data, schema_overrides={"b": pl.Categorical}) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function to select string - dtypes: + Let's define a dataframe-agnostic function that first converts column "b" to + categorical, and then selects categorical dtypes: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.categorical()) + >>> def agnostic_select_categorical(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native).with_columns( + ... b=nw.col("b").cast(nw.Categorical()) + ... ) + ... 
return df_nw.select(ncs.categorical()).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_categorical`: - >>> func(df_pd) + >>> agnostic_select_categorical(df_pd) b 0 x 1 y - >>> func(df_pl) + + >>> agnostic_select_categorical(df_pl) shape: (2, 1) ┌─────┐ │ b │ @@ -243,6 +303,14 @@ def categorical() -> Expr: │ x │ │ y │ └─────┘ + + >>> agnostic_select_categorical(df_pa) + pyarrow.Table + b: dictionary + ---- + b: [ -- dictionary: + ["x","y"] -- indices: + [0,1]] """ return Selector( lambda plx: plx.selectors.categorical(), @@ -259,38 +327,52 @@ def all() -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> import narwhals.selectors as ncs >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": ["x", "y"], "c": [False, True]} - >>> df_pd = pd.DataFrame(data).astype({"b": "category"}) - >>> df_pl = pl.DataFrame(data, schema_overrides={"b": pl.Categorical}) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function to select string - dtypes: + Let's define a dataframe-agnostic function to select all dtypes: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(ncs.all()) + >>> def agnostic_select_all(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = nw.from_native(df_native) + ... 
return df_nw.select(ncs.all()).to_native() - We can then pass either pandas or Polars dataframes: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_select_all`: - >>> func(df_pd) + >>> agnostic_select_all(df_pd) a b c 0 1 x False 1 2 y True - >>> func(df_pl) + + >>> agnostic_select_all(df_pl) shape: (2, 3) ┌─────┬─────┬───────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ - │ i64 ┆ cat ┆ bool │ + │ i64 ┆ str ┆ bool │ ╞═════╪═════╪═══════╡ │ 1 ┆ x ┆ false │ │ 2 ┆ y ┆ true │ └─────┴─────┴───────┘ + + >>> agnostic_select_all(df_pa) + pyarrow.Table + a: int64 + b: string + c: bool + ---- + a: [[1,2]] + b: [["x","y"]] + c: [[false,true]] """ return Selector( lambda plx: plx.selectors.all(), @@ -300,6 +382,146 @@ def all() -> Expr: ) +def datetime( + time_unit: TimeUnit | Collection[TimeUnit] | None = None, + time_zone: str | timezone | Collection[str | timezone | None] | None = ("*", None), +) -> Expr: + """Select all datetime columns, optionally filtering by time unit/zone. + + Arguments: + time_unit: One (or more) of the allowed timeunit precision strings, "ms", "us", + "ns" and "s". Omit to select columns with any valid timeunit. + time_zone: Specify which timezone(s) to select: + + * One or more timezone strings, as defined in zoneinfo (to see valid options + run `import zoneinfo; zoneinfo.available_timezones()` for a full list). + * Set `None` to select Datetime columns that do not have a timezone. + * Set `"*"` to select Datetime columns that have *any* timezone. + + Returns: + A new expression. + + Examples: + >>> from __future__ import annotations + >>> from datetime import datetime, timezone + >>> from zoneinfo import ZoneInfo + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> from narwhals.typing import IntoFrameT + >>> + >>> berlin_tz = ZoneInfo("Europe/Berlin") + >>> utc_tz = timezone.utc + >>> data = { + ... 
"tstamp_berlin": [ + ... datetime(1999, 7, 21, 5, 20, 16, 987654, tzinfo=berlin_tz), + ... datetime(2000, 5, 16, 6, 21, 21, 123465, tzinfo=berlin_tz), + ... ], + ... "tstamp_utc": [ + ... datetime(2023, 4, 10, 12, 14, 16, 999000, tzinfo=utc_tz), + ... datetime(2025, 8, 25, 14, 18, 22, 666000, tzinfo=utc_tz), + ... ], + ... "tstamp": [ + ... datetime(2000, 11, 20, 18, 12, 16, 600000), + ... datetime(2020, 10, 30, 10, 20, 25, 123000), + ... ], + ... "numeric": [3.14, 6.28], + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function to select datetime dtypes: + + >>> def agnostic_datetime_selector(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = ( + ... nw.from_native(df_native) + ... .with_columns( + ... tstamp_berlin=nw.col("tstamp_berlin").cast( + ... nw.Datetime(time_zone="Europe/Berlin") + ... ) + ... ) + ... .select(ncs.datetime()) + ... ) + ... return df_nw.to_native() + + Select all datetime columns: + + >>> pd.set_option("display.width", 0) + >>> agnostic_datetime_selector(df_pd) + tstamp_berlin tstamp_utc tstamp + 0 1999-07-21 05:20:16.987654+02:00 2023-04-10 12:14:16.999000+00:00 2000-11-20 18:12:16.600 + 1 2000-05-16 06:21:21.123465+02:00 2025-08-25 14:18:22.666000+00:00 2020-10-30 10:20:25.123 + + >>> agnostic_datetime_selector(df_pl) + shape: (2, 3) + ┌─────────────────────────────────┬─────────────────────────────┬─────────────────────────┐ + │ tstamp_berlin ┆ tstamp_utc ┆ tstamp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs, Europe/Berlin] ┆ datetime[μs, UTC] ┆ datetime[μs] │ + ╞═════════════════════════════════╪═════════════════════════════╪═════════════════════════╡ + │ 1999-07-21 05:20:16.987654 CES… ┆ 2023-04-10 12:14:16.999 UTC ┆ 2000-11-20 18:12:16.600 │ + │ 2000-05-16 06:21:21.123465 CES… ┆ 2025-08-25 14:18:22.666 UTC ┆ 2020-10-30 10:20:25.123 │ + └─────────────────────────────────┴─────────────────────────────┴─────────────────────────┘ + + >>> 
agnostic_datetime_selector(df_pa) + pyarrow.Table + tstamp_berlin: timestamp[us, tz=Europe/Berlin] + tstamp_utc: timestamp[us, tz=UTC] + tstamp: timestamp[us] + ---- + tstamp_berlin: [[1999-07-21 05:20:16.987654Z,2000-05-16 06:21:21.123465Z]] + tstamp_utc: [[2023-04-10 12:14:16.999000Z,2025-08-25 14:18:22.666000Z]] + tstamp: [[2000-11-20 18:12:16.600000,2020-10-30 10:20:25.123000]] + + Select all datetime columns that have any time_zone specification: + + >>> def agnostic_datetime_selector_any_tz(df_native: IntoFrameT) -> IntoFrameT: + ... df_nw = ( + ... nw.from_native(df_native) + ... .with_columns( + ... tstamp_berlin=nw.col("tstamp_berlin").cast( + ... nw.Datetime(time_zone="Europe/Berlin") + ... ) + ... ) + ... .select(ncs.datetime(time_zone="*")) + ... ) + ... return df_nw.to_native() + + >>> agnostic_datetime_selector_any_tz(df_pd) + tstamp_berlin tstamp_utc + 0 1999-07-21 05:20:16.987654+02:00 2023-04-10 12:14:16.999000+00:00 + 1 2000-05-16 06:21:21.123465+02:00 2025-08-25 14:18:22.666000+00:00 + + >>> agnostic_datetime_selector_any_tz(df_pl) + shape: (2, 2) + ┌─────────────────────────────────┬─────────────────────────────┐ + │ tstamp_berlin ┆ tstamp_utc │ + │ --- ┆ --- │ + │ datetime[μs, Europe/Berlin] ┆ datetime[μs, UTC] │ + ╞═════════════════════════════════╪═════════════════════════════╡ + │ 1999-07-21 05:20:16.987654 CES… ┆ 2023-04-10 12:14:16.999 UTC │ + │ 2000-05-16 06:21:21.123465 CES… ┆ 2025-08-25 14:18:22.666 UTC │ + └─────────────────────────────────┴─────────────────────────────┘ + + >>> agnostic_datetime_selector_any_tz(df_pa) + pyarrow.Table + tstamp_berlin: timestamp[us, tz=Europe/Berlin] + tstamp_utc: timestamp[us, tz=UTC] + ---- + tstamp_berlin: [[1999-07-21 05:20:16.987654Z,2000-05-16 06:21:21.123465Z]] + tstamp_utc: [[2023-04-10 12:14:16.999000Z,2025-08-25 14:18:22.666000Z]] + """ + return Selector( + lambda plx: plx.selectors.datetime(time_unit=time_unit, time_zone=time_zone), + is_order_dependent=False, + changes_length=False, + 
aggregates=False, + ) + + __all__ = [ "all", "boolean", diff --git a/narwhals/series_dt.py b/narwhals/series_dt.py index 5fea4ff5c..10f53128c 100644 --- a/narwhals/series_dt.py +++ b/narwhals/series_dt.py @@ -3,13 +3,13 @@ from typing import TYPE_CHECKING from typing import Any from typing import Generic -from typing import Literal from typing import TypeVar if TYPE_CHECKING: from typing_extensions import Self from narwhals.series import Series + from narwhals.typing import TimeUnit SeriesT = TypeVar("SeriesT", bound="Series[Any]") @@ -1212,7 +1212,7 @@ def convert_time_zone(self: Self, time_zone: str) -> SeriesT: self._narwhals_series._compliant_series.dt.convert_time_zone(time_zone) ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> SeriesT: + def timestamp(self: Self, time_unit: TimeUnit) -> SeriesT: """Return a timestamp in the given time unit. Arguments: diff --git a/narwhals/stable/v1/selectors.py b/narwhals/stable/v1/selectors.py index 0d82484e9..5bd2ac938 100644 --- a/narwhals/stable/v1/selectors.py +++ b/narwhals/stable/v1/selectors.py @@ -4,6 +4,7 @@ from narwhals.selectors import boolean from narwhals.selectors import by_dtype from narwhals.selectors import categorical +from narwhals.selectors import datetime from narwhals.selectors import numeric from narwhals.selectors import string @@ -12,6 +13,7 @@ "boolean", "by_dtype", "categorical", + "datetime", "numeric", "string", ] diff --git a/narwhals/translate.py b/narwhals/translate.py index 9c455055a..80b6b188d 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -639,8 +639,7 @@ def _from_native_impl( # noqa: PLR0915 raise TypeError(msg) return native_object if ( - parse_version(get_dask().__version__) <= (2024, 12, 1) - and get_dask_expr() is None + parse_version(get_dask().__version__) < (2025, 1) and get_dask_expr() is None ): # pragma: no cover msg = "Please install dask-expr" raise ImportError(msg) diff --git a/narwhals/typing.py b/narwhals/typing.py index 
def _parse_datetime_selector_to_datetimes(
    time_unit: TimeUnit | Collection[TimeUnit] | None,
    time_zone: str | timezone | Collection[str | timezone | None] | None,
    version: Version,
) -> AbstractSet[Datetime]:
    """Expand a datetime-selector specification into concrete `Datetime` dtypes.

    Arguments:
        time_unit: A single time unit, a collection of time units, or `None`.
            `None` expands to every unit listed in `TimeUnit`.
        time_zone: A single time zone (string or `datetime.timezone`), a
            collection of time zones (entries may be `None`), or `None`.
            `None` yields only dtypes built with `time_zone=None`. The string
            `"*"` is a wildcard that is replaced by every zone name returned
            by `zoneinfo.available_timezones()`.
        version: Forwarded to `import_dtypes_module` to pick the dtypes module
            whose `Datetime` class is instantiated.

    Returns:
        A set holding one `Datetime` dtype for each (time unit, time zone)
        combination.
    """
    # Adapted from polars: https://github.com/pola-rs/polars/blob/725c96009e4c6cb6b05db7f7e33daf3330a4fa35/py-polars/polars/selectors.py#L1340-L1493
    time_units: list[TimeUnit]
    if time_unit is None:
        # Keep in sync with `TimeUnit`: leaving "s" out would make the
        # "any time unit" default skip second-resolution columns.
        time_units = ["ms", "us", "ns", "s"]
    elif isinstance(time_unit, str):
        time_units = [time_unit]
    else:
        time_units = list(time_unit)

    time_zones: list[str | timezone | None]
    if time_zone is None:
        time_zones = [None]
    elif isinstance(time_zone, (str, timezone)):
        time_zones = [time_zone]
    else:
        time_zones = list(time_zone)

    if "*" in time_zones:
        import sys

        if sys.version_info >= (3, 9):
            import zoneinfo
        else:  # pragma: no cover
            # This code block is due to a typing issue with backports.zoneinfo package:
            # https://github.com/pganssle/zoneinfo/issues/125
            from backports import zoneinfo

        # Drop every wildcard entry (not just the first one) so that no
        # literal "*" time zone ends up in the resulting dtypes.
        time_zones = [tz for tz in time_zones if tz != "*"]
        time_zones.extend(zoneinfo.available_timezones())

    dtypes = import_dtypes_module(version=version)
    return {
        dtypes.Datetime(time_unit=tu, time_zone=tz)
        for tu in time_units
        for tz in time_zones
    }
+import narwhals.stable.v1.selectors as ncs +from tests.utils import PANDAS_VERSION from tests.utils import PYARROW_VERSION from tests.utils import Constructor from tests.utils import assert_equal_data +from tests.utils import is_windows data = { "a": [1, 1, 2], @@ -27,34 +27,34 @@ def test_selectors(constructor: Constructor, request: pytest.FixtureRequest) -> if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(by_dtype([nw.Int64, nw.Float64]) + 1) + result = df.select(ncs.by_dtype([nw.Int64, nw.Float64]) + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} assert_equal_data(result, expected) def test_numeric(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(numeric() + 1) + result = df.select(ncs.numeric() + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} assert_equal_data(result, expected) def test_boolean(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(boolean()) + result = df.select(ncs.boolean()) expected = {"d": [True, False, True]} assert_equal_data(result, expected) def test_string(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(string()) + result = df.select(ncs.string()) expected = 
def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> None:
    """Check that `ncs.datetime` filters columns by time unit and time zone."""
    backend = str(constructor)
    unsupported = (
        "pyspark" in backend
        or "duckdb" in backend
        or "dask" in backend
        or ("pyarrow_table" in backend and PYARROW_VERSION < (12,))
        or ("pyarrow" in backend and is_windows())
        or ("pandas" in backend and PANDAS_VERSION < (2,))
    )
    if unsupported:
        request.applymarker(pytest.mark.xfail)

    first_ts = datetime(2000, 11, 20, 18, 12, 16, 600000)
    second_ts = datetime(2020, 10, 30, 10, 20, 25, 123000)
    frame_data = {
        "numeric": [3.14, 6.28],
        "ts": [first_ts, second_ts],
    }
    time_units: list[Literal["ns", "us", "ms", "s"]] = ["ms", "us", "ns"]

    def localized(zone: str, label: str) -> list[nw.Expr]:
        # One tz-aware column per time unit, named `ts_<label>_<unit>`.
        return [
            nw.col("ts")
            .dt.replace_time_zone(zone)
            .cast(nw.Datetime(time_zone=zone, time_unit=unit))
            .alias(f"ts_{label}_{unit}")
            for unit in time_units
        ]

    df = nw.from_native(constructor(frame_data)).select(
        nw.col("numeric"),
        *[
            nw.col("ts").cast(nw.Datetime(time_unit=unit)).alias(f"ts_{unit}")
            for unit in time_units
        ],
        *localized("Europe/Lisbon", "lisbon"),
        *localized("Europe/Berlin", "berlin"),
    )

    def selected(selector: Any) -> list[str]:
        # Column names that survive the given selector, in frame order.
        return df.select(selector).collect_schema().names()

    naive = ["ts_ms", "ts_us", "ts_ns"]
    lisbon = ["ts_lisbon_ms", "ts_lisbon_us", "ts_lisbon_ns"]
    berlin = ["ts_berlin_ms", "ts_berlin_us", "ts_berlin_ns"]

    # Filtering on time unit only.
    assert selected(ncs.datetime()) == naive + lisbon + berlin
    assert selected(ncs.datetime(time_unit="ms")) == [
        "ts_ms",
        "ts_lisbon_ms",
        "ts_berlin_ms",
    ]
    assert selected(ncs.datetime(time_unit=["us", "ns"])) == [
        "ts_us",
        "ts_ns",
        "ts_lisbon_us",
        "ts_lisbon_ns",
        "ts_berlin_us",
        "ts_berlin_ns",
    ]

    # Filtering on time zone only.
    assert selected(ncs.datetime(time_zone=None)) == naive
    assert selected(ncs.datetime(time_zone="*")) == lisbon + berlin
    assert selected(ncs.datetime(time_zone=[None, "Europe/Berlin"])) == naive + berlin

    # Filtering on both at once.
    assert selected(
        ncs.datetime(time_unit="ns", time_zone=[None, "Europe/Berlin"])
    ) == ["ts_ns", "ts_berlin_ns"]
    assert selected(
        ncs.datetime(time_unit=["ms", "us"], time_zone=[None, "Europe/Berlin"])
    ) == ["ts_ms", "ts_us", "ts_berlin_ms", "ts_berlin_us"]
df.select(selector).collect_schema().names() @@ -111,8 +210,8 @@ def test_set_ops_invalid( request.applymarker(pytest.mark.xfail) df = nw.from_native(invalid_constructor(data)) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 - numeric()) + df.select(1 - ncs.numeric()) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 | numeric()) + df.select(1 | ncs.numeric()) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 & numeric()) + df.select(1 & ncs.numeric())