feat: Add with_property to Column #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft · wants to merge 5 commits into base: main
dataframely/columns/_base.py (19 additions, 0 deletions)

@@ -18,6 +18,7 @@
warn_nullable_default_change,
)
from dataframely._polars import PolarsDataType
from dataframely.columns._utils import first_non_null
from dataframely.random import Generator

if sys.version_info >= (3, 11):
@@ -246,6 +247,24 @@ def col(self) -> pl.Expr:
"""Obtain a Polars column expression for the column."""
return pl.col(self.name)

def with_property(
self,
*,
nullable: bool | None = None,
primary_key: bool | None = None,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
) -> Self:
"""Create a copy of this column with updated properties."""
return self.__class__(
nullable=first_non_null(nullable, self.nullable, allow_null_response=True),
primary_key=first_non_null(primary_key, default=self.primary_key),
check=self.check if check is None else check,
alias=first_non_null(alias, self.alias, allow_null_response=True),
metadata=first_non_null(metadata, self.metadata, allow_null_response=True),
)

# ----------------------------------- SAMPLING ----------------------------------- #

def sample(self, generator: Generator, n: int = 1) -> pl.Series:
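
Not part of this diff: `first_non_null` is imported from dataframely/columns/_utils.py, which is left unchanged here. The sketch below is a guess at its behavior, inferred purely from the call sites above; the real signature in the repository may differ.

from typing import TypeVar

V = TypeVar("V")


def first_non_null(
    *values: V | None,
    default: V | None = None,
    allow_null_response: bool = False,
) -> V | None:
    # Return the first positional argument that is not None.
    for value in values:
        if value is not None:
            return value
    # Otherwise fall back to `default`; only return None when explicitly allowed.
    if default is not None:
        return default
    if allow_null_response:
        return None
    raise ValueError("all values are None and no default was provided")
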
dataframely/columns/_mixins.py (27 additions, 0 deletions)

@@ -7,6 +7,8 @@

import polars as pl

from ._utils import first_non_null

if TYPE_CHECKING: # pragma: no cover
from ._base import Column

@@ -80,6 +82,26 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
result["max_exclusive"] = expr < self.max_exclusive # type: ignore
return result

def with_property(
self,
*,
min: T | None = None,
min_exclusive: T | None = None,
max: T | None = None,
max_exclusive: T | None = None,
**kwargs: Any,
) -> Self:
new_column = super().with_property(**kwargs)
new_column.min = first_non_null(min, self.min, allow_null_response=True)
new_column.min_exclusive = first_non_null(
min_exclusive, self.min_exclusive, allow_null_response=True
)
new_column.max = first_non_null(max, self.max, allow_null_response=True)
new_column.max_exclusive = first_non_null(
max_exclusive, self.max_exclusive, allow_null_response=True
)
return new_column


# ------------------------------------ IS IN MIXIN ----------------------------------- #

@@ -98,3 +120,8 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
if self.is_in is not None:
result["is_in"] = expr.is_in(self.is_in)
return result

def with_property(self, *, is_in: Sequence[U] | None = None, **kwargs: Any) -> Self:
new_column = super().with_property(**kwargs)
new_column.is_in = first_non_null(is_in, self.is_in, allow_null_response=True)
return new_column
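
A hedged usage sketch for the mixin overrides above. `dy.Int64` and its constructor arguments are assumptions about the existing column API, not something this diff defines.

import dataframely as dy

# Assumed to exist: an integer column type using the OrdinalMixin bounds.
age = dy.Int64(nullable=False, min=0)

# Override the lower bound and add an exclusive upper bound; nullability
# and all other properties carry over unchanged.
adult_age = age.with_property(min=18, max_exclusive=120)

assert adult_age.nullable is False
assert adult_age.min == 18
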
dataframely/columns/any.py (27 additions, 0 deletions)

@@ -3,6 +3,13 @@

from __future__ import annotations

import sys

if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self

import polars as pl

from dataframely._compat import pa, sa, sa_mssql, sa_TypeEngine
@@ -11,6 +18,7 @@

from ._base import Check, Column
from ._registry import register
from ._utils import first_non_null


@register
@@ -79,3 +87,22 @@ def pyarrow_dtype(self) -> pa.DataType:

def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
return pl.repeat(None, n, dtype=pl.Null, eager=True)

def with_property(
self,
*,
nullable: bool | None = None,
primary_key: bool | None = None,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
) -> Self:
if nullable is not None and not nullable:
raise ValueError("Column `Any` must be nullable.")
if primary_key is not None and primary_key:
raise ValueError("Column `Any` can't be a primary key.")
return self.__class__(
check=check if check is not None else self.check,
alias=first_non_null(alias, self.alias, allow_null_response=True),
metadata=first_non_null(metadata, self.metadata, allow_null_response=True),
)
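
A small, hedged sketch of the intended behavior of the `Any` override (assuming the column type is exposed as `dy.Any` and accepts the usual base-class constructor arguments):

import dataframely as dy

col = dy.Any(alias="payload")
renamed = col.with_property(alias="raw_payload")  # fine: stays nullable, never a primary key

try:
    col.with_property(nullable=False)
except ValueError as exc:
    print(exc)  # Column `Any` must be nullable.
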
dataframely/columns/datetime.py (132 additions, 0 deletions)

@@ -4,8 +4,14 @@
from __future__ import annotations

import datetime as dt
import sys
from typing import Any, cast

if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self

import polars as pl
from polars._typing import TimeUnit

@@ -149,6 +155,36 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
null_probability=self._null_probability,
)

def with_property(
self,
*,
nullable: bool | None = None,
primary_key: bool | None = None,
min: dt.date | None = None,
min_exclusive: dt.date | None = None,
max: dt.date | None = None,
max_exclusive: dt.date | None = None,
resolution: str | None = None,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
) -> Self:
result = super().with_property(
nullable=nullable,
primary_key=primary_key,
min=min,
min_exclusive=min_exclusive,
max=max,
max_exclusive=max_exclusive,
check=check,
alias=alias,
metadata=metadata,
)
result.resolution = first_non_null(
resolution, self.resolution, allow_null_response=True
)
return result


@register
class Time(OrdinalMixin[dt.time], Column):
@@ -278,6 +314,36 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
null_probability=self._null_probability,
)

def with_property(
self,
*,
nullable: bool | None = None,
primary_key: bool | None = None,
min: dt.time | None = None,
min_exclusive: dt.time | None = None,
max: dt.time | None = None,
max_exclusive: dt.time | None = None,
resolution: str | None = None,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
) -> Self:
result = super().with_property(
nullable=nullable,
primary_key=primary_key,
min=min,
min_exclusive=min_exclusive,
max=max,
max_exclusive=max_exclusive,
check=check,
alias=alias,
metadata=metadata,
)
result.resolution = first_non_null(
resolution, self.resolution, allow_null_response=True
)
return result


@register
class Datetime(OrdinalMixin[dt.datetime], Column):
@@ -425,6 +491,42 @@ def _attributes_match(
return lhs.utcoffset(now) == rhs.utcoffset(now)
return super()._attributes_match(lhs, rhs, name, column_expr)

def with_property(
self,
*,
nullable: bool | None = None,
primary_key: bool | None = None,
min: dt.datetime | None = None,
min_exclusive: dt.datetime | None = None,
max: dt.datetime | None = None,
max_exclusive: dt.datetime | None = None,
resolution: str | None = None,
time_zone: str | dt.tzinfo | None = None,
time_unit: TimeUnit | None = None,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
) -> Self:
result = super().with_property(
nullable=nullable,
primary_key=primary_key,
min=min,
min_exclusive=min_exclusive,
max=max,
max_exclusive=max_exclusive,
check=check,
alias=alias,
metadata=metadata,
)
result.resolution = first_non_null(
resolution, self.resolution, allow_null_response=True
)
result.time_zone = first_non_null(
time_zone, self.time_zone, allow_null_response=True
)
result.time_unit = first_non_null(time_unit, default=self.time_unit)
return result


@register
class Duration(OrdinalMixin[dt.timedelta], Column):
@@ -546,6 +648,36 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
null_probability=self._null_probability,
)

def with_property(
self,
*,
nullable: bool | None = None,
primary_key: bool | None = None,
min: dt.timedelta | None = None,
min_exclusive: dt.timedelta | None = None,
max: dt.timedelta | None = None,
max_exclusive: dt.timedelta | None = None,
resolution: str | None = None,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
) -> Self:
result = super().with_property(
nullable=nullable,
primary_key=primary_key,
min=min,
min_exclusive=min_exclusive,
max=max,
max_exclusive=max_exclusive,
check=check,
alias=alias,
metadata=metadata,
)
result.resolution = first_non_null(
resolution, self.resolution, allow_null_response=True
)
return result


# --------------------------------------- UTILS -------------------------------------- #

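
A hedged usage sketch for the datetime-family overrides above; the `dy.Datetime` constructor arguments shown here are assumptions and may not match the actual API exactly.

import datetime as dt

import dataframely as dy

created_at = dy.Datetime(nullable=False, resolution="1s")

# Narrow the allowed range and attach a time zone; resolution and
# nullability carry over from the original column.
created_recent = created_at.with_property(
    min=dt.datetime(2024, 1, 1),
    time_zone="UTC",
)
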
dataframely/columns/decimal.py (40 additions, 0 deletions)

@@ -5,10 +5,17 @@

import decimal
import math
import sys
from typing import Any

import polars as pl

if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self


from dataframely._compat import pa, sa, sa_TypeEngine
from dataframely._polars import PolarsDataType
from dataframely.random import Generator
@@ -157,6 +164,39 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
)
return ((samples * 10**self.scale).floor() / 10**self.scale).cast(self.dtype)

def with_property(
self,
*,
precision: int | None = None,
scale: int | None = None,
nullable: bool | None = None,
primary_key: bool | None = None,
min: decimal.Decimal | None = None,
min_exclusive: decimal.Decimal | None = None,
max: decimal.Decimal | None = None,
max_exclusive: decimal.Decimal | None = None,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
) -> Self:
# TODO validate
result = super().with_property(
nullable=nullable,
primary_key=primary_key,
min=min,
min_exclusive=min_exclusive,
max=max,
max_exclusive=max_exclusive,
check=check,
alias=alias,
metadata=metadata,
)
result.precision = first_non_null(
precision, self.precision, allow_null_response=True
)
result.scale = scale if scale is not None else self.scale
return result


# --------------------------------------- UTILS -------------------------------------- #

dataframely/columns/enum.py (26 additions, 0 deletions)

@@ -3,6 +3,7 @@

from __future__ import annotations

import sys
from collections.abc import Sequence
from typing import Any

@@ -15,6 +16,11 @@
from ._base import Check, Column
from ._registry import register

if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self


@register
class Enum(Column):
@@ -88,3 +94,23 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
return generator.sample_choice(
n, choices=self.categories, null_probability=self._null_probability
).cast(self.dtype)

def with_property(
self,
*,
categories: Sequence[str] | None = None,
nullable: bool | None = None,
primary_key: bool | None = None,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
) -> Self:
result = super().with_property(
nullable=nullable,
primary_key=primary_key,
check=check,
alias=alias,
metadata=metadata,
)
result.categories = categories if categories is not None else self.categories
return result
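
A hedged sketch of the `Enum` override in use; the constructor call is an assumption about the existing `Enum` column API.

import dataframely as dy

status = dy.Enum(["active", "inactive"], nullable=False)

# Extend the category set; nullability, checks, alias, and metadata carry over.
status_v2 = status.with_property(categories=["active", "inactive", "archived"])

assert status_v2.nullable is False
assert list(status_v2.categories) == ["active", "inactive", "archived"]
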