diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 758bd73..3e30863 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.3.1
+    rev: v0.3.2
     hooks:
       - id: ruff
         args: [--fix]
diff --git a/ibisml/__init__.py b/ibisml/__init__.py
index edd3497..1161b02 100644
--- a/ibisml/__init__.py
+++ b/ibisml/__init__.py
@@ -1,24 +1,25 @@
-from ._version import __version__
 from ibisml.core import Recipe, Step
 from ibisml.select import (
-    selector,
-    everything,
+    categorical,
     cols,
     contains,
+    date,
     endswith,
-    startswith,
-    matches,
+    everything,
+    floating,
     has_type,
-    numeric,
+    integer,
+    matches,
     nominal,
-    categorical,
+    numeric,
+    selector,
+    startswith,
     string,
-    integer,
-    floating,
     temporal,
-    date,
     time,
     timestamp,
     where,
 )
-from ibisml.steps import *  # noqa: F403
+from ibisml.steps import *
+
+from ._version import __version__
diff --git a/ibisml/core.py b/ibisml/core.py
index c3eff7e..a126524 100644
--- a/ibisml/core.py
+++ b/ibisml/core.py
@@ -1,19 +1,19 @@
 from __future__ import annotations

 import copy
-from collections.abc import Sequence, Iterable
-from typing import Any, Callable, Literal, cast, TYPE_CHECKING
+from collections.abc import Iterable, Sequence
 from functools import cache
+from typing import TYPE_CHECKING, Any, Callable, Literal, cast

-import numpy as np
-import pyarrow as pa
-import pandas as pd
 import ibis
 import ibis.expr.types as ir
+import numpy as np
+import pandas as pd
+import pyarrow as pa

 if TYPE_CHECKING:
-    import polars as pl
     import dask.dataframe as dd
+    import polars as pl
     import xgboost as xgb
@@ -21,7 +21,9 @@ def _as_table(X: Any):
     if isinstance(X, ir.Table):
         return X
     elif isinstance(X, np.ndarray):
-        return ibis.memtable(pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[-1])]))
+        return ibis.memtable(
+            pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[-1])])
+        )
     else:
         return ibis.memtable(X)
@@ -74,10 +76,7 @@ def _get_categorize_chunk() -> Callable[[str, list[str], Any], pd.DataFrame]:
     dask cluster.
     """

-    def categorize(
-        df: pd.DataFrame,
-        categories: dict[str, list[Any]],
-    ) -> pd.DataFrame:
+    def categorize(df: pd.DataFrame, categories: dict[str, list[Any]]) -> pd.DataFrame:
         import pandas as pd

         new = {}
@@ -130,7 +129,9 @@ def set_params(self, **kwargs):
         self.steps = kwargs.get("steps")

     def set_output(
-        self, *, transform: Literal["default", "pandas", "pyarrow", "polars", None] = None
+        self,
+        *,
+        transform: Literal["default", "pandas", "pyarrow", "polars", None] = None,
     ) -> Recipe:
         """Set output type returned by `transform`.
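Reviewer note: the `set_output` reflow above is behavior-preserving. For context, here is how the method is exercised; both calls mirror `tests/test_core.py` later in this same diff, so nothing here is new API:

```python
import ibisml as ml

recipe = ml.Recipe(ml.Cast(ml.everything(), "float64"))
recipe.set_output(transform="pandas")  # transform() will return pandas DataFrames
recipe.set_output(transform=None)      # None -> leave configuration unchanged
```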
@@ -146,6 +147,7 @@ def set_output(
         - `"polars"`: Polars dataframe
         - `"pyarrow"`: Pyarrow table
         - `None`: Transform configuration is unchanged
+
         """
         if transform is None:
             return self
@@ -153,7 +155,9 @@ def set_output(

         formats = ("default", "pandas", "polars", "pyarrow")
         if transform not in formats:
-            raise ValueError(f"`transform` must be one of {formats!r}, got {transform}")
+            raise ValueError(
+                f"`transform` must be one of {formats!r}, got {transform!r}"
+            )
         self._output_format = transform
         return self
@@ -265,7 +269,9 @@ def _categorize_dask_dataframe(self, ddf: dd.DataFrame) -> dd.DataFrame:

         categorize = _get_categorize_chunk()

-        categories = {col: cats.values for col, cats in self.metadata_.categories.items()}
+        categories = {
+            col: cats.values for col, cats in self.metadata_.categories.items()
+        }
         return ddf.map_partitions(categorize, categories)

     def _categorize_pyarrow_batches(
@@ -298,7 +304,6 @@ def to_table(self, X: ir.Table) -> ir.Table:
         X : table-like
             The input data to transform.
         """
-
         table = _as_table(X)
         for step in self.steps:
             table = step.transform_table(table)
@@ -365,7 +370,9 @@ def to_pyarrow(self, X: Any, categories: bool = False) -> pa.Table:
             table = self._categorize_pyarrow(table)
         return table

-    def to_pyarrow_batches(self, X: Any, categories: bool = False) -> pa.RecordBatchReader:
+    def to_pyarrow_batches(
+        self, X: Any, categories: bool = False
+    ) -> pa.RecordBatchReader:
         """Transform X and return a ``pyarrow.RecordBatchReader``.

         Parameters
@@ -407,7 +414,7 @@ def to_dask_dataframe(self, X: Any, categories: bool = False) -> dd.DataFrame:
                 return self._categorize_dask_dataframe(ddf)
             return ddf
         else:
-            # TODO: this is suboptimal, but may not matter. In practice I'd only
+            # TODO(jcrist): this is suboptimal, but may not matter. In practice I'd only
            # expect the dask conversion path to be used for backends where dask
            # integration makes sense.
             df = table.to_pandas()
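Reviewer note: the `{transform}` → `{transform!r}` change above is a small behavior improvement, not just formatting: `!r` quotes the offending value in the error message, which is what the updated test's `match=` pattern asserts. A standalone illustration (the variable is hypothetical):

```python
transform = "unsupported"
print(f"got {transform}")    # -> got unsupported
print(f"got {transform!r}")  # -> got 'unsupported'
```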
@@ -427,7 +434,9 @@ def to_dmatrix(self, X: Any) -> xgb.DMatrix:
         import xgboost as xgb

         df = self.to_pandas(X, categories=True)
-        return xgb.DMatrix(df[self.features], df[self.outcomes], enable_categorical=True)
+        return xgb.DMatrix(
+            df[self.features], df[self.outcomes], enable_categorical=True
+        )

     def to_dask_dmatrix(self, X: Any) -> xgb.dask.DaskDMatrix:
         """Transform X and return a ``xgboost.dask.DMatrix``
diff --git a/ibisml/select.py b/ibisml/select.py
index bc636b6..cebf14a 100644
--- a/ibisml/select.py
+++ b/ibisml/select.py
@@ -2,12 +2,13 @@

 import re
 from collections.abc import Collection
-from typing import Callable, Union, ClassVar
+from typing import TYPE_CHECKING, Callable, ClassVar, Union

-import ibis.expr.types as ir
 import ibis.expr.datatypes as dt
+import ibis.expr.types as ir

-from ibisml.core import Metadata
+if TYPE_CHECKING:
+    from ibisml.core import Metadata


 class Selector:
@@ -285,7 +286,8 @@ class _TypeSelector(Selector):

     def matches(self, col: ir.Column, metadata: Metadata) -> bool:
         return metadata.get_categories(col.get_name()) is None and isinstance(
-            col.type(), self._type
+            col.type(),
+            self._type,
         )
diff --git a/ibisml/steps/__init__.py b/ibisml/steps/__init__.py
index 3e0f573..509f530 100644
--- a/ibisml/steps/__init__.py
+++ b/ibisml/steps/__init__.py
@@ -1,9 +1,8 @@
-from ibisml.steps.common import Cast, Drop, MutateAt, Mutate
+from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
+from ibisml.steps.encode import CategoricalEncode, OneHotEncode
 from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
 from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
-from ibisml.steps.encode import OneHotEncode, CategoricalEncode
-from ibisml.steps.temporal import ExpandDateTime, ExpandDate, ExpandTime
-
+from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime

 __all__ = (
     "Cast",
diff --git a/ibisml/steps/common.py b/ibisml/steps/common.py
index cfe98e3..a6f7081 100644
--- a/ibisml/steps/common.py
+++ b/ibisml/steps/common.py
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import Callable, Iterable, Any
+from typing import Any, Callable, Iterable

 import ibis.expr.datatypes as dt
 import ibis.expr.types as ir
@@ -134,8 +134,7 @@ def _repr(self) -> Iterable[tuple[str, Any]]:
             yield ("", self.inputs)
         if self.expr is not None:
             yield ("", self.expr)
-        for name, expr in self.named_exprs.items():
-            yield name, expr
+        yield from self.named_exprs.items()

     def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         self.columns_ = self.inputs.select_columns(table, metadata)
diff --git a/ibisml/steps/encode.py b/ibisml/steps/encode.py
index 02ed127..58240f9 100644
--- a/ibisml/steps/encode.py
+++ b/ibisml/steps/encode.py
@@ -2,7 +2,7 @@

 import uuid
 from collections import defaultdict
-from typing import Any, Iterable
+from typing import TYPE_CHECKING, Any, Iterable

 import ibis
 import ibis.expr.types as ir
@@ -17,7 +17,8 @@ def _compute_categories(
     min_frequency: int | float | None = None,
     max_categories: int | None = None,
 ) -> dict[str, list[Any]]:
-    import pandas as pd
+    if TYPE_CHECKING:
+        import pandas as pd

     # We execute once for each type kind in the inputs. In the common case
     # (only string inputs) this means a single execution even for multiple
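Reviewer note on the hunk above: guarding `import pandas as pd` with `if TYPE_CHECKING:` means pandas is never imported at runtime in this function. That is safe here only because `pd` appears solely in annotations, which stay unevaluated strings under `from __future__ import annotations` (already at the top of this module). A minimal sketch of the pattern, with illustrative names:

```python
from __future__ import annotations  # annotations stay strings, never evaluated

from typing import TYPE_CHECKING

if TYPE_CHECKING:  # False at runtime, so pandas is never actually imported here
    import pandas as pd


def process(df: pd.DataFrame) -> list:  # OK: the annotation is never evaluated
    return df["value"].to_list()
```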
@@ -56,7 +57,7 @@ def collect(col: str) -> ir.Table:
     def process(df: pd.DataFrame) -> list[Any]:
         return df["value"].sort_values().to_list()

-    for group_type, group_cols in groups.items():
+    for group_cols in groups.values():
         query = ibis.union(*(collect(col) for col in group_cols))

         result_groups = query.execute().groupby("column")
@@ -137,7 +138,12 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
                 to_compute.append(column)

         categories.update(
-            _compute_categories(table, to_compute, self.min_frequency, self.max_categories)
+            _compute_categories(
+                table,
+                to_compute,
+                self.min_frequency,
+                self.max_categories,
+            ),
         )

         self.categories_ = categories
@@ -150,7 +156,7 @@ def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
             [
                 (table[col] == cat).cast("int8").name(f"{col}_{cat}")
                 for col, cats in self.categories_.items()
                 for cat in cats
-            ]
+            ],
         ).drop(*self.categories_)
@@ -214,9 +220,14 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         columns = self.inputs.select_columns(table, metadata)
         # Filter out already categorized columns
-        columns = [column for column in columns if metadata.get_categories(column) is None]
+        columns = [
+            column for column in columns if metadata.get_categories(column) is None
+        ]
         categories = _compute_categories(
-            table, columns, self.min_frequency, self.max_categories
+            table,
+            columns,
+            self.min_frequency,
+            self.max_categories,
         )
         for col, cats in categories.items():
             metadata.set_categories(col, cats)
@@ -225,7 +236,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         suffix = uuid.uuid4().hex[:6]
         for col, cats in categories.items():
             table = pa.Table.from_pydict(
-                {f"key_{suffix}": cats, col: list(range(len(cats)))}
+                {f"key_{suffix}": cats, col: list(range(len(cats)))},
             )
             tables[col] = ibis.memtable(table, name=f"{col}_cats_{suffix}")
diff --git a/ibisml/steps/impute.py b/ibisml/steps/impute.py
index df25fa0..1d5e41b 100644
--- a/ibisml/steps/impute.py
+++ b/ibisml/steps/impute.py
@@ -2,11 +2,11 @@

 from typing import Any, Iterable

+import ibis.expr.types as ir
+
 from ibisml.core import Metadata, Step
 from ibisml.select import SelectionType, selector

-import ibis.expr.types as ir
-

 def _fillna(col, val):
     if col.type().is_floating():
@@ -52,7 +52,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:

     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
-            [_fillna(table[c], self.fill_value).name(c) for c in self.columns_]
+            [_fillna(table[c], self.fill_value).name(c) for c in self.columns_],
         )
@@ -76,7 +76,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:

     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
-            [_fillna(table[c], v).name(c) for c, v in self.fill_values_.items()]
+            [_fillna(table[c], v).name(c) for c, v in self.fill_values_.items()],
         )
@@ -102,7 +102,8 @@ class ImputeMean(_BaseImpute):
     def _stat(self, col: ir.Column) -> ir.Scalar:
         if not isinstance(col, ir.NumericColumn):
             raise ValueError(
-                f"Cannot compute mean of {col.get_name()} - " "this column is not numeric"
+                f"Cannot compute mean of {col.get_name()} - "
+                "this column is not numeric",
             )
         return col.mean()
@@ -129,7 +130,8 @@ class ImputeMedian(_BaseImpute):
     def _stat(self, col: ir.Column) -> ir.Scalar:
         if not isinstance(col, ir.NumericColumn):
             raise ValueError(
-                f"Cannot compute median of {col.get_name()} - " "this column is not numeric"
+                f"Cannot compute median of {col.get_name()} - "
+                "this column is not numeric",
             )
         return col.median()
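For context on the impute hunks just above, a usage sketch: the `fit`/`to_pandas` flow is assumed from the `Recipe` API elsewhere in this diff, and the data is made up:

```python
import ibis
import ibisml as ml

t = ibis.memtable({"x": [1.0, None, 3.0], "y": ["a", "b", "c"]})
r = ml.Recipe(ml.ImputeMedian(ml.numeric()))  # selecting "y" would raise the ValueError above
r.fit(t)
print(r.to_pandas(t))  # null in "x" filled with the fitted median (2.0)
```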
diff --git a/ibisml/steps/standardize.py b/ibisml/steps/standardize.py
index 7d45d0b..47f6250 100644
--- a/ibisml/steps/standardize.py
+++ b/ibisml/steps/standardize.py
@@ -2,11 +2,11 @@

 from typing import Any, Iterable

+import ibis.expr.types as ir
+
 from ibisml.core import Metadata, Step
 from ibisml.select import SelectionType, selector

-import ibis.expr.types as ir
-

 class ScaleMinMax(Step):
     """A step for normalizing selected numeric columns to have a maximum value of 1
@@ -46,7 +46,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             c = table[name]
             if not isinstance(c, ir.NumericColumn):
                 raise ValueError(
-                    f"Cannot be normalized {name!r} - this column is not numeric"
+                    f"Cannot be normalized {name!r} - this column is not numeric",
                 )

             aggs.append(c.max().name(f"{name}_max"))
@@ -63,7 +63,7 @@ def transform_table(self, table: ir.Table) -> ir.Table:
             [
                 ((table[c] - min) / (max - min)).name(c)  # type: ignore
                 for c, (max, min) in self.stats_.items()
-            ]
+            ],
         )
@@ -105,7 +105,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             c = table[name]
             if not isinstance(c, ir.NumericColumn):
                 raise ValueError(
-                    f"Cannot standardize {name!r} - this column is not numeric"
+                    f"Cannot standardize {name!r} - this column is not numeric",
                 )

             aggs.append(c.mean().name(f"{name}_mean"))
@@ -122,5 +122,5 @@ def transform_table(self, table: ir.Table) -> ir.Table:
             [
                 ((table[c] - center) / scale).name(c)  # type: ignore
                 for c, (center, scale) in self.stats_.items()
-            ]
+            ],
         )
diff --git a/ibisml/steps/temporal.py b/ibisml/steps/temporal.py
index 290f291..a5b52c8 100644
--- a/ibisml/steps/temporal.py
+++ b/ibisml/steps/temporal.py
@@ -1,12 +1,13 @@
 from __future__ import annotations

-from typing import Any, Iterable, Sequence, Literal
-
-import ibis.expr.types as ir
+from typing import TYPE_CHECKING, Any, Iterable, Literal, Sequence

 from ibisml.core import Metadata, Step
 from ibisml.select import SelectionType, selector

+if TYPE_CHECKING:
+    import ibis.expr.types as ir
+

 class ExpandDateTime(Step):
     """A step for expanding date and time columns into one or more features.
diff --git a/pyproject.toml b/pyproject.toml
index 3f3333d..5d46f1c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,11 +7,9 @@ name = "ibisml"
 description = "Tools for developing ML pipelines using Ibis"
 readme = "README.md"
 requires-python = ">= 3.8"
-license = {text = "Apache 2.0"}
+license = { text = "Apache 2.0" }
 dynamic = ["version"]
-dependencies = [
-    "ibis-framework"
-]
+dependencies = ["ibis-framework"]

 [project.optional-dependencies]
 dask = ["dask[dataframe]"]
@@ -23,8 +21,52 @@ include-package-data = false
 [tool.setuptools.packages.find]
 include = ["ibisml*"]

-[tool.ruff]
-line-length = 92
+[tool.ruff.lint]
+select = [
+    "F",     # Pyflakes
+    "E",     # pycodestyle: Error
+    "W",     # pycodestyle: Warning
+    "I",     # isort
+    # "D",   # pydocstyle  # TODO(deepyaman): Add missing docstrings.
+    "UP",    # pyupgrade
+    # "S",   # flake8-bandit  # TODO(deepyaman): Remove assert and ignore tests.
+    "BLE",   # flake8-blind-except
+    "B",     # flake8-bugbear
+    # "A",   # flake8-builtins  # TODO(deepyaman): Rename variables.
+    "COM",   # flake8-commas
+    "C4",    # flake8-comprehensions
+    "T10",   # flake8-debugger
+    "FA",    # flake8-future-annotations
+    "ISC",   # flake8-implicit-str-concat
+    "ICN",   # flake8-import-conventions
+    "G",     # flake8-logging-format
+    "INP",   # flake8-no-pep420
+    "PIE",   # flake8-pie
+    "T20",   # flake8-print
+    "PT",    # flake8-pytest-style
+    "Q",     # flake8-quotes
+    "RET",   # flake8-return
+    "SLF",   # flake8-self
+    "SIM",   # flake8-simplify
+    "TID",   # flake8-tidy-imports
+    "TCH",   # flake8-type-checking
+    "PTH",   # flake8-use-pathlib
+    "TD",    # flake8-todos
+    # "ERA", # eradicate  # TODO(deepyaman): Fix commented-out code.
+    # "PD",  # pandas-vet  # TODO(deepyaman): Use `.to_numpy()` instead of `.values`.
+    # "PGH", # pygrep-hooks  # TODO(deepyaman): Use specific rule codes when ignoring type issues.
+    "FLY",   # flynt
+    "NPY",   # NumPy-specific rules
+    "PERF",  # Perflint
+    "LOG",   # flake8-logging
+    "RUF",   # Ruff-specific rules
+]
+ignore = [
+    "COM812",  # missing-trailing-comma
+    "ISC001",  # single-line-implicit-string-concatenation
+    "RET505",  # superfluous-else-return, stylistic choice
+    "TD003",   # missing-todo-link
+]

 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["F401", "F403"]
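Two of the new `ignore` entries deserve a note: COM812 and ISC001 are the rules Ruff itself warns may conflict with its formatter, since their autofixes can fight formatting. The COM rules are also what drive the many trailing commas added throughout this diff; the style they enforce, in plain runnable Python:

```python
# With the magic trailing comma, formatters keep one element per line and
# future additions show up as one-line diffs.
formats = (
    "default",
    "pandas",
    "polars",
    "pyarrow",
)
print(formats)
```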
+ "COM", # flake8-commas + "C4", # flake8-comprehensions + "T10", # flake8-debugger + "FA", # flake8-future-annotations + "ISC", # flake8-implicit-str-concat + "ICN", # flake8-import-conventions + "G", # flake8-logging-format + "INP", # flake8-no-pep420 + "PIE", # flake8-pie + "T20", # flake8-print + "PT", # flake8-pytest-style + "Q", # flake8-quotes + "RET", # flake8-return + "SLF", # flake8-self + "SIM", # flake8-simplify + "TID", # flake8-tidy-imports + "TCH", # flake8-type-checking + "PTH", # flake8-use-pathlib + "TD", # flake8-todos + # "ERA",# eradicate # TODO(deepyaman): Fix commented-out code. + # "PD", # pandas-vet # TODO(deepyaman): Use `.to_numpy()` instead of `.values`. + # "PGH",# pygrep-hooks # TODO(deepyaman): Use specific rule codes when ignoring type issues. + "FLY", # flynt + "NPY", # NumPy-specific rules + "PERF", # Perflint + "LOG", # flake8-logging + "RUF", # Ruff-specific rules +] +ignore = [ + "COM812", # missing-trailing-comma + "ISC001", # single-line-implicit-string-concatenation + "RET505", # superfluous-else-return, stylistic choice + "TD003", # missing-todo-link +] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401", "F403"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_core.py b/tests/test_core.py index 25fcbaf..830f0d8 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,25 +1,25 @@ -import ibisml as ml - +import ibis import numpy as np import pandas as pd import pyarrow as pa -import ibis -from ibis import _ import pytest +from ibis import _ + +import ibisml as ml -@pytest.fixture +@pytest.fixture() def df(): return pd.DataFrame( { "a": [1, 2, 3, 4, 5], "b": [1, 0, 1, 0, 1], "c": ["x", "x", "y", "x", "y"], - } + }, ) -@pytest.fixture +@pytest.fixture() def table(df): return ibis.memtable(df) @@ -75,7 +75,9 @@ def test_set_output(): recipe.set_output(transform=None) # None -> leave unchanged assert recipe.output_format == "polars" - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match=r"`transform` must be one of \(.*\), got 'unsupported'" + ): recipe.set_output(transform="unsupported") @@ -107,7 +109,10 @@ def test_to_numpy_errors_non_numeric(table): r.to_numpy(table) -@pytest.mark.parametrize("format", ["numpy", "pandas", "pyarrow", "polars", "ibis-table"]) +@pytest.mark.parametrize( + "format", + ["numpy", "pandas", "pyarrow", "polars", "ibis-table"], +) def test_input_formats(format): r = ml.Recipe(ml.Cast(ml.everything(), "float64")) X = np.eye(3, dtype="i8") @@ -128,8 +133,8 @@ def test_input_formats(format): def test_can_use_in_sklearn_pipeline(): sklearn = pytest.importorskip("sklearn") - from sklearn.pipeline import Pipeline from sklearn.linear_model import LinearRegression + from sklearn.pipeline import Pipeline X = np.array([[1, 3], [2, 4], [3, 5]]) y = np.array([10, 11, 12]) diff --git a/tests/test_select.py b/tests/test_select.py index 0ccbac0..e11e2b4 100644 --- a/tests/test_select.py +++ b/tests/test_select.py @@ -1,7 +1,6 @@ -import pytest - import ibis import ibis.expr.datatypes as dt +import pytest import ibisml as ml @@ -21,7 +20,7 @@ def eval_select(selector): "b_time": "time", "b_date": "date", "b_timestamp": "timestamp", - } + }, ) return selector.select_columns(t, metadata) @@ -95,7 +94,7 @@ def test_matches(): @pytest.mark.parametrize( - "selector, cols", + ("selector", "cols"), [ (ml.integer(), ["a_int"]), (ml.floating(), ["a_float"]),