style(ruff): select additional rules such as isort
deepyaman committed Mar 12, 2024
1 parent d726f70 commit b4129a6
Showing 14 changed files with 154 additions and 85 deletions.
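
The rule selection named in the commit title lives in the project's ruff configuration, which is among the changed files not shown below. As a rough sketch only, enabling isort and related rule groups in pyproject.toml might look like the following; the specific groups and the F403 per-file ignore are assumptions inferred from the fixes visible in this commit, not the actual hunk:

# Hypothetical pyproject.toml fragment, not the actual diff from this commit.
[tool.ruff.lint]
extend-select = [
    "I",   # isort: sort and section imports
    "B",   # flake8-bugbear: e.g. unused loop variables
    "TCH", # flake8-type-checking: typing-only imports under TYPE_CHECKING
    "COM", # flake8-commas: trailing commas
]

[tool.ruff.lint.per-file-ignores]
# Would explain why the inline `# noqa: F403` is dropped in ibisml/__init__.py.
"ibisml/__init__.py" = ["F403"]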
.pre-commit-config.yaml (3 changes: 1 addition & 2 deletions)

@@ -1,7 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.3.1
+    rev: v0.3.2
     hooks:
       - id: ruff
         args: [--fix]
ibisml/__init__.py (23 changes: 12 additions & 11 deletions)

@@ -1,24 +1,25 @@
+from ._version import __version__
 from ibisml.core import Recipe, Step
 from ibisml.select import (
-    selector,
-    everything,
+    categorical,
     cols,
     contains,
+    date,
     endswith,
-    startswith,
-    matches,
+    everything,
+    floating,
     has_type,
-    numeric,
+    integer,
+    matches,
     nominal,
-    categorical,
+    numeric,
+    selector,
+    startswith,
     string,
-    integer,
-    floating,
     temporal,
-    date,
     time,
     timestamp,
     where,
 )
-from ibisml.steps import *  # noqa: F403
+from ibisml.steps import *

-from ._version import __version__
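
The reordering above is mechanical: ruff's isort rules sort the names inside a parenthesized `from ... import (...)` block alphabetically. A minimal self-check of that, using the first few names from the block:

# The first names from the ibisml.select import block, in their old order;
# plain sorting reproduces the new order shown in the diff.
names = ["selector", "everything", "cols", "contains", "endswith"]
assert sorted(names) == ["cols", "contains", "endswith", "everything", "selector"]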
ibisml/core.py (45 changes: 27 additions & 18 deletions)

@@ -1,27 +1,29 @@
 from __future__ import annotations

 import copy
-from collections.abc import Sequence, Iterable
-from typing import Any, Callable, Literal, cast, TYPE_CHECKING
+from collections.abc import Iterable, Sequence
+from functools import cache
+from typing import TYPE_CHECKING, Any, Callable, Literal, cast

-import numpy as np
-import pyarrow as pa
-import pandas as pd
 import ibis
 import ibis.expr.types as ir
+import numpy as np
+import pandas as pd
+import pyarrow as pa

 if TYPE_CHECKING:
-    import polars as pl
     import dask.dataframe as dd
+    import polars as pl
     import xgboost as xgb


 def _as_table(X: Any):
     if isinstance(X, ir.Table):
         return X
     elif isinstance(X, np.ndarray):
-        return ibis.memtable(pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[-1])]))
+        return ibis.memtable(
+            pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[-1])])
+        )
     else:
         return ibis.memtable(X)

@@ -74,10 +76,7 @@ def _get_categorize_chunk() -> Callable[[str, list[str], Any], pd.DataFrame]:
     dask cluster.
     """

-    def categorize(
-        df: pd.DataFrame,
-        categories: dict[str, list[Any]],
-    ) -> pd.DataFrame:
+    def categorize(df: pd.DataFrame, categories: dict[str, list[Any]]) -> pd.DataFrame:
         import pandas as pd

         new = {}

@@ -130,7 +129,9 @@ def set_params(self, **kwargs):
         self.steps = kwargs.get("steps")

     def set_output(
-        self, *, transform: Literal["default", "pandas", "pyarrow", "polars", None] = None
+        self,
+        *,
+        transform: Literal["default", "pandas", "pyarrow", "polars", None] = None,
     ) -> Recipe:
         """Set output type returned by `transform`.

@@ -146,14 +147,17 @@
         - `"polars"`: Polars dataframe
         - `"pyarrow"`: Pyarrow table
         - `None`: Transform configuration is unchanged
         """
         if transform is None:
             return self

         formats = ("default", "pandas", "polars", "pyarrow")

         if transform not in formats:
-            raise ValueError(f"`transform` must be one of {formats!r}, got {transform}")
+            raise ValueError(
+                f"`transform` must be one of {formats!r}, got {transform!r}"
+            )

         self._output_format = transform
         return self

@@ -265,7 +269,9 @@ def _categorize_dask_dataframe(self, ddf: dd.DataFrame) -> dd.DataFrame:

         categorize = _get_categorize_chunk()

-        categories = {col: cats.values for col, cats in self.metadata_.categories.items()}
+        categories = {
+            col: cats.values for col, cats in self.metadata_.categories.items()
+        }
         return ddf.map_partitions(categorize, categories)

     def _categorize_pyarrow_batches(

@@ -298,7 +304,6 @@ def to_table(self, X: ir.Table) -> ir.Table:
         X : table-like
             The input data to transform.
         """
-
         table = _as_table(X)
         for step in self.steps:
             table = step.transform_table(table)

@@ -365,7 +370,9 @@ def to_pyarrow(self, X: Any, categories: bool = False) -> pa.Table:
             table = self._categorize_pyarrow(table)
         return table

-    def to_pyarrow_batches(self, X: Any, categories: bool = False) -> pa.RecordBatchReader:
+    def to_pyarrow_batches(
+        self, X: Any, categories: bool = False
+    ) -> pa.RecordBatchReader:
         """Transform X and return a ``pyarrow.RecordBatchReader``.

         Parameters

@@ -407,7 +414,7 @@ def to_dask_dataframe(self, X: Any, categories: bool = False) -> dd.DataFrame:
                 return self._categorize_dask_dataframe(ddf)
             return ddf
         else:
-            # TODO: this is suboptimal, but may not matter. In practice I'd only
+            # TODO(jcrist): this is suboptimal, but may not matter. In practice I'd only
             # expect the dask conversion path to be used for backends where dask
             # integration makes sense.
             df = table.to_pandas()

@@ -427,7 +434,9 @@ def to_dmatrix(self, X: Any) -> xgb.DMatrix:
         import xgboost as xgb

         df = self.to_pandas(X, categories=True)
-        return xgb.DMatrix(df[self.features], df[self.outcomes], enable_categorical=True)
+        return xgb.DMatrix(
+            df[self.features], df[self.outcomes], enable_categorical=True
+        )

     def to_dask_dmatrix(self, X: Any) -> xgb.dask.DaskDMatrix:
         """Transform X and return a ``xgboost.dask.DMatrix``
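
One behavioral nicety in the `set_output` hunk deserves a note: formatting the bad value with `!r` quotes it, so a typo'd string is easy to tell apart from other bad inputs. A standalone illustration, not from the codebase:

formats = ("default", "pandas", "polars", "pyarrow")
transform = "Pandas"  # wrong case on purpose

# Without !r the value renders bare; with !r it is quoted like source code.
print(f"`transform` must be one of {formats!r}, got {transform}")
# `transform` must be one of ('default', 'pandas', 'polars', 'pyarrow'), got Pandas
print(f"`transform` must be one of {formats!r}, got {transform!r}")
# `transform` must be one of ('default', 'pandas', 'polars', 'pyarrow'), got 'Pandas'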
ibisml/select.py (10 changes: 6 additions & 4 deletions)

@@ -2,12 +2,13 @@

 import re
 from collections.abc import Collection
-from typing import Callable, Union, ClassVar
+from typing import TYPE_CHECKING, Callable, ClassVar, Union

-import ibis.expr.types as ir
 import ibis.expr.datatypes as dt
+import ibis.expr.types as ir

-from ibisml.core import Metadata
+if TYPE_CHECKING:
+    from ibisml.core import Metadata


 class Selector:

@@ -285,7 +286,8 @@ class _TypeSelector(Selector):

     def matches(self, col: ir.Column, metadata: Metadata) -> bool:
         return metadata.get_categories(col.get_name()) is None and isinstance(
-            col.type(), self._type
+            col.type(),
+            self._type,
         )
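
The select.py hunk moves the `Metadata` import under `TYPE_CHECKING`, the usual fix for an import needed only by annotations (likely ruff's flake8-type-checking rules, though the exact rule code is an inference). The pattern in isolation, as a minimal sketch:

from __future__ import annotations  # annotations stay unevaluated at runtime

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only while type checking; no runtime import cost or cycles.
    from ibisml.core import Metadata


def describe(metadata: Metadata) -> str:  # hypothetical function, for illustration
    return repr(metadata)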
ibisml/steps/__init__.py (7 changes: 3 additions & 4 deletions)

@@ -1,9 +1,8 @@
-from ibisml.steps.common import Cast, Drop, MutateAt, Mutate
+from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
+from ibisml.steps.encode import CategoricalEncode, OneHotEncode
 from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
 from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
-from ibisml.steps.encode import OneHotEncode, CategoricalEncode
-from ibisml.steps.temporal import ExpandDateTime, ExpandDate, ExpandTime
-
+from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime

 __all__ = (
     "Cast",
ibisml/steps/common.py (5 changes: 2 additions & 3 deletions)

@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import Callable, Iterable, Any
+from typing import Any, Callable, Iterable

 import ibis.expr.datatypes as dt
 import ibis.expr.types as ir

@@ -134,8 +134,7 @@ def _repr(self) -> Iterable[tuple[str, Any]]:
             yield ("", self.inputs)
         if self.expr is not None:
             yield ("", self.expr)
-        for name, expr in self.named_exprs.items():
-            yield name, expr
+        yield from self.named_exprs.items()

     def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         self.columns_ = self.inputs.select_columns(table, metadata)
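
The `_repr` rewrite above is behavior-preserving: delegating with `yield from` over `.items()` yields exactly the `(name, expr)` tuples the explicit loop did. A quick equivalence check under that assumption:

named_exprs = {"total": "a + b", "ratio": "a / b"}  # stand-in expressions


def pairs_loop():
    for name, expr in named_exprs.items():
        yield name, expr


def pairs_delegate():
    yield from named_exprs.items()


assert list(pairs_loop()) == list(pairs_delegate())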
ibisml/steps/encode.py (27 changes: 19 additions & 8 deletions)

@@ -2,7 +2,7 @@

 import uuid
 from collections import defaultdict
-from typing import Any, Iterable
+from typing import TYPE_CHECKING, Any, Iterable

 import ibis
 import ibis.expr.types as ir

@@ -17,7 +17,8 @@ def _compute_categories(
     min_frequency: int | float | None = None,
     max_categories: int | None = None,
 ) -> dict[str, list[Any]]:
-    import pandas as pd
+    if TYPE_CHECKING:
+        import pandas as pd

     # We execute once for each type kind in the inputs. In the common case
     # (only string inputs) this means a single execution even for multiple

@@ -56,7 +57,7 @@ def collect(col: str) -> ir.Table:
     def process(df: pd.DataFrame) -> list[Any]:
         return df["value"].sort_values().to_list()

-    for group_type, group_cols in groups.items():
+    for group_cols in groups.values():
         query = ibis.union(*(collect(col) for col in group_cols))
         result_groups = query.execute().groupby("column")

@@ -137,7 +138,12 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
                 to_compute.append(column)

         categories.update(
-            _compute_categories(table, to_compute, self.min_frequency, self.max_categories)
+            _compute_categories(
+                table,
+                to_compute,
+                self.min_frequency,
+                self.max_categories,
+            ),
         )

         self.categories_ = categories

@@ -150,7 +156,7 @@ def transform_table(self, table: ir.Table) -> ir.Table:
                 (table[col] == cat).cast("int8").name(f"{col}_{cat}")
                 for col, cats in self.categories_.items()
                 for cat in cats
-            ]
+            ],
         ).drop(*self.categories_)

@@ -214,9 +220,14 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:

         columns = self.inputs.select_columns(table, metadata)
         # Filter out already categorized columns
-        columns = [column for column in columns if metadata.get_categories(column) is None]
+        columns = [
+            column for column in columns if metadata.get_categories(column) is None
+        ]
         categories = _compute_categories(
-            table, columns, self.min_frequency, self.max_categories
+            table,
+            columns,
+            self.min_frequency,
+            self.max_categories,
         )
         for col, cats in categories.items():
             metadata.set_categories(col, cats)

@@ -225,7 +236,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         suffix = uuid.uuid4().hex[:6]
         for col, cats in categories.items():
             table = pa.Table.from_pydict(
-                {f"key_{suffix}": cats, col: list(range(len(cats)))}
+                {f"key_{suffix}": cats, col: list(range(len(cats)))},
             )
             tables[col] = ibis.memtable(table, name=f"{col}_cats_{suffix}")
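
The loop change in `_compute_categories` drops a binding the body never used; iterating `.values()` says so directly. This looks like flake8-bugbear's B007 fix, though that attribution is an inference. In miniature:

groups = {"string": ["name", "city"], "int64": ["age"]}  # hypothetical grouping

# Before: group_type is bound but never read inside the body.
for group_type, group_cols in groups.items():
    pass

# After: the intent ("only the values matter") is explicit.
for group_cols in groups.values():
    print(group_cols)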
ibisml/steps/impute.py (14 changes: 8 additions & 6 deletions)

@@ -2,11 +2,11 @@

 from typing import Any, Iterable

+import ibis.expr.types as ir
+
 from ibisml.core import Metadata, Step
 from ibisml.select import SelectionType, selector

-import ibis.expr.types as ir
-

 def _fillna(col, val):
     if col.type().is_floating():

@@ -52,7 +52,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:

     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
-            [_fillna(table[c], self.fill_value).name(c) for c in self.columns_]
+            [_fillna(table[c], self.fill_value).name(c) for c in self.columns_],
         )

@@ -76,7 +76,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:

     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
-            [_fillna(table[c], v).name(c) for c, v in self.fill_values_.items()]
+            [_fillna(table[c], v).name(c) for c, v in self.fill_values_.items()],
        )

@@ -102,7 +102,8 @@ class ImputeMean(_BaseImpute):
     def _stat(self, col: ir.Column) -> ir.Scalar:
         if not isinstance(col, ir.NumericColumn):
             raise ValueError(
-                f"Cannot compute mean of {col.get_name()} - " "this column is not numeric"
+                f"Cannot compute mean of {col.get_name()} - "
+                "this column is not numeric",
             )
         return col.mean()

@@ -129,7 +130,8 @@ class ImputeMedian(_BaseImpute):
     def _stat(self, col: ir.Column) -> ir.Scalar:
         if not isinstance(col, ir.NumericColumn):
             raise ValueError(
-                f"Cannot compute median of {col.get_name()} - " "this column is not numeric"
+                f"Cannot compute median of {col.get_name()} - "
+                "this column is not numeric",
             )
         return col.median()
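
The ImputeMean/ImputeMedian hunks re-wrap an implicitly concatenated message. Adjacent string literals (f-strings included) fuse at compile time, so splitting them across lines changes nothing at runtime; a standalone check:

name = "price"
msg = (
    f"Cannot compute mean of {name} - "
    "this column is not numeric"
)
assert msg == "Cannot compute mean of price - this column is not numeric"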
(The remaining 6 of the 14 changed files are not shown here.)
