style(ruff): select additional rules such as isort
deepyaman committed Mar 12, 2024
1 parent d726f70 commit b4129a6
Showing 14 changed files with 154 additions and 85 deletions.
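
The rule selection named in the commit title lives in the project's ruff configuration, which is among the changed files not shown below. As a rough sketch only, enabling isort and related rule groups in pyproject.toml might look like the following; the specific groups and the F403 per-file ignore are assumptions inferred from the fixes visible in this commit, not the actual hunk:

# Hypothetical pyproject.toml fragment, not the actual diff from this commit.
[tool.ruff.lint]
extend-select = [
    "I",   # isort: sort and section imports
    "B",   # flake8-bugbear: e.g. unused loop variables
    "TCH", # flake8-type-checking: typing-only imports under TYPE_CHECKING
    "COM", # flake8-commas: trailing commas
]

[tool.ruff.lint.per-file-ignores]
# Would explain why the inline `# noqa: F403` is dropped in ibisml/__init__.py.
"ibisml/__init__.py" = ["F403"]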
.pre-commit-config.yaml (3 changes: 1 addition & 2 deletions)

@@ -1,7 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.3.1
+    rev: v0.3.2
     hooks:
       - id: ruff
         args: [--fix]
ibisml/__init__.py (23 changes: 12 additions & 11 deletions)

@@ -1,24 +1,25 @@
+from ._version import __version__
 from ibisml.core import Recipe, Step
 from ibisml.select import (
-    selector,
-    everything,
+    categorical,
     cols,
     contains,
+    date,
     endswith,
-    startswith,
-    matches,
+    everything,
+    floating,
     has_type,
-    numeric,
+    integer,
+    matches,
     nominal,
-    categorical,
+    numeric,
+    selector,
+    startswith,
     string,
-    integer,
-    floating,
     temporal,
-    date,
     time,
     timestamp,
     where,
 )
-from ibisml.steps import *  # noqa: F403
+from ibisml.steps import *

-from ._version import __version__
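
The reordering above is mechanical: ruff's isort rules sort the names inside a parenthesized `from ... import (...)` block alphabetically. A minimal self-check of that, using the first few names from the block:

# The first names from the ibisml.select import block, in their old order;
# plain sorting reproduces the new order shown in the diff.
names = ["selector", "everything", "cols", "contains", "endswith"]
assert sorted(names) == ["cols", "contains", "endswith", "everything", "selector"]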
ibisml/core.py (45 changes: 27 additions & 18 deletions)

@@ -1,27 +1,29 @@
 from __future__ import annotations

 import copy
-from collections.abc import Sequence, Iterable
-from typing import Any, Callable, Literal, cast, TYPE_CHECKING
+from collections.abc import Iterable, Sequence
+from functools import cache
+from typing import TYPE_CHECKING, Any, Callable, Literal, cast

-import numpy as np
-import pyarrow as pa
-import pandas as pd
 import ibis
 import ibis.expr.types as ir
+import numpy as np
+import pandas as pd
+import pyarrow as pa

 if TYPE_CHECKING:
-    import polars as pl
     import dask.dataframe as dd
+    import polars as pl
     import xgboost as xgb


 def _as_table(X: Any):
     if isinstance(X, ir.Table):
         return X
     elif isinstance(X, np.ndarray):
-        return ibis.memtable(pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[-1])]))
+        return ibis.memtable(
+            pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[-1])])
+        )
     else:
         return ibis.memtable(X)

@@ -74,10 +76,7 @@ def _get_categorize_chunk() -> Callable[[str, list[str], Any], pd.DataFrame]:
     dask cluster.
     """

-    def categorize(
-        df: pd.DataFrame,
-        categories: dict[str, list[Any]],
-    ) -> pd.DataFrame:
+    def categorize(df: pd.DataFrame, categories: dict[str, list[Any]]) -> pd.DataFrame:
         import pandas as pd

         new = {}

@@ -130,7 +129,9 @@ def set_params(self, **kwargs):
         self.steps = kwargs.get("steps")

     def set_output(
-        self, *, transform: Literal["default", "pandas", "pyarrow", "polars", None] = None
+        self,
+        *,
+        transform: Literal["default", "pandas", "pyarrow", "polars", None] = None,
     ) -> Recipe:
         """Set output type returned by `transform`.

@@ -146,14 +147,17 @@
         - `"polars"`: Polars dataframe
         - `"pyarrow"`: Pyarrow table
         - `None`: Transform configuration is unchanged
         """
         if transform is None:
             return self

         formats = ("default", "pandas", "polars", "pyarrow")

         if transform not in formats:
-            raise ValueError(f"`transform` must be one of {formats!r}, got {transform}")
+            raise ValueError(
+                f"`transform` must be one of {formats!r}, got {transform!r}"
+            )

         self._output_format = transform
         return self

@@ -265,7 +269,9 @@ def _categorize_dask_dataframe(self, ddf: dd.DataFrame) -> dd.DataFrame:

         categorize = _get_categorize_chunk()

-        categories = {col: cats.values for col, cats in self.metadata_.categories.items()}
+        categories = {
+            col: cats.values for col, cats in self.metadata_.categories.items()
+        }
         return ddf.map_partitions(categorize, categories)

     def _categorize_pyarrow_batches(

@@ -298,7 +304,6 @@ def to_table(self, X: ir.Table) -> ir.Table:
         X : table-like
             The input data to transform.
         """
-
         table = _as_table(X)
         for step in self.steps:
             table = step.transform_table(table)

@@ -365,7 +370,9 @@ def to_pyarrow(self, X: Any, categories: bool = False) -> pa.Table:
             table = self._categorize_pyarrow(table)
         return table

-    def to_pyarrow_batches(self, X: Any, categories: bool = False) -> pa.RecordBatchReader:
+    def to_pyarrow_batches(
+        self, X: Any, categories: bool = False
+    ) -> pa.RecordBatchReader:
         """Transform X and return a ``pyarrow.RecordBatchReader``.

         Parameters

@@ -407,7 +414,7 @@ def to_dask_dataframe(self, X: Any, categories: bool = False) -> dd.DataFrame:
                 return self._categorize_dask_dataframe(ddf)
             return ddf
         else:
-            # TODO: this is suboptimal, but may not matter. In practice I'd only
+            # TODO(jcrist): this is suboptimal, but may not matter. In practice I'd only
             # expect the dask conversion path to be used for backends where dask
             # integration makes sense.
             df = table.to_pandas()

@@ -427,7 +434,9 @@ def to_dmatrix(self, X: Any) -> xgb.DMatrix:
         import xgboost as xgb

         df = self.to_pandas(X, categories=True)
-        return xgb.DMatrix(df[self.features], df[self.outcomes], enable_categorical=True)
+        return xgb.DMatrix(
+            df[self.features], df[self.outcomes], enable_categorical=True
+        )

     def to_dask_dmatrix(self, X: Any) -> xgb.dask.DaskDMatrix:
         """Transform X and return a ``xgboost.dask.DMatrix``
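
One behavioral nicety in the `set_output` hunk deserves a note: formatting the bad value with `!r` quotes it, so a typo'd string is easy to tell apart from other bad inputs. A standalone illustration, not from the codebase:

formats = ("default", "pandas", "polars", "pyarrow")
transform = "Pandas"  # wrong case on purpose

# Without !r the value renders bare; with !r it is quoted like source code.
print(f"`transform` must be one of {formats!r}, got {transform}")
# `transform` must be one of ('default', 'pandas', 'polars', 'pyarrow'), got Pandas
print(f"`transform` must be one of {formats!r}, got {transform!r}")
# `transform` must be one of ('default', 'pandas', 'polars', 'pyarrow'), got 'Pandas'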
ibisml/select.py (10 changes: 6 additions & 4 deletions)

@@ -2,12 +2,13 @@

 import re
 from collections.abc import Collection
-from typing import Callable, Union, ClassVar
+from typing import TYPE_CHECKING, Callable, ClassVar, Union

-import ibis.expr.types as ir
 import ibis.expr.datatypes as dt
+import ibis.expr.types as ir

-from ibisml.core import Metadata
+if TYPE_CHECKING:
+    from ibisml.core import Metadata


 class Selector:

@@ -285,7 +286,8 @@ class _TypeSelector(Selector):

     def matches(self, col: ir.Column, metadata: Metadata) -> bool:
         return metadata.get_categories(col.get_name()) is None and isinstance(
-            col.type(), self._type
+            col.type(),
+            self._type,
         )
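
The select.py hunk moves the `Metadata` import under `TYPE_CHECKING`, the usual fix for an import needed only by annotations (likely ruff's flake8-type-checking rules, though the exact rule code is an inference). The pattern in isolation, as a minimal sketch:

from __future__ import annotations  # annotations stay unevaluated at runtime

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only while type checking; no runtime import cost or cycles.
    from ibisml.core import Metadata


def describe(metadata: Metadata) -> str:  # hypothetical function, for illustration
    return repr(metadata)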
ibisml/steps/__init__.py (7 changes: 3 additions & 4 deletions)

@@ -1,9 +1,8 @@
-from ibisml.steps.common import Cast, Drop, MutateAt, Mutate
+from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
+from ibisml.steps.encode import CategoricalEncode, OneHotEncode
 from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
 from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
-from ibisml.steps.encode import OneHotEncode, CategoricalEncode
-from ibisml.steps.temporal import ExpandDateTime, ExpandDate, ExpandTime
-
+from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime

 __all__ = (
     "Cast",
ibisml/steps/common.py (5 changes: 2 additions & 3 deletions)

@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import Callable, Iterable, Any
+from typing import Any, Callable, Iterable

 import ibis.expr.datatypes as dt
 import ibis.expr.types as ir

@@ -134,8 +134,7 @@ def _repr(self) -> Iterable[tuple[str, Any]]:
             yield ("", self.inputs)
         if self.expr is not None:
             yield ("", self.expr)
-        for name, expr in self.named_exprs.items():
-            yield name, expr
+        yield from self.named_exprs.items()

     def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         self.columns_ = self.inputs.select_columns(table, metadata)
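
The `_repr` rewrite above is behavior-preserving: delegating with `yield from` over `.items()` yields exactly the `(name, expr)` tuples the explicit loop did. A quick equivalence check under that assumption:

named_exprs = {"total": "a + b", "ratio": "a / b"}  # stand-in expressions


def pairs_loop():
    for name, expr in named_exprs.items():
        yield name, expr


def pairs_delegate():
    yield from named_exprs.items()


assert list(pairs_loop()) == list(pairs_delegate())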
ibisml/steps/encode.py (27 changes: 19 additions & 8 deletions)

@@ -2,7 +2,7 @@

 import uuid
 from collections import defaultdict
-from typing import Any, Iterable
+from typing import TYPE_CHECKING, Any, Iterable

 import ibis
 import ibis.expr.types as ir

@@ -17,7 +17,8 @@ def _compute_categories(
     min_frequency: int | float | None = None,
     max_categories: int | None = None,
 ) -> dict[str, list[Any]]:
-    import pandas as pd
+    if TYPE_CHECKING:
+        import pandas as pd

     # We execute once for each type kind in the inputs. In the common case
     # (only string inputs) this means a single execution even for multiple

@@ -56,7 +57,7 @@ def collect(col: str) -> ir.Table:
     def process(df: pd.DataFrame) -> list[Any]:
         return df["value"].sort_values().to_list()

-    for group_type, group_cols in groups.items():
+    for group_cols in groups.values():
         query = ibis.union(*(collect(col) for col in group_cols))
         result_groups = query.execute().groupby("column")

@@ -137,7 +138,12 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
                 to_compute.append(column)

         categories.update(
-            _compute_categories(table, to_compute, self.min_frequency, self.max_categories)
+            _compute_categories(
+                table,
+                to_compute,
+                self.min_frequency,
+                self.max_categories,
+            ),
         )

         self.categories_ = categories

@@ -150,7 +156,7 @@ def transform_table(self, table: ir.Table) -> ir.Table:
                 (table[col] == cat).cast("int8").name(f"{col}_{cat}")
                 for col, cats in self.categories_.items()
                 for cat in cats
-            ]
+            ],
         ).drop(*self.categories_)

@@ -214,9 +220,14 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:

         columns = self.inputs.select_columns(table, metadata)
         # Filter out already categorized columns
-        columns = [column for column in columns if metadata.get_categories(column) is None]
+        columns = [
+            column for column in columns if metadata.get_categories(column) is None
+        ]
         categories = _compute_categories(
-            table, columns, self.min_frequency, self.max_categories
+            table,
+            columns,
+            self.min_frequency,
+            self.max_categories,
         )
         for col, cats in categories.items():
             metadata.set_categories(col, cats)

@@ -225,7 +236,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         suffix = uuid.uuid4().hex[:6]
         for col, cats in categories.items():
             table = pa.Table.from_pydict(
-                {f"key_{suffix}": cats, col: list(range(len(cats)))}
+                {f"key_{suffix}": cats, col: list(range(len(cats)))},
             )
             tables[col] = ibis.memtable(table, name=f"{col}_cats_{suffix}")
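
The loop change in `_compute_categories` drops a binding the body never used; iterating `.values()` says so directly. This looks like flake8-bugbear's B007 fix, though that attribution is an inference. In miniature:

groups = {"string": ["name", "city"], "int64": ["age"]}  # hypothetical grouping

# Before: group_type is bound but never read inside the body.
for group_type, group_cols in groups.items():
    pass

# After: the intent ("only the values matter") is explicit.
for group_cols in groups.values():
    print(group_cols)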
ibisml/steps/impute.py (14 changes: 8 additions & 6 deletions)

@@ -2,11 +2,11 @@

 from typing import Any, Iterable

+import ibis.expr.types as ir
+
 from ibisml.core import Metadata, Step
 from ibisml.select import SelectionType, selector

-import ibis.expr.types as ir
-

 def _fillna(col, val):
     if col.type().is_floating():

@@ -52,7 +52,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:

     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
-            [_fillna(table[c], self.fill_value).name(c) for c in self.columns_]
+            [_fillna(table[c], self.fill_value).name(c) for c in self.columns_],
         )

@@ -76,7 +76,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:

     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
-            [_fillna(table[c], v).name(c) for c, v in self.fill_values_.items()]
+            [_fillna(table[c], v).name(c) for c, v in self.fill_values_.items()],
        )

@@ -102,7 +102,8 @@ class ImputeMean(_BaseImpute):
     def _stat(self, col: ir.Column) -> ir.Scalar:
         if not isinstance(col, ir.NumericColumn):
             raise ValueError(
-                f"Cannot compute mean of {col.get_name()} - " "this column is not numeric"
+                f"Cannot compute mean of {col.get_name()} - "
+                "this column is not numeric",
             )
         return col.mean()

@@ -129,7 +130,8 @@ class ImputeMedian(_BaseImpute):
     def _stat(self, col: ir.Column) -> ir.Scalar:
         if not isinstance(col, ir.NumericColumn):
             raise ValueError(
-                f"Cannot compute median of {col.get_name()} - " "this column is not numeric"
+                f"Cannot compute median of {col.get_name()} - "
+                "this column is not numeric",
             )
         return col.median()
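
The ImputeMean/ImputeMedian hunks re-wrap an implicitly concatenated message. Adjacent string literals (f-strings included) fuse at compile time, so splitting them across lines changes nothing at runtime; a standalone check:

name = "price"
msg = (
    f"Cannot compute mean of {name} - "
    "this column is not numeric"
)
assert msg == "Cannot compute mean of price - this column is not numeric"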
(The remaining 6 of the 14 changed files are not shown here.)
