From 6fbf9d83589a78dcbd12b8b2122df7eb8d2488ad Mon Sep 17 00:00:00 2001
From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com>
Date: Fri, 13 Sep 2024 06:50:55 +0000
Subject: [PATCH] add unit test and fix unique col scaling

fit_table in ibis_ml/steps/_standardize.py now raises ValueError when
the fitted statistics of a column are degenerate:

- min/max statistics: the column's maximum equals its minimum
- mean/std statistics: the column's standard deviation is zero

tests/test_standardize.py exercises ScaleStandard and ScaleMinMax on a
0..99 integer column and checks both error messages on a one-row column.

---
Review note: the min/max message was built from two adjacent string
literals with no space after the dash ("... -" "the maximum ..."), so
it could not match the text expected by test_scale_unique_col. A
trailing space was added to the first literal. Hunk sizes are unchanged;
the post-image blob id on the _standardize.py index line was not
regenerated.

 ibis_ml/steps/_standardize.py | 16 +++++++++++--
 tests/test_standardize.py     | 45 +++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_standardize.py

diff --git a/ibis_ml/steps/_standardize.py b/ibis_ml/steps/_standardize.py
index 2ccefa1..70c9eb9 100644
--- a/ibis_ml/steps/_standardize.py
+++ b/ibis_ml/steps/_standardize.py
@@ -61,7 +61,14 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             self._fit_expr = [expr]
             results = expr.execute().to_dict("records")[0]
             for name in columns:
-                stats[name] = (results[f"{name}_max"], results[f"{name}_min"])
+                col_max = results[f"{name}_max"]
+                col_min = results[f"{name}_min"]
+                if col_max == col_min:
+                    raise ValueError(
+                        f"Cannot standardize {name!r} - "
+                        "the maximum and minimum values are equal"
+                    )
+                stats[name] = (col_max, col_min)
 
         self.stats_ = stats
 
@@ -121,7 +128,12 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             self._fit_expr = [table.aggregate(aggs)]
             results = self._fit_expr[-1].execute().to_dict("records")[0]
             for name in columns:
-                stats[name] = (results[f"{name}_mean"], results[f"{name}_std"])
+                col_std = results[f"{name}_std"]
+                if col_std == 0:
+                    raise ValueError(
+                        f"Cannot standardize {name!r} - the standard deviation is zero"
+                    )
+                stats[name] = (results[f"{name}_mean"], col_std)
 
         self.stats_ = stats
 
diff --git a/tests/test_standardize.py b/tests/test_standardize.py
new file mode 100644
index 0000000..1de975a
--- /dev/null
+++ b/tests/test_standardize.py
@@ -0,0 +1,45 @@
+import ibis
+import numpy as np
+import pandas as pd
+import pandas.testing as tm
+import pytest
+
+import ibis_ml as ml
+
+def test_scalestandard():
+    cols = np.arange(0, 100)
+    mean = np.mean(cols)
+    std = np.std(cols)
+    table = ibis.memtable({"col": cols})
+    step = ml.ScaleStandard("col")
+    step.fit_table(table, ml.core.Metadata())
+    result = step.transform_table(table)
+    expected = pd.DataFrame({"col": (cols - mean) / std})
+    tm.assert_frame_equal(result.execute(), expected, check_exact=False)
+
+
+def test_scaleminmax():
+    cols = np.arange(0, 100)
+    min_val = np.min(cols)
+    max_val = np.max(cols)
+    table = ibis.memtable({"col": cols})
+    step = ml.ScaleMinMax("col")
+    step.fit_table(table, ml.core.Metadata())
+    result = step.transform_table(table)
+    expected = pd.DataFrame({"col": (cols - min_val) / (max_val - min_val)})
+    tm.assert_frame_equal(result.execute(), expected, check_exact=False)
+
+
+@pytest.mark.parametrize(
+    ("model", "msg"),
+    [
+        ("ScaleStandard", "Cannot standardize 'col' - the standard deviation is zero"),
+        ("ScaleMinMax", "Cannot standardize 'col' - the maximum and minimum values are equal"),
+    ],
+)
+def test_scale_unique_col(model, msg):
+    table = ibis.memtable({"col": [1]})
+    scale_class = getattr(ml, model)
+    step = scale_class("col")
+    with pytest.raises(ValueError, match=msg):
+        step.fit_table(table, ml.core.Metadata())