From 6fbf9d83589a78dcbd12b8b2122df7eb8d2488ad Mon Sep 17 00:00:00 2001
From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com>
Date: Fri, 13 Sep 2024 06:50:55 +0000
Subject: [PATCH] add unit test and fix unique col scaling

---
 ibis_ml/steps/_standardize.py | 16 +++++++++++--
 tests/test_standardize.py     | 45 +++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_standardize.py

diff --git a/ibis_ml/steps/_standardize.py b/ibis_ml/steps/_standardize.py
index 2ccefa1..70c9eb9 100644
--- a/ibis_ml/steps/_standardize.py
+++ b/ibis_ml/steps/_standardize.py
@@ -61,7 +61,14 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             self._fit_expr = [expr]
             results = expr.execute().to_dict("records")[0]
             for name in columns:
-                stats[name] = (results[f"{name}_max"], results[f"{name}_min"])
+                col_max, col_min = results[f"{name}_max"], results[f"{name}_min"]
+                if col_max == col_min:
+                    # max == min means a zero-width range; reject it at fit time.
+                    raise ValueError(
+                        f"Cannot standardize {name!r} - "
+                        "the maximum and minimum values are equal"
+                    )
+                stats[name] = (col_max, col_min)
 
         self.stats_ = stats
 
@@ -121,7 +128,12 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             self._fit_expr = [table.aggregate(aggs)]
             results = self._fit_expr[-1].execute().to_dict("records")[0]
             for name in columns:
-                stats[name] = (results[f"{name}_mean"], results[f"{name}_std"])
+                std = results[f"{name}_std"]
+                if std == 0:
+                    # Zero spread: reject the column at fit time.
+                    msg = f"Cannot standardize {name!r} - the standard deviation is zero"
+                    raise ValueError(msg)
+                stats[name] = (results[f"{name}_mean"], std)
 
         self.stats_ = stats
 
diff --git a/tests/test_standardize.py b/tests/test_standardize.py
new file mode 100644
index 0000000..1de975a
--- /dev/null
+++ b/tests/test_standardize.py
@@ -0,0 +1,45 @@
+import ibis
+import numpy as np
+import pandas as pd
+import pandas.testing as tm
+import pytest
+
+import ibis_ml as ml
+
+def test_scalestandard():
+    """Check ScaleStandard output against a NumPy mean/std reference."""
+    values = np.arange(0, 100)
+    table = ibis.memtable({"col": values})
+    scaler = ml.ScaleStandard("col")
+    scaler.fit_table(table, ml.core.Metadata())
+    actual = scaler.transform_table(table).execute()
+    # np.std defaults to the population standard deviation (ddof=0).
+    expected = pd.DataFrame({"col": (values - np.mean(values)) / np.std(values)})
+    tm.assert_frame_equal(actual, expected, check_exact=False)
+
+
+def test_scaleminmax():
+    """Check ScaleMinMax output against a NumPy min/max reference."""
+    values = np.arange(0, 100)
+    table = ibis.memtable({"col": values})
+    scaler = ml.ScaleMinMax("col")
+    scaler.fit_table(table, ml.core.Metadata())
+    actual = scaler.transform_table(table).execute()
+    low, high = np.min(values), np.max(values)
+    expected = pd.DataFrame({"col": (values - low) / (high - low)})
+    tm.assert_frame_equal(actual, expected, check_exact=False)
+
+
+@pytest.mark.parametrize(
+    ("model", "msg"),
+    [
+        ("ScaleStandard", "Cannot standardize 'col' - the standard deviation is zero"),
+        ("ScaleMinMax", "Cannot standardize 'col' - the maximum and minimum values are equal"),
+    ],
+)
+def test_scale_unique_col(model, msg):
+    """Fitting a scaler on a single-value column raises ValueError."""
+    scaler = getattr(ml, model)("col")
+    single_row = ibis.memtable({"col": [1]})
+    with pytest.raises(ValueError, match=msg):
+        scaler.fit_table(single_row, ml.core.Metadata())