Fix the coercion of scores to floats in the optimizer (#789)

motus · bpkroth · web-flow · commit 6fe46caa9eb0 · 2024-07-22T14:20:30.000-05:00
Closes #785. Also, add more unit tests to make sure the optimizer handles string inputs correctly. --------- Co-authored-by: Brian Kroth <bpkroth@users.noreply.github.com>
diff --git a/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py b/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py
@@ -116,20 +116,19 @@ def bulk_register(
             pd.DataFrame([{} if score is None else score for score in scores])
         )
 
-        opt_targets = list(self._opt_targets)
         if status is not None:
             # Select only the completed trials, set scores for failed trials to +inf.
             df_status = pd.Series(status)
             # TODO: Be more flexible with values used for failed trials (not just +inf).
             # Issue: https://github.com/microsoft/MLOS/issues/523
-            df_scores.loc[df_status != Status.SUCCEEDED, opt_targets] = float("inf")
+            df_scores[df_status != Status.SUCCEEDED] = float("inf")
             df_status_completed = df_status.apply(Status.is_completed)
             df_configs = df_configs[df_status_completed]
             df_scores = df_scores[df_status_completed]
 
         # TODO: Specify (in the config) which metrics to pass to the optimizer.
         # Issue: https://github.com/microsoft/MLOS/issues/745
-        self._opt.register(configs=df_configs, scores=df_scores[opt_targets].astype(float))
+        self._opt.register(configs=df_configs, scores=df_scores)
 
         if _LOG.isEnabledFor(logging.DEBUG):
             (score, _) = self.get_best_observation()
@@ -138,10 +137,19 @@ def bulk_register(
         return True
 
     def _adjust_signs_df(self, df_scores: pd.DataFrame) -> pd.DataFrame:
-        """In-place adjust the signs of the scores for MINIMIZATION problem."""
-        for opt_target, opt_dir in self._opt_targets.items():
-            df_scores[opt_target] *= opt_dir
-        return df_scores
+        """Coerce optimization target scores to floats and adjust the signs for
+        MINIMIZATION problem.
+        """
+        df_targets = df_scores[list(self._opt_targets)]
+        try:
+            return df_targets.astype(float) * self._opt_targets.values()
+        except ValueError as ex:
+            _LOG.error(
+                "Some score values cannot be converted to float - check the data ::\n%s",
+                df_targets,
+                exc_info=True,
+            )
+            raise ValueError("Some score values cannot be converted to float") from ex
 
     def _to_df(self, configs: Sequence[Dict[str, TunableValue]]) -> pd.DataFrame:
         """
diff --git a/mlos_bench/mlos_bench/tests/optimizers/mlos_core_opt_df_test.py b/mlos_bench/mlos_bench/tests/optimizers/mlos_core_opt_df_test.py
@@ -23,6 +23,10 @@ def mlos_core_optimizer(tunable_groups: TunableGroups) -> MlosCoreOptimizer:
         "optimizer_type": "FLAML",
         "max_suggestions": 10,
         "seed": SEED,
+        "optimization_targets": {
+            "latency": "min",
+            "throughput": "max",
+        },
     }
     return MlosCoreOptimizer(tunable_groups, test_opt_config)
 
@@ -74,3 +78,85 @@ def test_df(mlos_core_optimizer: MlosCoreOptimizer, mock_configs: List[dict]) ->
             "vmSize": "Standard_B2s",
         },
     ]
+
+
+def test_df_str(mlos_core_optimizer: MlosCoreOptimizer, mock_configs: List[dict]) -> None:
+    """Test `MlosCoreOptimizer._to_df()` type coercion on tunables with string
+    values.
+    """
+    df_config_orig = mlos_core_optimizer._to_df(mock_configs)
+    df_config_str = mlos_core_optimizer._to_df(
+        [{key: str(val) for (key, val) in config.items()} for config in mock_configs]
+    )
+    assert df_config_orig.equals(df_config_str)
+
+
+def test_adjust_signs_df(mlos_core_optimizer: MlosCoreOptimizer) -> None:
+    """Test `MlosCoreOptimizer._adjust_signs_df()` on different types of inputs."""
+    df_scores_input = pandas.DataFrame(
+        {
+            "latency": [88.88, 66.66, 99.99, None],
+            "throughput": [111, 222, 333, None],
+        }
+    )
+
+    df_scores_output = pandas.DataFrame(
+        {
+            "latency": [88.88, 66.66, 99.99, float("NaN")],
+            "throughput": [-111, -222, -333, float("NaN")],
+        }
+    )
+
+    # Make sure we adjust the signs for minimization.
+    df_scores = mlos_core_optimizer._adjust_signs_df(df_scores_input)
+    assert df_scores.equals(df_scores_output)
+
+    # Check that the same operation works for string inputs.
+    df_scores = mlos_core_optimizer._adjust_signs_df(df_scores_input.astype(str))
+    assert df_scores.equals(df_scores_output)
+
+
+def test_adjust_signs_df_nan(mlos_core_optimizer: MlosCoreOptimizer) -> None:
+    """Test `MlosCoreOptimizer._adjust_signs_df()` handling None, NaN, and Inf
+    values.
+    """
+    df_scores = mlos_core_optimizer._adjust_signs_df(
+        pandas.DataFrame(
+            {
+                "latency": ["88.88", "NaN", "Inf", "-Inf", None],
+                "throughput": ["111", "NaN", "Inf", "-Inf", None],
+            }
+        )
+    )
+
+    assert df_scores.equals(
+        pandas.DataFrame(
+            {
+                "latency": [88.88, float("NaN"), float("Inf"), float("-Inf"), float("NaN")],
+                "throughput": [-111, float("NaN"), float("-Inf"), float("Inf"), float("NaN")],
+            }
+        )
+    )
+
+
+def test_adjust_signs_df_invalid(mlos_core_optimizer: MlosCoreOptimizer) -> None:
+    """Test `MlosCoreOptimizer._adjust_signs_df()` on invalid inputs."""
+    with pytest.raises(ValueError):
+        mlos_core_optimizer._adjust_signs_df(
+            pandas.DataFrame(
+                {
+                    "latency": ["INVALID"],
+                    "throughput": ["no input"],
+                }
+            )
+        )
+
+    with pytest.raises(ValueError):
+        mlos_core_optimizer._adjust_signs_df(
+            pandas.DataFrame(
+                {
+                    "latency": ["88.88", ""],
+                    "throughput": ["111", ""],
+                }
+            )
+        )