
Commit 2e34c95

update experiment_utils.py
Add results classes
1 parent 56ae88c commit 2e34c95

File tree

8 files changed (+727, -39 lines)


tabrepo/benchmark/experiment/experiment_utils.py (+147, -38)

@@ -1,10 +1,12 @@
 from __future__ import annotations
 
+import copy
 from typing import Any, Literal, Type
 
 import pandas as pd
-from tabrepo.benchmark.task.openml import OpenMLTaskWrapper, OpenMLS3TaskWrapper
 
+from tabrepo.benchmark.result import AGBagResult, BaselineResult, ConfigResult
+from tabrepo.benchmark.task.openml import OpenMLTaskWrapper, OpenMLS3TaskWrapper
 from tabrepo.repository.repo_utils import convert_time_infer_s_from_batch_to_sample as _convert_time_infer_s_from_batch_to_sample
 from tabrepo.utils.cache import AbstractCacheFunction, CacheFunctionPickle, CacheFunctionDummy
 from tabrepo import EvaluationRepository
@@ -212,31 +214,151 @@ def generate_repo_from_experiments(
 
         return repo
 
-    def repo_from_results(
-        self,
-        results_lst: list[dict[str, Any]],
-        convert_time_infer_s_from_batch_to_sample: bool = True,  # FIXME: Remove this, it should be False eventually
-    ) -> EvaluationRepository:
-        configs_hyperparameters = self.get_configs_hyperparameters(results_lst=results_lst)
+    # TODO: Maybe calibrating model binary pred proba will improve ensemble roc_auc?
+    def temp_scale(self, y_val, y_pred_proba_val, method: str = "v2"):
+        init_val = 1.0
+        max_iter = 200
+        lr = 0.1
+        from tabrepo.utils.temp_scaling.calibrators import AutoGluonTemperatureScalingCalibrator, TemperatureScalingCalibrator, AutoGluonTemperatureScalingCalibratorFixed, TemperatureScalingCalibratorFixed
+        if method == "v1":
+            calibrator = AutoGluonTemperatureScalingCalibrator(init_val=init_val, max_iter=max_iter, lr=lr)
+        elif method == "v2":
+            calibrator = TemperatureScalingCalibrator(max_iter=max_iter, lr=lr)
+        elif method == "v1_fix":
+            calibrator = AutoGluonTemperatureScalingCalibratorFixed(init_val=init_val, max_iter=max_iter, lr=lr)
+        elif method == "v2_fix":
+            calibrator = TemperatureScalingCalibratorFixed(max_iter=max_iter, lr=lr)
+        else:
+            raise ValueError(f"Unknown temp_scale method: {method}")
+        calibrator.fit(X=y_pred_proba_val, y=y_val)
+        return calibrator
+
+    def generate_calibrated(self, result, method: str = "v2", name_suffix: str = "_CAL"):
+        sim_artifact = result["simulation_artifacts"]
+        metric = sim_artifact["metric"]
+        from autogluon.core.metrics import get_metric
+        problem_type = sim_artifact["problem_type_transform"]
+        ag_metric = get_metric(metric=metric, problem_type=problem_type)
+        y_test = sim_artifact["y_test"]
+
+        y_val = sim_artifact["y_val"]
+        y_pred_proba_val = sim_artifact["pred_val"]
+        calibrator = self.temp_scale(y_val=y_val, y_pred_proba_val=y_pred_proba_val, method=method)
+        y_pred_proba_test = sim_artifact["pred_test"]
+        y_pred_proba_test_scaled = calibrator.predict_proba(y_pred_proba_test)
+        y_pred_proba_val_scaled = calibrator.predict_proba(y_pred_proba_val)
+
+        # metric_error_og = ag_metric.error(y_test, y_pred_proba_test)
+        metric_error_cal = ag_metric.error(y_test, y_pred_proba_test_scaled)
+        metric_error_val_og = ag_metric.error(y_val, y_pred_proba_val)
+        metric_error_val_cal = ag_metric.error(y_val, y_pred_proba_val_scaled)
+
+        if metric_error_val_cal > metric_error_val_og:
+            print(f"WARNING:")
+            print(metric_error_val_cal, metric_error_val_og)
+            print(result["framework"], result["dataset"], result["fold"])
+
+        result_calibrated = copy.deepcopy(result)
+        result_calibrated["metric_error"] = metric_error_cal
+        result_calibrated["metric_error_val"] = metric_error_val_cal
+        result_calibrated["simulation_artifacts"]["pred_test"] = y_pred_proba_test_scaled
+        result_calibrated["simulation_artifacts"]["pred_val"] = y_pred_proba_val_scaled
+        result_calibrated["framework"] = result_calibrated["framework"] + name_suffix
+        # FIXME: Fix bag children? Should they be calibrated?
+
+        return result_calibrated
+
+    def _align_result_input_format(self, result: dict | BaselineResult) -> BaselineResult:
+        """
+        Converts results in old format to new format
+        Keeps results in new format as-is.
 
-        results_baselines = [result["df_results"] for result in results_lst if result["simulation_artifacts"] is None]
-        df_baselines = pd.concat(results_baselines, ignore_index=True) if results_baselines else None
+        This enables the use of results in the old format alongside results in the new format.
 
-        results_configs = [result for result in results_lst if result["simulation_artifacts"] is not None]
+        Parameters
+        ----------
+        result
 
-        results_lst_simulation_artifacts = [result["simulation_artifacts"] for result in results_configs]
-        results_lst_df = [result["df_results"] for result in results_configs]
+        Returns
+        -------
 
-        if results_lst_df:
-            df_configs = pd.concat(results_lst_df, ignore_index=True)
-            if convert_time_infer_s_from_batch_to_sample:
-                df_configs = _convert_time_infer_s_from_batch_to_sample(df=df_configs, task_metadata=self.task_metadata)
+        """
+        if isinstance(result, BaselineResult):
+            return result
+        assert isinstance(result, dict)
+        result_cls = BaselineResult
+        sim_artifacts = result.get("simulation_artifacts", None)
+        if sim_artifacts is not None:
+            assert isinstance(sim_artifacts, dict)
+            dataset = result["dataset"]
+            fold = result["fold"]
+            result_cls = ConfigResult
+            if list(sim_artifacts.keys()) == [dataset]:
+                sim_artifacts = sim_artifacts[dataset][fold]
+            bag_info = sim_artifacts.get("bag_info", None)
+            if bag_info is not None:
+                assert isinstance(bag_info, dict)
+                result_cls = AGBagResult
+        result_obj = result_cls(result=result, convert_format=True, inplace=False)
+        return result_obj
+
+    def _calibrate(self, result: ConfigResult) -> ConfigResult:
+        problem_type = result.result["problem_type"]
+        if problem_type == "multiclass":
+            # FIXME: What about binary?
+            result_calibrated = result.generate_calibrated(method="v2", name_suffix="_CAL")
         else:
-            df_configs = None
+            result_calibrated = copy.deepcopy(result)
+            result_calibrated.result["framework"] = result_calibrated.result["framework"] + "_CAL"
+        return result_calibrated
 
-        if df_baselines is not None:
-            if convert_time_infer_s_from_batch_to_sample:
-                df_baselines = _convert_time_infer_s_from_batch_to_sample(df=df_baselines, task_metadata=self.task_metadata)
+    def repo_from_results(
+        self,
+        results_lst: list[dict[str, Any] | BaselineResult],
+        calibrate: bool = False,
+        include_holdout: bool = False,
+        convert_time_infer_s_from_batch_to_sample: bool = True,  # FIXME: Remove this, it should be False eventually
+    ) -> EvaluationRepository:
+        results_lst: list[BaselineResult] = [self._align_result_input_format(result) for result in results_lst]
+
+        results_configs: list[ConfigResult] = []
+        results_baselines: list[BaselineResult] = []
+        for result in results_lst:
+            if isinstance(result, ConfigResult):
+                results_configs.append(result)
+            else:
+                results_baselines.append(result)
+
+        n_configs = len(results_configs)
+        if calibrate:
+            results_configs_calibrated = []
+            for i, result in enumerate(results_configs):
+                if i % 100 == 0:
+                    print(f"Calibrating: {i+1}/{n_configs}\t{result.framework}")
+                results_configs_calibrated.append(self._calibrate(result=result))
+            results_configs += results_configs_calibrated
+
+        n_configs = len(results_configs)
+        if include_holdout:
+            for r_i, result in enumerate(results_configs):
+                if isinstance(result, AGBagResult):
+                    if r_i % 100 == 0:
+                        print(f"Generating Holdout Results: {r_i + 1}/{n_configs}\t{result.framework}")
+                    results_new: list[BaselineResult] = result.bag_artifacts()
+                    results_baselines += results_new
+
+        results_lst_df = [result.compute_df_result() for result in results_configs]
+        results_lst_df_baselines = [result.compute_df_result() for result in results_baselines]
+        df_configs = pd.concat(results_lst_df, ignore_index=True) if results_lst_df else None
+        df_baselines = pd.concat(results_lst_df_baselines, ignore_index=True) if results_lst_df_baselines else None
+
+        if df_configs is not None and convert_time_infer_s_from_batch_to_sample:
+            df_configs = _convert_time_infer_s_from_batch_to_sample(df=df_configs, task_metadata=self.task_metadata)
+        if df_baselines is not None and convert_time_infer_s_from_batch_to_sample:
+            df_baselines = _convert_time_infer_s_from_batch_to_sample(df=df_baselines, task_metadata=self.task_metadata)
+
+        configs_hyperparameters = self.get_configs_hyperparameters(results_configs=results_configs)
+        results_lst_simulation_artifacts = [result.generate_old_sim_artifact() for result in results_configs]
 
         # TODO: per-fold pred_proba_test and pred_proba_val (indices?)
         repo: EvaluationRepository = EvaluationRepository.from_raw(
@@ -249,25 +371,12 @@ def repo_from_results(
 
         return repo
 
-    def get_configs_hyperparameters(self, results_lst: list[dict]) -> dict | None:
+    def get_configs_hyperparameters(self, results_configs: list[ConfigResult]) -> dict | None:
         configs_hyperparameters = {}
-        for result in results_lst:
-            if "method_metadata" in result and "model_hyperparameters" in result["method_metadata"]:
-                method_name = result["framework"]
-                if method_name in configs_hyperparameters:
-                    continue
-                method_metadata = result["method_metadata"]
-                model_hyperparameters = method_metadata["model_hyperparameters"]
-                model_cls = method_metadata.get("model_cls", None)
-                model_type = method_metadata.get("model_type", None)
-                name_prefix = method_metadata.get("name_prefix", None)
-
-                configs_hyperparameters[method_name] = dict(
-                    model_cls=model_cls,
-                    model_type=model_type,
-                    name_prefix=name_prefix,
-                    hyperparameters=model_hyperparameters,
-                )
+        for result in results_configs:
+            if result.framework in configs_hyperparameters:
+                continue
+            configs_hyperparameters[result.framework] = result.hyperparameters
         if not configs_hyperparameters:
             configs_hyperparameters = None
         return configs_hyperparameters
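
For orientation, a minimal usage sketch of the updated repo_from_results entry point follows. The runner object name and the loader are placeholders (the diff only shows the method on self); the flags and their behavior come from the code above.

# Hypothetical usage sketch, not part of the commit. `runner` stands in for the
# object in experiment_utils.py that defines repo_from_results; `load_results()`
# is a placeholder for however the raw results are obtained.
from typing import Any

from tabrepo.benchmark.result import BaselineResult

results_lst: list[dict[str, Any] | BaselineResult] = load_results()  # old-format dicts and/or new result objects

repo = runner.repo_from_results(
    results_lst=results_lst,   # dicts are converted via _align_result_input_format
    calibrate=True,            # appends "_CAL" copies of each config (temperature-scaled when multiclass)
    include_holdout=True,      # appends "_HOLDOUT" baselines built from a single bagged child of each AGBagResult
)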

tabrepo/benchmark/result/__init__.py (+3)

@@ -0,0 +1,3 @@
+from .ag_bag_result import AGBagResult
+from .baseline_result import BaselineResult
+from .config_result import ConfigResult
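
The new package exposes all three result classes from a single import path, which is what the updated experiment_utils.py above relies on. A quick check of the relationship stated in the class definitions further below:

# Import path introduced by this commit.
from tabrepo.benchmark.result import AGBagResult, BaselineResult, ConfigResult

assert issubclass(AGBagResult, ConfigResult)  # AGBagResult subclasses ConfigResult (see the new file below)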
New file (+8; file path not shown)

@@ -0,0 +1,8 @@
+import copy
+
+
+class AbstractResult:
+    def __init__(self, result: dict, inplace: bool = False):
+        if not inplace:
+            result = copy.deepcopy(result)
+        self.result: dict = result
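
A small illustrative sketch of the copy semantics AbstractResult introduces: with the default inplace=False the wrapped dict is deep-copied, so downstream mutation (for example appending a name suffix) does not touch the caller's dict. The class is repeated verbatim so the snippet is self-contained; the toy dict keys mirror the result payloads used elsewhere in this commit.

# Illustrative sketch of AbstractResult's inplace handling (class copied from the diff above).
import copy

class AbstractResult:
    def __init__(self, result: dict, inplace: bool = False):
        if not inplace:
            result = copy.deepcopy(result)
        self.result: dict = result

raw = {"framework": "GBM_c1", "metric_error": 0.12}  # toy payload for illustration

wrapped = AbstractResult(result=raw)                  # inplace=False -> deep copy
wrapped.result["framework"] += "_CAL"
assert raw["framework"] == "GBM_c1"                   # caller's dict is untouched

shared = AbstractResult(result=raw, inplace=True)     # no copy: wrapper and caller share the dict
shared.result["metric_error"] = 0.10
assert raw["metric_error"] == 0.10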
tabrepo/benchmark/result/ag_bag_result.py (+111)

@@ -0,0 +1,111 @@
+
+import copy
+
+from autogluon.core.metrics import get_metric
+import numpy as np
+import pandas as pd
+
+from tabrepo.benchmark.result.baseline_result import BaselineResult
+from tabrepo.benchmark.result.config_result import ConfigResult
+
+
+class AGBagResult(ConfigResult):
+    @property
+    def bag_info(self) -> dict:
+        return self.simulation_artifacts["bag_info"]
+
+    def _align_result_input_format(self) -> dict:
+        self.result = super()._align_result_input_format()
+
+        bag_info = self.result["simulation_artifacts"]["bag_info"]
+        if "pred_proba_test_per_child" in bag_info:
+            bag_info["pred_test_per_child"] = bag_info.pop("pred_proba_test_per_child")
+        num_samples_val = len(self.result["simulation_artifacts"]["y_val_idx"])
+        if "val_idx_per_child" in bag_info and "pred_val_per_child" not in bag_info:
+            # Ensure no repeated bagging
+            assert num_samples_val == sum([len(val_idx_child) for val_idx_child in bag_info["val_idx_per_child"]])
+            # convert to pred_val_per_child
+            pred_val_per_child = []
+            for val_idx_child in bag_info["val_idx_per_child"]:
+                pred_val_child_cur = self.result["simulation_artifacts"]["pred_val"][val_idx_child]
+                pred_val_per_child.append(pred_val_child_cur)
+            bag_info["pred_val_per_child"] = pred_val_per_child
+
+        pred_val = self._pred_val_from_children()
+        if "pred_val" in self.result["simulation_artifacts"]:
+            assert np.isclose(pred_val, self.simulation_artifacts["pred_val"]).all()
+        self.simulation_artifacts["pred_val"] = pred_val
+        pred_test = self._pred_test_from_children()
+        if "pred_test" in self.result["simulation_artifacts"]:
+            assert np.isclose(pred_test, self.simulation_artifacts["pred_test"]).all()
+        self.simulation_artifacts["pred_test"] = pred_test
+        return self.result
+
+    def _pred_val_from_children(self) -> np.ndarray:
+        num_samples_val = len(self.simulation_artifacts["y_val_idx"])
+        if len(self.bag_info["pred_val_per_child"][0].shape) == 1:
+            pred_val = np.zeros(dtype=np.float64, shape=num_samples_val)
+        else:
+            pred_val = np.zeros(dtype=np.float64, shape=(num_samples_val, self.bag_info["pred_val_per_child"][0].shape[1]))
+        val_child_count = np.zeros(dtype=int, shape=num_samples_val)
+        for val_idx_child, pred_val_child in zip(self.bag_info["val_idx_per_child"], self.bag_info["pred_val_per_child"]):
+            val_child_count[val_idx_child] += 1
+            pred_val[val_idx_child] += pred_val_child
+            pass
+        pred_val = pred_val / val_child_count[:, None]
+        pred_val = pred_val.astype(np.float32)
+        return pred_val
+
+    def _pred_test_from_children(self) -> np.ndarray:
+        num_samples_test = len(self.simulation_artifacts["y_test_idx"])
+        if len(self.bag_info["pred_val_per_child"][0].shape) == 1:
+            pred_test = np.zeros(dtype=np.float64, shape=num_samples_test)
+        else:
+            pred_test = np.zeros(dtype=np.float64, shape=(num_samples_test, self.bag_info["pred_test_per_child"][0].shape[1]))
+        num_children = len(self.bag_info["pred_test_per_child"])
+        for pred_test_child in self.bag_info["pred_test_per_child"]:
+            pred_test += pred_test_child
+        pred_test = pred_test / num_children
+        pred_test = pred_test.astype(np.float32)
+        return pred_test
+
+    def bag_artifacts(self) -> list[BaselineResult]:
+        results_new = []
+        sim_artifact = self.simulation_artifacts
+        pred_proba_test_per_child = sim_artifact["bag_info"]["pred_test_per_child"]
+        num_children = len(pred_proba_test_per_child)
+        metric = self.result["metric"]
+        framework = self.result["framework"]
+
+        problem_type = self.result["problem_type"]
+        ag_metric = get_metric(metric=metric, problem_type=problem_type)
+        y_test = sim_artifact["y_test"]
+
+        y_test_idx = sim_artifact["y_test_idx"]
+        y_test = pd.Series(data=y_test, index=y_test_idx)
+
+        if num_children > 1:
+            for i, c in enumerate(pred_proba_test_per_child):
+                if i != 0:
+                    break
+                if problem_type == "multiclass":
+                    y_pred_proba_test_child = pd.DataFrame(data=c, index=y_test_idx, columns=sim_artifact["ordered_class_labels_transformed"])
+                else:
+                    y_pred_proba_test_child = pd.Series(data=c, index=y_test_idx)
+
+                # FIXME: needs to work with predictions too, not just pred proba
+                metric_error = ag_metric.error(y_test, y_pred_proba_test_child)
+
+                result_baseline_new = copy.deepcopy(self.result)
+
+                holdout_name = framework + "_HOLDOUT"
+                result_baseline_new["framework"] = holdout_name
+                result_baseline_new["metric_error"] = metric_error
+                result_baseline_new["time_train_s"] /= num_children
+                result_baseline_new["time_infer_s"] /= num_children
+                result_baseline_new = BaselineResult(result=result_baseline_new, convert_format=False, inplace=True)
+                results_new.append(result_baseline_new)
+                print(i, metric_error)
+
+        print("ens", self.result["metric_error"])
+        return results_new
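
The heart of _pred_val_from_children above is an out-of-fold average: each bagged child predicts only on its own validation indices, and rows covered by several children are averaged. Below is a standalone numpy sketch of that reduction on fabricated toy arrays (the real method pulls them from bag_info and assumes every validation row is covered by at least one child):

# Standalone sketch of the per-child out-of-fold averaging used by _pred_val_from_children.
# The indices and predictions below are fabricated for illustration only.
import numpy as np

num_samples_val, num_classes = 6, 3
val_idx_per_child = [np.array([0, 1, 2]), np.array([3, 4, 5]), np.array([0, 3])]
rng = np.random.default_rng(0)
pred_val_per_child = [rng.random((len(idx), num_classes)) for idx in val_idx_per_child]

pred_val = np.zeros((num_samples_val, num_classes), dtype=np.float64)
val_child_count = np.zeros(num_samples_val, dtype=int)
for val_idx_child, pred_val_child in zip(val_idx_per_child, pred_val_per_child):
    val_child_count[val_idx_child] += 1        # how many children saw each validation row
    pred_val[val_idx_child] += pred_val_child  # accumulate the per-row predictions
pred_val = (pred_val / val_child_count[:, None]).astype(np.float32)  # mean over the children that saw each row

print(val_child_count)  # [2 1 1 2 1 1] for the toy indices above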
