Skip to content

Commit 6e8ff28

Browse files
committed
Optimize memory usage, add bag info support
1 parent: 92354ee · commit: 6e8ff28

File tree

4 files changed

+133
-14
lines changed

4 files changed

+133
-14
lines changed

tabrepo/benchmark/experiment/experiment_constructor.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from autogluon.core.models import AbstractModel
1010

1111
from tabrepo.benchmark.models.wrapper.abstract_class import AbstractExecModel
12-
from tabrepo.benchmark.models.wrapper.AutoGluon_class import AGSingleWrapper
12+
from tabrepo.benchmark.models.wrapper.AutoGluon_class import AGSingleWrapper, AGSingleBagWrapper
1313
from tabrepo.benchmark.models.wrapper.ag_model import AGModelWrapper
1414
from tabrepo.benchmark.experiment.experiment_runner import ExperimentRunner, OOFExperimentRunner
1515
from tabrepo.benchmark.models.model_register import infer_model_cls
@@ -220,6 +220,7 @@ class AGModelExperiment(Experiment):
220220
experiment_kwargs: dict, optional
221221
The kwargs passed to the init of `experiment_cls`.
222222
"""
223+
_method_cls = AGSingleWrapper
223224

224225
def __init__(
225226
self,
@@ -251,7 +252,7 @@ def __init__(
251252
method_kwargs["fit_kwargs"]["raise_on_model_failure"] = raise_on_model_failure
252253
super().__init__(
253254
name=name,
254-
method_cls=AGSingleWrapper,
255+
method_cls=self._method_cls,
255256
method_kwargs={
256257
"model_cls": model_cls,
257258
"model_hyperparameters": model_hyperparameters,
@@ -328,6 +329,8 @@ class AGModelBagExperiment(AGModelExperiment):
328329
method_kwargs: dict, optional
329330
experiment_kwargs: dict, optional
330331
"""
332+
_method_cls = AGSingleBagWrapper
333+
331334
def __init__(
332335
self,
333336
name: str,

tabrepo/benchmark/experiment/experiment_runner.py

+43-8
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import datetime
44
from typing import Literal, Type
55

6+
import numpy as np
67
import pandas as pd
8+
from pandas.api.types import is_integer_dtype
79

810
from autogluon.core.data.label_cleaner import LabelCleaner, LabelCleanerDummy
911
from autogluon.core.metrics import get_metric, Scorer
@@ -13,17 +15,18 @@
1315
from tabrepo.benchmark.models.wrapper.abstract_class import AbstractExecModel
1416

1517

18+
# TODO: make a dataclass so type hinter is happy with subclasses?
1619
class ExperimentRunner:
1720
def __init__(
1821
self,
22+
*,
1923
method_cls: Type[AbstractExecModel],
2024
task: OpenMLTaskWrapper,
2125
fold: int,
2226
task_name: str,
2327
method: str,
2428
fit_args: dict | None = None,
2529
cleanup: bool = True,
26-
compute_simulation_artifacts: bool = True,
2730
input_format: Literal["openml", "csv"] = "openml",
2831
cacher: AbstractCacheFunction | None = None,
2932
):
@@ -36,7 +39,6 @@ def __init__(
3639
self.fit_args = fit_args
3740
self.cleanup = cleanup
3841
self.input_format = input_format
39-
self.compute_simulation_artifacts = compute_simulation_artifacts
4042
self.eval_metric_name = ag_eval_metric_map[self.task.problem_type] # FIXME: Don't hardcode eval metric
4143
self.eval_metric: Scorer = get_metric(metric=self.eval_metric_name, problem_type=self.task.problem_type)
4244
self.model = None
@@ -194,6 +196,19 @@ def _cleanup(self):
194196

195197

196198
class OOFExperimentRunner(ExperimentRunner):
199+
def __init__(
200+
self,
201+
*,
202+
compute_simulation_artifacts: bool = True,
203+
compute_bag_info: bool = True,
204+
optimize_simulation_artifacts_memory: bool = True,
205+
**kwargs,
206+
):
207+
super().__init__(**kwargs)
208+
self.compute_simulation_artifacts = compute_simulation_artifacts
209+
self.compute_bag_info = compute_bag_info
210+
self.optimize_simulation_artifacts_memory = optimize_simulation_artifacts_memory
211+
197212
def post_evaluate(self, out: dict) -> dict:
198213
out = super().post_evaluate(out=out)
199214
if self.compute_simulation_artifacts and self.model.can_get_oof:
@@ -205,16 +220,36 @@ def post_evaluate(self, out: dict) -> dict:
205220
if self.task.problem_type == "binary":
206221
simulation_artifact["pred_proba_dict_test"] = simulation_artifact["pred_proba_dict_test"].iloc[:, 1]
207222
simulation_artifact["y_test"] = self.label_cleaner.transform(self.y_test)
223+
224+
if self.optimize_simulation_artifacts_memory:
225+
# optimize memory
226+
simulation_artifact["y_test"].index = pd.to_numeric(simulation_artifact["y_test"].index, downcast="integer")
227+
simulation_artifact["y_val"].index = pd.to_numeric(simulation_artifact["y_val"].index, downcast="integer")
228+
229+
simulation_artifact["y_test_idx"] = simulation_artifact["y_test"].index.values
230+
simulation_artifact["y_val_idx"] = simulation_artifact["y_val"].index.values
231+
232+
simulation_artifact["y_test"] = simulation_artifact["y_test"].values
233+
simulation_artifact["y_val"] = simulation_artifact["y_val"].values
234+
if is_integer_dtype(simulation_artifact["y_test"]):
235+
simulation_artifact["y_test"] = pd.to_numeric(simulation_artifact["y_test"], downcast="integer")
236+
if is_integer_dtype(simulation_artifact["y_val"]):
237+
simulation_artifact["y_val"] = pd.to_numeric(simulation_artifact["y_val"], downcast="integer")
238+
239+
simulation_artifact["pred_proba_dict_test"] = simulation_artifact["pred_proba_dict_test"].astype(np.float32)
240+
simulation_artifact["pred_proba_dict_val"] = simulation_artifact["pred_proba_dict_val"].astype(np.float32)
241+
242+
simulation_artifact["pred_proba_dict_test"] = simulation_artifact["pred_proba_dict_test"].values
243+
simulation_artifact["pred_proba_dict_val"] = simulation_artifact["pred_proba_dict_val"].values
244+
208245
simulation_artifact["label"] = self.task.label
209246
simulation_artifact["metric"] = self.eval_metric_name
210247

211248
out["metric_error_val"] = self.model.get_metric_error_val()
212-
# out["metric_error_val"] = evaluate(
213-
# y_true=simulation_artifact["y_val"],
214-
# y_pred=self.label_cleaner.transform(out["predictions"]),
215-
# y_pred_proba=self.label_cleaner.transform_proba(out["probabilities"])
216-
# )
217-
# out["metric_error_val"] = self.eval_metric.error(simulation_artifact["y_val"], simulation_artifact["pred_proba_dict_val"])
249+
250+
if self.compute_bag_info and (self.model.can_get_per_child_oof and self.model.can_get_per_child_val_idx):
251+
simulation_artifact["bag_info"] = self.model.bag_artifact(X_test=self.X_test)
252+
218253

219254
simulation_artifact["pred_proba_dict_val"] = {self.method: simulation_artifact["pred_proba_dict_val"]}
220255
simulation_artifact["pred_proba_dict_test"] = {self.method: simulation_artifact["pred_proba_dict_test"]}

tabrepo/benchmark/models/wrapper/AutoGluon_class.py

+77-4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import shutil
55
from typing import Type
66

7+
import numpy as np
78
import pandas as pd
89

910
from tabrepo.benchmark.models.wrapper.abstract_class import AbstractExecModel
@@ -132,6 +133,9 @@ def __init__(
132133

133134
super().__init__(init_kwargs=init_kwargs, fit_kwargs=fit_kwargs, preprocess_data=preprocess_data, preprocess_label=preprocess_label, **kwargs)
134135

136+
def post_fit(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame):
137+
self.failure_artifact = self.get_metadata_failure()
138+
135139
def get_hyperparameters(self):
136140
hyperparameters = self.predictor.model_hyperparameters(model=self.predictor.model_best, output_format="user")
137141
return hyperparameters
@@ -146,22 +150,91 @@ def model_cls(self) -> Type["AbstractModel"]:
146150
model_cls = ag_model_register.key_to_cls(key=self._model_cls)
147151
return model_cls
148152

149-
def get_metadata(self) -> dict:
150-
metadata = {}
153+
def _load_model(self):
154+
model_names = self.predictor.model_names(can_infer=True)
155+
assert len(model_names) == 1
156+
model_name = self.predictor.model_names()[0]
157+
return self.predictor._trainer.load_model(model_name)
151158

152-
model = self.predictor._trainer.load_model(self.predictor.model_best)
153-
metadata["info"] = model.get_info(include_feature_metadata=False)
159+
def get_metadata_init(self) -> dict:
160+
metadata = {}
154161
metadata["hyperparameters"] = self.get_hyperparameters()
155162
metadata["model_cls"] = self.model_cls.__name__
156163
metadata["model_type"] = self.model_cls.ag_key # TODO: rename to ag_key?
157164
metadata["name_prefix"] = self.model_cls.ag_name # TODO: rename to ag_name?
158165
metadata["model_hyperparameters"] = self.model_hyperparameters
159166
metadata["init_kwargs_extra"] = self.init_kwargs_extra
160167
metadata["fit_kwargs_extra"] = self.fit_kwargs_extra
168+
return metadata
169+
170+
def get_metadata_fit(self) -> dict:
171+
metadata = {}
172+
model = self.predictor._trainer.load_model(self.predictor.model_best)
173+
metadata["info"] = model.get_info(include_feature_metadata=False)
161174
metadata["disk_usage"] = model.disk_usage()
162175
metadata["num_cpus"] = model.fit_num_cpus
163176
metadata["num_gpus"] = model.fit_num_gpus
164177
metadata["num_cpus_child"] = model.fit_num_cpus_child
165178
metadata["num_gpus_child"] = model.fit_num_gpus_child
166179
metadata["fit_metadata"] = model.get_fit_metadata()
167180
return metadata
181+
182+
def get_metadata_failure(self) -> dict:
183+
metadata = {
184+
"model_failures": self.predictor.model_failures()
185+
}
186+
return metadata
187+
188+
def get_metadata(self) -> dict:
189+
metadata = self.get_metadata_init()
190+
metadata_fit = self.get_metadata_fit()
191+
192+
metadata.update(metadata_fit)
193+
return metadata
194+
195+
196+
class AGSingleBagWrapper(AGSingleWrapper):
197+
can_get_per_child_oof = True
198+
can_get_per_child_val_idx = True
199+
200+
def bag_artifact(self, X_test: pd.DataFrame):
201+
model = self._load_model()
202+
bag_info = {}
203+
bag_info["pred_proba_test_per_child"] = self.get_per_child_test(X_test=X_test, model=model)
204+
bag_info["val_idx_per_child"] = self.get_per_child_val_idx(model=model)
205+
return bag_info
206+
207+
def get_per_child_val_idx(self, model=None) -> list[np.ndarray]:
208+
if model is None:
209+
model = self._load_model()
210+
X, y = self.predictor.load_data_internal()
211+
all_kfolds = []
212+
# TODO: Make this a bagged ensemble method
213+
if model._child_oof:
214+
all_kfolds = [(None, X.index.values)]
215+
else:
216+
for n_repeat, k in enumerate(model._k_per_n_repeat):
217+
kfolds = model._cv_splitters[n_repeat].split(X=X, y=y)
218+
cur_kfolds = kfolds[n_repeat * k: (n_repeat + 1) * k]
219+
all_kfolds += cur_kfolds
220+
221+
val_idx_per_child = []
222+
for fold_idx, (train_idx, val_idx) in enumerate(all_kfolds):
223+
val_idx = pd.to_numeric(val_idx, downcast="integer") # memory opt
224+
val_idx_per_child.append(val_idx)
225+
226+
return val_idx_per_child
227+
228+
# TODO: Can avoid predicting on test twice by doing it all in one go
229+
def get_per_child_test(self, X_test: pd.DataFrame, model=None) -> list[np.ndarray]:
230+
if model is None:
231+
model = self._load_model()
232+
X_test_inner = self.predictor.transform_features(data=X_test, model=model.name)
233+
234+
if model.can_predict_proba():
235+
per_child_test_preds = model.predict_proba_children(X=X_test_inner)
236+
else:
237+
per_child_test_preds = model.predict_children(X=X_test_inner)
238+
239+
per_child_test_preds = [preds_child.astype(np.float32) for preds_child in per_child_test_preds] # memory opt
240+
return per_child_test_preds

tabrepo/benchmark/models/wrapper/abstract_class.py

+8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
class AbstractExecModel:
1212
can_get_oof = False
13+
can_get_per_child_oof = False
14+
can_get_per_child_test = False
1315

1416
# TODO: Prateek: Find a way to put AutoGluon as default - in the case the user does not want their own class
1517
def __init__(
@@ -25,6 +27,7 @@ def __init__(
2527
self.preprocess_label = preprocess_label
2628
self.label_cleaner: LabelCleaner = None
2729
self._feature_generator = None
30+
self.failure_artifact = None
2831

2932
def transform_y(self, y: pd.Series) -> pd.Series:
3033
return self.label_cleaner.transform(y)
@@ -54,6 +57,9 @@ def _preprocess_fit_transform(self, X: pd.DataFrame, y: pd.Series):
5457
y = self.transform_y(y)
5558
return X, y
5659

60+
def post_fit(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame):
61+
pass
62+
5763
# TODO: Prateek, Add a toggle here to see if user wants to fit or fit and predict, also add model saving functionality
5864
# TODO: Nick: Temporary name
5965
def fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame):
@@ -68,6 +74,8 @@ def fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame):
6874
with (Timer() as timer_fit):
6975
self.fit(X, y)
7076

77+
self.post_fit(X=X, y=y, X_test=X_test)
78+
7179
if self.problem_type in ['binary', 'multiclass']:
7280
with Timer() as timer_predict:
7381
y_pred_proba = self.predict_proba(X_test)

0 commit comments

Comments (0)