4
4
import shutil
5
5
from typing import Type
6
6
7
+ import numpy as np
7
8
import pandas as pd
8
9
9
10
from tabrepo .benchmark .models .wrapper .abstract_class import AbstractExecModel
@@ -132,6 +133,9 @@ def __init__(
132
133
133
134
super ().__init__ (init_kwargs = init_kwargs , fit_kwargs = fit_kwargs , preprocess_data = preprocess_data , preprocess_label = preprocess_label , ** kwargs )
134
135
136
def post_fit(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame):
    """Store the predictor's failure metadata on ``self.failure_artifact``.

    ``X``, ``y`` and ``X_test`` are accepted but not used here.
    NOTE(review): presumably a hook invoked by the base wrapper once fitting
    has finished -- confirm against the parent class.
    """
    failure_info = self.get_metadata_failure()
    self.failure_artifact = failure_info
138
+
135
139
def get_hyperparameters(self):
    """Return the hyperparameters of the predictor's best model.

    The values are requested from the predictor with ``output_format="user"``.
    """
    predictor = self.predictor
    return predictor.model_hyperparameters(
        model=predictor.model_best,
        output_format="user",
    )
@@ -146,22 +150,91 @@ def model_cls(self) -> Type["AbstractModel"]:
146
150
model_cls = ag_model_register .key_to_cls (key = self ._model_cls )
147
151
return model_cls
148
152
149
- def get_metadata (self ) -> dict :
150
- metadata = {}
153
def _load_model(self):
    """Load the single inferable model held by the predictor.

    Returns:
        The model object returned by the trainer's ``load_model``.

    Raises:
        AssertionError: if the predictor does not report exactly one model
            with ``can_infer=True``.
    """
    model_names = self.predictor.model_names(can_infer=True)
    assert len(model_names) == 1, (
        f"Expected exactly one model that can infer, found {len(model_names)}: {model_names}"
    )
    # Use the name that was just validated. Re-querying without the
    # `can_infer` filter could return a different first model.
    model_name = model_names[0]
    return self.predictor._trainer.load_model(model_name)
151
158
152
- model = self . predictor . _trainer . load_model (self . predictor . model_best )
153
- metadata [ "info" ] = model . get_info ( include_feature_metadata = False )
159
def get_metadata_init(self) -> dict:
    """Build the metadata that describes how this wrapper was configured.

    Returns:
        A dict with the fitted hyperparameters, the model class name and its
        ``ag_key`` / ``ag_name``, and the wrapper's ``model_hyperparameters``,
        ``init_kwargs_extra`` and ``fit_kwargs_extra`` attributes.
    """
    return {
        "hyperparameters": self.get_hyperparameters(),
        "model_cls": self.model_cls.__name__,
        "model_type": self.model_cls.ag_key,  # TODO: rename to ag_key?
        "name_prefix": self.model_cls.ag_name,  # TODO: rename to ag_name?
        "model_hyperparameters": self.model_hyperparameters,
        "init_kwargs_extra": self.init_kwargs_extra,
        "fit_kwargs_extra": self.fit_kwargs_extra,
    }
169
+
170
def get_metadata_fit(self) -> dict:
    """Build the metadata that is only available from the fitted best model.

    Loads ``self.predictor.model_best`` through the trainer and reports its
    info (without feature metadata), disk usage, the CPU/GPU counts recorded
    on the model, and its fit metadata.
    """
    best_model = self.predictor._trainer.load_model(self.predictor.model_best)
    return {
        "info": best_model.get_info(include_feature_metadata=False),
        "disk_usage": best_model.disk_usage(),
        "num_cpus": best_model.fit_num_cpus,
        "num_gpus": best_model.fit_num_gpus,
        "num_cpus_child": best_model.fit_num_cpus_child,
        "num_gpus_child": best_model.fit_num_gpus_child,
        "fit_metadata": best_model.get_fit_metadata(),
    }
181
+
182
def get_metadata_failure(self) -> dict:
    """Return the predictor's model-failure report under the key ``"model_failures"``."""
    failures = self.predictor.model_failures()
    return {"model_failures": failures}
187
+
188
def get_metadata(self) -> dict:
    """Return the init-time metadata merged with the fit-time metadata.

    Keys from ``get_metadata_fit`` overwrite equally named keys from
    ``get_metadata_init``.
    """
    combined = self.get_metadata_init()
    combined.update(self.get_metadata_fit())
    return combined
194
+
195
+
196
class AGSingleBagWrapper(AGSingleWrapper):
    """Single-model wrapper that also exposes per-child artifacts of the model.

    NOTE(review): the methods read private attributes of the loaded model
    (``_child_oof``, ``_k_per_n_repeat``, ``_cv_splitters``); presumably the
    model is a bagged ensemble -- confirm against the model class.
    """

    # Capability flags; presumably consulted by the calling harness -- TODO confirm.
    can_get_per_child_oof = True
    can_get_per_child_val_idx = True

    def bag_artifact(self, X_test: pd.DataFrame):
        """Return per-child test predictions and validation indices in one dict.

        The model is loaded once and shared by both helper calls.
        """
        bagged_model = self._load_model()
        return {
            "pred_proba_test_per_child": self.get_per_child_test(X_test=X_test, model=bagged_model),
            "val_idx_per_child": self.get_per_child_val_idx(model=bagged_model),
        }

    def get_per_child_val_idx(self, model=None) -> list[np.ndarray]:
        """Return the validation indices used by each child of the model.

        Args:
            model: An already loaded model; loaded via ``_load_model`` when None.

        Returns:
            One entry per fold, each passed through ``pd.to_numeric`` with
            ``downcast="integer"``.
        """
        if model is None:
            model = self._load_model()
        X, y = self.predictor.load_data_internal()

        # TODO: Make this a bagged ensemble method
        if model._child_oof:
            # Single pseudo-fold whose validation indices are the whole index of X.
            fold_pairs = [(None, X.index.values)]
        else:
            fold_pairs = []
            for repeat_idx, n_folds in enumerate(model._k_per_n_repeat):
                splits = model._cv_splitters[repeat_idx].split(X=X, y=y)
                # Keep only the slice of splits that belongs to this repeat.
                fold_pairs += splits[repeat_idx * n_folds : (repeat_idx + 1) * n_folds]

        # Downcasting the index dtype is a memory optimisation.
        return [pd.to_numeric(val_idx, downcast="integer") for _, val_idx in fold_pairs]

    # TODO: Can avoid predicting on test twice by doing it all in one go
    def get_per_child_test(self, X_test: pd.DataFrame, model=None) -> list[np.ndarray]:
        """Return the test predictions of every child of the model as float32.

        Args:
            X_test: Raw test features; transformed through the predictor for
                this model before prediction.
            model: An already loaded model; loaded via ``_load_model`` when None.

        Returns:
            One array per child: probabilities when the model reports
            ``can_predict_proba()``, plain predictions otherwise.
        """
        if model is None:
            model = self._load_model()
        features = self.predictor.transform_features(data=X_test, model=model.name)

        predict_children = (
            model.predict_proba_children if model.can_predict_proba() else model.predict_children
        )
        # float32 halves the footprint of the stored predictions (memory opt).
        return [child_preds.astype(np.float32) for child_preds in predict_children(X=features)]
0 commit comments