Skip to content

Commit 1c1ff8a

Browse files
committed
rebase and fix flake
1 parent d49ed68 commit 1c1ff8a

File tree

6 files changed

+94
-55
lines changed

6 files changed

+94
-55
lines changed

autoPyTorch/api/base_task.py

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
import pandas as pd
2929

30-
from smac.runhistory.runhistory import DataOrigin, RunHistory
30+
from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue
3131
from smac.stats.stats import Stats
3232
from smac.tae import StatusType
3333

@@ -291,7 +291,10 @@ def _get_dataset_input_validator(
291291
y_train: Union[List, pd.DataFrame, np.ndarray],
292292
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
293293
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
294-
resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
294+
resampling_strategy: Optional[Union[
295+
CrossValTypes,
296+
HoldoutValTypes,
297+
NoResamplingStrategyTypes]] = None,
295298
resampling_strategy_args: Optional[Dict[str, Any]] = None,
296299
dataset_name: Optional[str] = None,
297300
) -> Tuple[BaseDataset, BaseInputValidator]:
@@ -335,7 +338,10 @@ def get_dataset(
335338
y_train: Union[List, pd.DataFrame, np.ndarray],
336339
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
337340
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
338-
resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
341+
resampling_strategy: Optional[Union[
342+
CrossValTypes,
343+
HoldoutValTypes,
344+
NoResamplingStrategyTypes]] = None,
339345
resampling_strategy_args: Optional[Dict[str, Any]] = None,
340346
dataset_name: Optional[str] = None,
341347
) -> BaseDataset:
@@ -593,18 +599,6 @@ def _load_models(self) -> bool:
593599
raise ValueError("Resampling strategy is needed to determine what models to load")
594600
self.ensemble_ = self._backend.load_ensemble(self.seed)
595601

596-
# TODO: remove this code after `fit_pipeline` is rebased.
597-
if hasattr(self, '_disable_file_output'):
598-
if isinstance(self._disable_file_output, List):
599-
disabled_file_outputs = self._disable_file_output
600-
disable_file_output = False
601-
elif isinstance(self._disable_file_output, bool):
602-
disable_file_output = self._disable_file_output
603-
disabled_file_outputs = []
604-
else:
605-
disable_file_output = False
606-
disabled_file_outputs = []
607-
608602
# If no ensemble is loaded, try to get the best performing model
609603
if not self.ensemble_:
610604
self.ensemble_ = self._load_best_individual_model()
@@ -619,7 +613,7 @@ def _load_models(self) -> bool:
619613
if len(self.cv_models_) == 0:
620614
raise ValueError('No models fitted!')
621615

622-
elif disable_file_output or 'pipeline' not in disabled_file_outputs:
616+
elif 'pipeline' not in self._disable_file_output:
623617
model_names = self._backend.list_all_models(self.seed)
624618

625619
if len(model_names) == 0:
@@ -1395,7 +1389,10 @@ def fit_pipeline(
13951389
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
13961390
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
13971391
dataset_name: Optional[str] = None,
1398-
resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes]] = None,
1392+
resampling_strategy: Optional[Union[
1393+
CrossValTypes,
1394+
HoldoutValTypes,
1395+
NoResamplingStrategyTypes]] = None,
13991396
resampling_strategy_args: Optional[Dict[str, Any]] = None,
14001397
run_time_limit_secs: int = 60,
14011398
memory_limit: Optional[int] = None,
@@ -1511,7 +1508,6 @@ def fit_pipeline(
15111508
(BaseDataset):
15121509
Dataset created from the given tensors
15131510
"""
1514-
self.dataset_name = dataset.dataset_name
15151511

15161512
if dataset is None:
15171513
if (

autoPyTorch/api/tabular_classification.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,10 @@ def _get_dataset_input_validator(
156156
y_train: Union[List, pd.DataFrame, np.ndarray],
157157
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
158158
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
159-
resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
159+
resampling_strategy: Optional[Union[
160+
CrossValTypes,
161+
HoldoutValTypes,
162+
NoResamplingStrategyTypes]] = None,
160163
resampling_strategy_args: Optional[Dict[str, Any]] = None,
161164
dataset_name: Optional[str] = None,
162165
) -> Tuple[TabularDataset, TabularInputValidator]:
@@ -371,19 +374,6 @@ def search(
371374
self
372375
373376
"""
374-
if dataset_name is None:
375-
dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
376-
377-
# we have to create a logger for at this point for the validator
378-
self._logger = self._get_logger(dataset_name)
379-
380-
# Create a validator object to make sure that the data provided by
381-
# the user matches the autopytorch requirements
382-
self.InputValidator = TabularInputValidator(
383-
is_classification=True,
384-
logger_port=self._logger_port,
385-
)
386-
387377
self.dataset, self.InputValidator = self._get_dataset_input_validator(
388378
X_train=X_train,
389379
y_train=y_train,
@@ -401,9 +391,6 @@ def search(
401391
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
402392
)
403393

404-
if self.dataset is None:
405-
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
406-
407394
return self._search(
408395
dataset=self.dataset,
409396
optimize_metric=optimize_metric,

autoPyTorch/api/tabular_regression.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,10 @@ def _get_dataset_input_validator(
156156
y_train: Union[List, pd.DataFrame, np.ndarray],
157157
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
158158
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
159-
resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
159+
resampling_strategy: Optional[Union[
160+
CrossValTypes,
161+
HoldoutValTypes,
162+
NoResamplingStrategyTypes]] = None,
160163
resampling_strategy_args: Optional[Dict[str, Any]] = None,
161164
dataset_name: Optional[str] = None,
162165
) -> Tuple[TabularDataset, TabularInputValidator]:
@@ -386,9 +389,6 @@ def search(
386389
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
387390
)
388391

389-
if self.dataset is None:
390-
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
391-
392392
return self._search(
393393
dataset=self.dataset,
394394
optimize_metric=optimize_metric,

autoPyTorch/evaluation/fit_evaluator.py

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
AbstractEvaluator,
1717
fit_and_suppress_warnings
1818
)
19+
from autoPyTorch.evaluation.utils import DisableFileOutputParameters
1920
from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
2021
from autoPyTorch.utils.common import subsampler
2122
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
@@ -33,7 +34,7 @@ def __init__(self, backend: Backend, queue: Queue,
3334
num_run: Optional[int] = None,
3435
include: Optional[Dict[str, Any]] = None,
3536
exclude: Optional[Dict[str, Any]] = None,
36-
disable_file_output: Union[bool, List] = False,
37+
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
3738
init_params: Optional[Dict[str, Any]] = None,
3839
logger_port: Optional[int] = None,
3940
keep_models: Optional[bool] = None,
@@ -241,14 +242,11 @@ def file_output(
241242
)
242243

243244
# Abort if we don't want to output anything.
244-
if hasattr(self, 'disable_file_output'):
245-
if self.disable_file_output:
246-
return None, {}
247-
else:
248-
self.disabled_file_outputs = []
245+
if 'all' in self.disable_file_output:
246+
return None, {}
249247

250-
if hasattr(self, 'pipeline') and self.pipeline is not None:
251-
if 'pipeline' not in self.disabled_file_outputs:
248+
if getattr(self, 'pipeline', None) is not None:
249+
if 'pipeline' not in self.disable_file_output:
252250
pipeline = self.pipeline
253251
else:
254252
pipeline = None
@@ -265,11 +263,11 @@ def file_output(
265263
ensemble_predictions=None,
266264
valid_predictions=(
267265
Y_valid_pred if 'y_valid' not in
268-
self.disabled_file_outputs else None
266+
self.disable_file_output else None
269267
),
270268
test_predictions=(
271269
Y_test_pred if 'y_test' not in
272-
self.disabled_file_outputs else None
270+
self.disable_file_output else None
273271
),
274272
)
275273

@@ -287,8 +285,8 @@ def eval_function(
287285
num_run: int,
288286
include: Optional[Dict[str, Any]],
289287
exclude: Optional[Dict[str, Any]],
290-
disable_file_output: Union[bool, List],
291288
output_y_hat_optimization: bool = False,
289+
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
292290
pipeline_config: Optional[Dict[str, Any]] = None,
293291
budget_type: str = None,
294292
init_params: Optional[Dict[str, Any]] = None,
@@ -297,14 +295,75 @@ def eval_function(
297295
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
298296
instance: str = None,
299297
) -> None:
298+
"""
299+
This closure allows the communication between the ExecuteTaFuncWithQueue and the
300+
pipeline trainer (FitEvaluator).
301+
302+
Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally
303+
builds a FitEvaluator. The FitEvaluator builds a pipeline, stores the output files
304+
to disk via the backend, and puts the performance result of the run in the queue.
305+
306+
307+
Args:
308+
backend (Backend):
309+
An object to interface with the disk storage. In particular, it gives
310+
access to the train and test datasets
311+
queue (Queue):
312+
Each worker available will instantiate an evaluator, and after completion,
313+
it will return the evaluation result via a multiprocessing queue
314+
metric (autoPyTorchMetric):
315+
A scorer object that is able to evaluate how good a pipeline was fit. It
316+
is a wrapper on top of the actual score method (a wrapper on top of scikit
317+
learn accuracy for example) that formats the predictions accordingly.
318+
budget (float):
319+
The amount of epochs/time a configuration is allowed to run.
320+
budget_type (str):
321+
The budget type, which can be epochs or time
322+
pipeline_config (Optional[Dict[str, Any]]):
323+
Defines the content of the pipeline being evaluated. For example, it
324+
contains pipeline specific settings like logging name, or whether or not
325+
to use tensorboard.
326+
config (Union[int, str, Configuration]):
327+
Determines the pipeline to be constructed.
328+
seed (int):
329+
An integer that allows for reproducibility of results
330+
output_y_hat_optimization (bool):
331+
Whether this worker should output the target predictions, so that they are
332+
stored on disk. Fundamentally, the resampling strategy might shuffle the
333+
Y_train targets, so we store the split in order to re-use them for ensemble
334+
selection.
335+
num_run (Optional[int]):
336+
An identifier of the current configuration being fit. This number is unique per
337+
configuration.
338+
include (Optional[Dict[str, Any]]):
339+
An optional dictionary to include components of the pipeline steps.
340+
exclude (Optional[Dict[str, Any]]):
341+
An optional dictionary to exclude components of the pipeline steps.
342+
disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]):
343+
By default, the model, its predictions and other metadata are stored on disk
344+
for each finished configuration. This argument allows the user to skip
345+
saving certain file types, for example the model, to disk.
346+
init_params (Optional[Dict[str, Any]]):
347+
Optional argument that is passed to each pipeline step. It is the equivalent of
348+
kwargs for the pipeline steps.
349+
logger_port (Optional[int]):
350+
Logging is performed using a socket-server scheme to be robust against many
351+
parallel entities that want to write to the same file. This integer states the
352+
socket port for the communication channel. If None is provided, a traditional
353+
logger is used.
354+
instance (str):
355+
An instance on which to evaluate the current pipeline. By default we work
356+
with a single instance, being the provided X_train, y_train of a single dataset.
357+
This instance is a compatibility argument for SMAC, that is capable of working
358+
with multiple datasets at the same time.
359+
"""
300360
evaluator = FitEvaluator(
301361
backend=backend,
302362
queue=queue,
303363
metric=metric,
304364
configuration=config,
305365
seed=seed,
306366
num_run=num_run,
307-
output_y_hat_optimization=output_y_hat_optimization,
308367
include=include,
309368
exclude=exclude,
310369
disable_file_output=disable_file_output,

autoPyTorch/evaluation/tae.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@
3030
HoldoutValTypes,
3131
NoResamplingStrategyTypes
3232
)
33-
import autoPyTorch.evaluation.fit_evaluator
34-
import autoPyTorch.evaluation.train_evaluator
35-
from autoPyTorch.automl_common.common.utils.backend import Backend
3633
from autoPyTorch.evaluation.utils import (
3734
DisableFileOutputParameters,
3835
empty_queue,

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,10 +418,10 @@ def eval_function(
418418
budget: float,
419419
config: Optional[Configuration],
420420
seed: int,
421-
output_y_hat_optimization: bool,
422421
num_run: int,
423422
include: Optional[Dict[str, Any]],
424423
exclude: Optional[Dict[str, Any]],
424+
output_y_hat_optimization: bool,
425425
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
426426
pipeline_config: Optional[Dict[str, Any]] = None,
427427
budget_type: str = None,

0 commit comments

Comments
 (0)