
Commit a679b09

[ADD] dataset compression (#387)

* Initial implementation without tests
* add tests and make necessary changes
* improve documentation
* fix tests
* Apply suggestions from code review
* undo change as it causes tests to fail
* change name from InputValidator to input_validator
* extract statements to methods
* refactor code
* check if mapping is the same as expected
* update precision reduction for dataframes and tests
* fix flake

Co-authored-by: nabenabe0928 <[email protected]>
1 parent dafd480 commit a679b09

8 files changed: +675, -37 lines

autoPyTorch/api/base_task.py

Lines changed: 1 addition & 1 deletion
@@ -243,7 +243,7 @@ def __init__(
         if self.n_jobs == 1:
             self._multiprocessing_context = 'fork'
 
-        self.InputValidator: Optional[BaseInputValidator] = None
+        self.input_validator: Optional[BaseInputValidator] = None
 
         self.search_space_updates = search_space_updates
         if search_space_updates is not None:
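The change above renames a public attribute on the task object. As a quick, illustrative sketch of the effect on downstream code (assuming an already constructed task object named `api`):

    # before this commit
    validator = api.InputValidator
    # after this commit
    validator = api.input_validator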

autoPyTorch/api/tabular_classification.py

Lines changed: 53 additions & 15 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
 import numpy as np
 
@@ -11,6 +11,9 @@
     TASK_TYPES_TO_STRING,
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
+from autoPyTorch.data.utils import (
+    get_dataset_compression_mapping
+)
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     HoldoutValTypes,
@@ -163,6 +166,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
+        dataset_compression: Optional[Mapping[str, Any]] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
         """
         Returns an object of `TabularDataset` and an object of
@@ -199,26 +203,27 @@ def _get_dataset_input_validator(
 
         # Create a validator object to make sure that the data provided by
         # the user matches the autopytorch requirements
-        InputValidator = TabularInputValidator(
+        input_validator = TabularInputValidator(
             is_classification=True,
             logger_port=self._logger_port,
+            dataset_compression=dataset_compression
         )
 
         # Fit a input validator to check the provided data
         # Also, an encoder is fit to both train and test data,
         # to prevent unseen categories during inference
-        InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
+        input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
 
         dataset = TabularDataset(
             X=X_train, Y=y_train,
             X_test=X_test, Y_test=y_test,
-            validator=InputValidator,
+            validator=input_validator,
             resampling_strategy=resampling_strategy,
             resampling_strategy_args=resampling_strategy_args,
             dataset_name=dataset_name
         )
 
-        return dataset, InputValidator
+        return dataset, input_validator
 
     def search(
         self,
@@ -234,14 +239,15 @@ def search(
         total_walltime_limit: int = 100,
         func_eval_time_limit_secs: Optional[int] = None,
         enable_traditional_pipeline: bool = True,
-        memory_limit: Optional[int] = 4096,
+        memory_limit: int = 4096,
         smac_scenario_args: Optional[Dict[str, Any]] = None,
         get_smac_object_callback: Optional[Callable] = None,
         all_supported_metrics: bool = True,
         precision: int = 32,
         disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
         load_models: bool = True,
         portfolio_selection: Optional[str] = None,
+        dataset_compression: Union[Mapping[str, Any], bool] = False,
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -310,7 +316,7 @@ def search(
                 feature by turning this flag to False. All machine learning
                 algorithms that are fitted during search() are considered for
                 ensemble building.
-            memory_limit (Optional[int]: default=4096):
+            memory_limit (int: default=4096):
                 Memory limit in MB for the machine learning algorithm.
                 Autopytorch will stop fitting the machine learning algorithm
                 if it tries to allocate more than memory_limit MB. If None
@@ -368,20 +374,52 @@ def search(
                 Additionally, the keyword 'greedy' is supported,
                 which would use the default portfolio from
                 `AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
+            dataset_compression: Union[bool, Mapping[str, Any]] = True
+                We compress datasets so that they fit into some predefined amount of memory.
+                **NOTE**
+
+                Default configuration when left as ``True``:
+                .. code-block:: python
+                    {
+                        "memory_allocation": 0.1,
+                        "methods": ["precision"]
+                    }
+                You can also pass your own configuration with the same keys and choosing
+                from the available ``"methods"``.
+                The available options are described here:
+                **memory_allocation**
+                    By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This
+                    float value can be set with ``"memory_allocation": 0.1``. We also allow for
+                    specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``.
+                    The memory used by the dataset is checked after each reduction method is
+                    performed. If the dataset fits into the allocated memory, any further methods
+                    listed in ``"methods"`` will not be performed.
+
+                **methods**
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` - We reduce floating point precision as follows:
+                        * ``np.float128 -> np.float64``
+                        * ``np.float96 -> np.float64``
+                        * ``np.float64 -> np.float32``
+                    * pandas dataframes are reduced using the downcast option of `pd.to_numeric`
+                      to the lowest possible precision.
 
         Returns:
             self
 
         """
+        self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression)
 
-        self.dataset, self.InputValidator = self._get_dataset_input_validator(
+        self.dataset, self.input_validator = self._get_dataset_input_validator(
             X_train=X_train,
             y_train=y_train,
             X_test=X_test,
             y_test=y_test,
             resampling_strategy=self.resampling_strategy,
             resampling_strategy_args=self.resampling_strategy_args,
-            dataset_name=dataset_name)
+            dataset_name=dataset_name,
+            dataset_compression=self._dataset_compression)
 
         return self._search(
             dataset=self.dataset,
@@ -418,28 +456,28 @@ def predict(
         Returns:
             Array with estimator predictions.
         """
-        if self.InputValidator is None or not self.InputValidator._is_fitted:
+        if self.input_validator is None or not self.input_validator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")
 
-        X_test = self.InputValidator.feature_validator.transform(X_test)
+        X_test = self.input_validator.feature_validator.transform(X_test)
         predicted_probabilities = super().predict(X_test, batch_size=batch_size,
                                                   n_jobs=n_jobs)
 
-        if self.InputValidator.target_validator.is_single_column_target():
+        if self.input_validator.target_validator.is_single_column_target():
             predicted_indexes = np.argmax(predicted_probabilities, axis=1)
         else:
             predicted_indexes = (predicted_probabilities > 0.5).astype(int)
 
         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.InputValidator.target_validator.inverse_transform(predicted_indexes)
+        return self.input_validator.target_validator.inverse_transform(predicted_indexes)
 
     def predict_proba(self,
                       X_test: Union[np.ndarray, pd.DataFrame, List],
                       batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
-        if self.InputValidator is None or not self.InputValidator._is_fitted:
+        if self.input_validator is None or not self.input_validator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")
-        X_test = self.InputValidator.feature_validator.transform(X_test)
+        X_test = self.input_validator.feature_validator.transform(X_test)
         return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)
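As a rough usage sketch of the argument added above (not part of this commit's diff; the surrounding argument values and the chosen metric are illustrative assumptions only), compression is enabled through the new `dataset_compression` parameter of `search()`:

    import numpy as np
    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    X_train = np.random.rand(1000, 20).astype(np.float64)
    y_train = np.random.randint(0, 2, size=1000)

    api = TabularClassificationTask()
    api.search(
        X_train=X_train,
        y_train=y_train,
        optimize_metric='accuracy',
        memory_limit=4096,
        total_walltime_limit=100,
        # True falls back to the default {"memory_allocation": 0.1, "methods": ["precision"]};
        # a mapping with the same keys overrides it, e.g. allow 20% of memory_limit:
        dataset_compression={"memory_allocation": 0.2, "methods": ["precision"]},
    )

Passing `True` instead of a mapping would select the default configuration shown in the docstring, while the default of `False` leaves the dataset uncompressed.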

autoPyTorch/api/tabular_regression.py

Lines changed: 52 additions & 12 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
 import numpy as np
 
@@ -11,6 +11,9 @@
     TASK_TYPES_TO_STRING
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
+from autoPyTorch.data.utils import (
+    get_dataset_compression_mapping
+)
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     HoldoutValTypes,
@@ -164,6 +167,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
+        dataset_compression: Optional[Mapping[str, Any]] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
         """
         Returns an object of `TabularDataset` and an object of
@@ -200,26 +204,27 @@ def _get_dataset_input_validator(
 
         # Create a validator object to make sure that the data provided by
         # the user matches the autopytorch requirements
-        InputValidator = TabularInputValidator(
+        input_validator = TabularInputValidator(
             is_classification=False,
             logger_port=self._logger_port,
+            dataset_compression=dataset_compression
         )
 
         # Fit a input validator to check the provided data
         # Also, an encoder is fit to both train and test data,
         # to prevent unseen categories during inference
-        InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
+        input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
 
         dataset = TabularDataset(
             X=X_train, Y=y_train,
             X_test=X_test, Y_test=y_test,
-            validator=InputValidator,
+            validator=input_validator,
             resampling_strategy=resampling_strategy,
             resampling_strategy_args=resampling_strategy_args,
             dataset_name=dataset_name
         )
 
-        return dataset, InputValidator
+        return dataset, input_validator
 
     def search(
         self,
@@ -235,14 +240,15 @@ def search(
         total_walltime_limit: int = 100,
         func_eval_time_limit_secs: Optional[int] = None,
         enable_traditional_pipeline: bool = True,
-        memory_limit: Optional[int] = 4096,
+        memory_limit: int = 4096,
         smac_scenario_args: Optional[Dict[str, Any]] = None,
         get_smac_object_callback: Optional[Callable] = None,
         all_supported_metrics: bool = True,
         precision: int = 32,
         disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
         load_models: bool = True,
         portfolio_selection: Optional[str] = None,
+        dataset_compression: Union[Mapping[str, Any], bool] = False,
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -311,7 +317,7 @@ def search(
                 feature by turning this flag to False. All machine learning
                 algorithms that are fitted during search() are considered for
                 ensemble building.
-            memory_limit (Optional[int]: default=4096):
+            memory_limit (int: default=4096):
                 Memory limit in MB for the machine learning algorithm.
                 Autopytorch will stop fitting the machine learning algorithm
                 if it tries to allocate more than memory_limit MB. If None
@@ -369,19 +375,53 @@ def search(
                 Additionally, the keyword 'greedy' is supported,
                 which would use the default portfolio from
                 `AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
+            dataset_compression: Union[bool, Mapping[str, Any]] = True
+                We compress datasets so that they fit into some predefined amount of memory.
+                **NOTE**
+
+                Default configuration when left as ``True``:
+                .. code-block:: python
+                    {
+                        "memory_allocation": 0.1,
+                        "methods": ["precision"]
+                    }
+                You can also pass your own configuration with the same keys and choosing
+                from the available ``"methods"``.
+                The available options are described here:
+                **memory_allocation**
+                    By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This
+                    float value can be set with ``"memory_allocation": 0.1``. We also allow for
+                    specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``.
+                    The memory used by the dataset is checked after each reduction method is
+                    performed. If the dataset fits into the allocated memory, any further methods
+                    listed in ``"methods"`` will not be performed.
+
+                **methods**
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` - We reduce floating point precision as follows:
+                        * ``np.float128 -> np.float64``
+                        * ``np.float96 -> np.float64``
+                        * ``np.float64 -> np.float32``
+                    * pandas dataframes are reduced using the downcast option of `pd.to_numeric`
+                      to the lowest possible precision.
 
         Returns:
             self
 
         """
-        self.dataset, self.InputValidator = self._get_dataset_input_validator(
+
+        self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression)
+
+        self.dataset, self.input_validator = self._get_dataset_input_validator(
             X_train=X_train,
             y_train=y_train,
             X_test=X_test,
             y_test=y_test,
             resampling_strategy=self.resampling_strategy,
             resampling_strategy_args=self.resampling_strategy_args,
-            dataset_name=dataset_name)
+            dataset_name=dataset_name,
+            dataset_compression=self._dataset_compression)
 
         return self._search(
             dataset=self.dataset,
@@ -408,14 +448,14 @@ def predict(
             batch_size: Optional[int] = None,
             n_jobs: int = 1
     ) -> np.ndarray:
-        if self.InputValidator is None or not self.InputValidator._is_fitted:
+        if self.input_validator is None or not self.input_validator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")
 
-        X_test = self.InputValidator.feature_validator.transform(X_test)
+        X_test = self.input_validator.feature_validator.transform(X_test)
         predicted_values = super().predict(X_test, batch_size=batch_size,
                                            n_jobs=n_jobs)
 
         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.InputValidator.target_validator.inverse_transform(predicted_values)
+        return self.input_validator.target_validator.inverse_transform(predicted_values)
