Skip to content

Commit ea9cceb

Browse files
authored
Merge branch 'development' into reg_cocktails
2 parents cca08d5 + bf264d6 commit ea9cceb

29 files changed

+1289
-203
lines changed

autoPyTorch/configs/greedy_portfolio.json

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[{"data_loader:batch_size": 60,
22
"encoder:__choice__": "OneHotEncoder",
3+
"coalescer:__choice__": "NoCoalescer",
34
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
4-
"imputer:categorical_strategy": "most_frequent",
55
"imputer:numerical_strategy": "mean",
66
"lr_scheduler:__choice__": "CosineAnnealingLR",
77
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -31,8 +31,8 @@
3131
"network_backbone:ShapedMLPBackbone:max_dropout": 0.023271935735825866},
3232
{"data_loader:batch_size": 255,
3333
"encoder:__choice__": "OneHotEncoder",
34+
"coalescer:__choice__": "NoCoalescer",
3435
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
35-
"imputer:categorical_strategy": "most_frequent",
3636
"imputer:numerical_strategy": "mean",
3737
"lr_scheduler:__choice__": "CosineAnnealingLR",
3838
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -65,8 +65,8 @@
6565
"network_backbone:ShapedResNetBackbone:max_dropout": 0.7662454727603789},
6666
{"data_loader:batch_size": 165,
6767
"encoder:__choice__": "OneHotEncoder",
68+
"coalescer:__choice__": "NoCoalescer",
6869
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
69-
"imputer:categorical_strategy": "most_frequent",
7070
"imputer:numerical_strategy": "mean",
7171
"lr_scheduler:__choice__": "CosineAnnealingLR",
7272
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -96,8 +96,8 @@
9696
"network_head:fully_connected:units_layer_1": 128},
9797
{"data_loader:batch_size": 299,
9898
"encoder:__choice__": "OneHotEncoder",
99+
"coalescer:__choice__": "NoCoalescer",
99100
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
100-
"imputer:categorical_strategy": "most_frequent",
101101
"imputer:numerical_strategy": "mean",
102102
"lr_scheduler:__choice__": "CosineAnnealingLR",
103103
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -128,8 +128,8 @@
128128
"network_head:fully_connected:units_layer_1": 128},
129129
{"data_loader:batch_size": 183,
130130
"encoder:__choice__": "OneHotEncoder",
131+
"coalescer:__choice__": "NoCoalescer",
131132
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
132-
"imputer:categorical_strategy": "most_frequent",
133133
"imputer:numerical_strategy": "mean",
134134
"lr_scheduler:__choice__": "CosineAnnealingLR",
135135
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -162,8 +162,8 @@
162162
"network_backbone:ShapedResNetBackbone:max_dropout": 0.27204101593048097},
163163
{"data_loader:batch_size": 21,
164164
"encoder:__choice__": "OneHotEncoder",
165+
"coalescer:__choice__": "NoCoalescer",
165166
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
166-
"imputer:categorical_strategy": "most_frequent",
167167
"imputer:numerical_strategy": "mean",
168168
"lr_scheduler:__choice__": "CosineAnnealingLR",
169169
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -191,8 +191,8 @@
191191
"network_head:fully_connected:units_layer_1": 128},
192192
{"data_loader:batch_size": 159,
193193
"encoder:__choice__": "OneHotEncoder",
194+
"coalescer:__choice__": "NoCoalescer",
194195
"feature_preprocessor:__choice__": "TruncatedSVD",
195-
"imputer:categorical_strategy": "most_frequent",
196196
"imputer:numerical_strategy": "mean",
197197
"lr_scheduler:__choice__": "CosineAnnealingLR",
198198
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -221,8 +221,8 @@
221221
"network_head:fully_connected:units_layer_1": 128},
222222
{"data_loader:batch_size": 442,
223223
"encoder:__choice__": "OneHotEncoder",
224+
"coalescer:__choice__": "NoCoalescer",
224225
"feature_preprocessor:__choice__": "TruncatedSVD",
225-
"imputer:categorical_strategy": "most_frequent",
226226
"imputer:numerical_strategy": "mean",
227227
"lr_scheduler:__choice__": "CosineAnnealingLR",
228228
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -254,8 +254,8 @@
254254
"network_head:fully_connected:units_layer_1": 128},
255255
{"data_loader:batch_size": 140,
256256
"encoder:__choice__": "OneHotEncoder",
257+
"coalescer:__choice__": "NoCoalescer",
257258
"feature_preprocessor:__choice__": "TruncatedSVD",
258-
"imputer:categorical_strategy": "most_frequent",
259259
"imputer:numerical_strategy": "mean",
260260
"lr_scheduler:__choice__": "CosineAnnealingLR",
261261
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -287,8 +287,8 @@
287287
"network_head:fully_connected:units_layer_1": 128},
288288
{"data_loader:batch_size": 48,
289289
"encoder:__choice__": "OneHotEncoder",
290+
"coalescer:__choice__": "NoCoalescer",
290291
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
291-
"imputer:categorical_strategy": "most_frequent",
292292
"imputer:numerical_strategy": "mean",
293293
"lr_scheduler:__choice__": "CosineAnnealingLR",
294294
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -315,8 +315,8 @@
315315
"network_head:fully_connected:units_layer_1": 128},
316316
{"data_loader:batch_size": 168,
317317
"encoder:__choice__": "OneHotEncoder",
318+
"coalescer:__choice__": "NoCoalescer",
318319
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
319-
"imputer:categorical_strategy": "most_frequent",
320320
"imputer:numerical_strategy": "mean",
321321
"lr_scheduler:__choice__": "CosineAnnealingLR",
322322
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -348,8 +348,8 @@
348348
"network_backbone:ShapedResNetBackbone:max_dropout": 0.8992826006547855},
349349
{"data_loader:batch_size": 21,
350350
"encoder:__choice__": "OneHotEncoder",
351+
"coalescer:__choice__": "NoCoalescer",
351352
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
352-
"imputer:categorical_strategy": "most_frequent",
353353
"imputer:numerical_strategy": "mean",
354354
"lr_scheduler:__choice__": "CosineAnnealingLR",
355355
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -377,8 +377,8 @@
377377
"network_head:fully_connected:units_layer_1": 128},
378378
{"data_loader:batch_size": 163,
379379
"encoder:__choice__": "OneHotEncoder",
380+
"coalescer:__choice__": "NoCoalescer",
380381
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
381-
"imputer:categorical_strategy": "most_frequent",
382382
"imputer:numerical_strategy": "mean",
383383
"lr_scheduler:__choice__": "CosineAnnealingLR",
384384
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -410,8 +410,8 @@
410410
"network_backbone:ShapedResNetBackbone:max_dropout": 0.6341848343636569},
411411
{"data_loader:batch_size": 150,
412412
"encoder:__choice__": "OneHotEncoder",
413+
"coalescer:__choice__": "NoCoalescer",
413414
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
414-
"imputer:categorical_strategy": "most_frequent",
415415
"imputer:numerical_strategy": "mean",
416416
"lr_scheduler:__choice__": "CosineAnnealingLR",
417417
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -444,8 +444,8 @@
444444
"network_backbone:ShapedResNetBackbone:max_dropout": 0.7133813761319248},
445445
{"data_loader:batch_size": 151,
446446
"encoder:__choice__": "OneHotEncoder",
447+
"coalescer:__choice__": "NoCoalescer",
447448
"feature_preprocessor:__choice__": "TruncatedSVD",
448-
"imputer:categorical_strategy": "most_frequent",
449449
"imputer:numerical_strategy": "mean",
450450
"lr_scheduler:__choice__": "CosineAnnealingLR",
451451
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -474,8 +474,8 @@
474474
"network_head:fully_connected:units_layer_1": 128},
475475
{"data_loader:batch_size": 42,
476476
"encoder:__choice__": "OneHotEncoder",
477+
"coalescer:__choice__": "NoCoalescer",
477478
"feature_preprocessor:__choice__": "TruncatedSVD",
478-
"imputer:categorical_strategy": "most_frequent",
479479
"imputer:numerical_strategy": "mean",
480480
"lr_scheduler:__choice__": "CosineAnnealingLR",
481481
"network_backbone:__choice__": "ShapedResNetBackbone",

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from typing import Any, Dict, List, Optional, Union
1+
from typing import Any, Dict, List, Optional, Tuple, Union
22

33
import numpy as np
44

5+
from sklearn.base import BaseEstimator
56
from sklearn.compose import ColumnTransformer
67
from sklearn.pipeline import make_pipeline
78

@@ -49,18 +50,25 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
4950
"""
5051

5152
self.check_requirements(X, y)
52-
numerical_pipeline = 'passthrough'
53-
categorical_pipeline = 'passthrough'
5453

5554
preprocessors = get_tabular_preprocessers(X)
56-
if len(X['dataset_properties']['numerical_columns']):
55+
column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
56+
if len(preprocessors['numerical']) > 0:
5757
numerical_pipeline = make_pipeline(*preprocessors['numerical'])
58-
if len(X['dataset_properties']['categorical_columns']):
58+
column_transformers.append(
59+
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
60+
)
61+
if len(preprocessors['categorical']) > 0:
5962
categorical_pipeline = make_pipeline(*preprocessors['categorical'])
60-
61-
self.preprocessor = ColumnTransformer([
62-
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
63-
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
63+
column_transformers.append(
64+
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
65+
)
66+
67+
# in case the preprocessing steps are disabled
68+
# i.e., NoEncoder for categorical, we want to
69+
# let the data in categorical columns pass through
70+
self.preprocessor = ColumnTransformer(
71+
column_transformers,
6472
remainder='passthrough'
6573
)
6674

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from typing import Any, Dict, Optional, Union
2+
3+
from ConfigSpace.configuration_space import ConfigurationSpace
4+
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
5+
6+
import numpy as np
7+
8+
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
9+
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
10+
from autoPyTorch.utils.implementations import MinorityCoalesceTransformer
11+
12+
13+
class MinorityCoalescer(BaseCoalescer):
14+
"""Group together categories whose occurrence is less than a specified min_frac."""
15+
def __init__(self, min_frac: float, random_state: np.random.RandomState):
16+
super().__init__()
17+
self.min_frac = min_frac
18+
self.random_state = random_state
19+
20+
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
21+
self.check_requirements(X, y)
22+
self.preprocessor['categorical'] = MinorityCoalesceTransformer(min_frac=self.min_frac)
23+
return self
24+
25+
@staticmethod
26+
def get_hyperparameter_search_space(
27+
dataset_properties: Optional[Dict[str, Any]] = None,
28+
min_frac: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_frac',
29+
value_range=(1e-4, 0.5),
30+
default_value=1e-2,
31+
),
32+
) -> ConfigurationSpace:
33+
34+
cs = ConfigurationSpace()
35+
add_hyperparameter(cs, min_frac, UniformFloatHyperparameter)
36+
return cs
37+
38+
@staticmethod
39+
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
40+
return {
41+
'shortname': 'MinorityCoalescer',
42+
'name': 'MinorityCoalescer',
43+
'handles_sparse': False
44+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from typing import Any, Dict, Optional, Union
2+
3+
import numpy as np
4+
5+
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
6+
7+
8+
class NoCoalescer(BaseCoalescer):
9+
def __init__(self, random_state: np.random.RandomState):
10+
super().__init__()
11+
self.random_state = random_state
12+
self._processing = False
13+
14+
def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
15+
"""
16+
As no coalescing happens, only check the requirements.
17+
18+
Args:
19+
X (Dict[str, Any]):
20+
fit dictionary
21+
y (Optional[Any]):
22+
Parameter to comply with scikit-learn API. Not used.
23+
24+
Returns:
25+
instance of self
26+
"""
27+
self.check_requirements(X, y)
28+
29+
return self
30+
31+
@staticmethod
32+
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
33+
return {
34+
'shortname': 'NoCoalescer',
35+
'name': 'NoCoalescer',
36+
'handles_sparse': True
37+
}

0 commit comments

Comments
 (0)