from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
+    CategoricalHyperparameter
)

import numpy as np
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


+def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
+    """
+    Returns a list of embedding sizes, one per feature, selected adaptively
+    based on the training dataset.
+    Note: assumes there is at least one embed column.
+
+    Args:
+        config (Dict[str, Any]):
+            contains the hyperparameters required to calculate `num_output_dimensions`
+        num_categs_per_feature (List[int]):
+            list containing the number of categories for each feature that is to be embedded,
+            0 if the column is not an embed column
+
+    Returns:
+        List[int]:
+            list containing the output embedding size for each column,
+            1 if the column is not an embed column
+    """
+
+    max_embedding_dim = config['max_embedding_dim']
+    embed_exponent = config['embed_exponent']
+    size_factor = config['embedding_size_factor']
+    num_output_dimensions = [int(size_factor * max(
+        2,
+        min(max_embedding_dim,
+            1.6 * num_categories ** embed_exponent)))
+        if num_categories > 0 else 1
+        for num_categories in num_categs_per_feature]
+    return num_output_dimensions
+
+
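For intuition, a quick worked example of the sizing rule with the defaults defined in the search space below (`embed_exponent=0.56`, `max_embedding_dim=100`, `embedding_size_factor=1.0`):

```python
# Worked example of the sizing rule above, using the search-space defaults.
config = {'embed_exponent': 0.56, 'max_embedding_dim': 100,
          'embedding_size_factor': 1.0}

# Columns with 0 categories are not embedded and get size 1; otherwise the
# size grows roughly as 1.6 * n**0.56, clipped to the range [2, 100].
print(get_num_output_dimensions(config, [0, 2, 10, 1000, 10**6]))
# -> [1, 2, 5, 76, 100]
```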
class _LearnedEntityEmbedding(nn.Module):
    """ Learned entity embedding module for categorical features"""

@@ -35,9 +65,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n

        self.num_embed_features = self.num_categories_per_col[self.embed_features]

-        self.num_output_dimensions = [1] * num_features_excl_embed
-        self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
-                                           enumerate(self.num_embed_features)])
+        self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col)

        self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)

@@ -78,12 +106,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
        # before passing it through the model
        concat_seq = []

-        x_pointer = 0
        layer_pointer = 0
        for x_pointer, embed in enumerate(self.embed_features):
            current_feature_slice = x[:, x_pointer]
            if not embed:
-                x_pointer += 1
                concat_seq.append(current_feature_slice.view(-1, 1))
                continue
            current_feature_slice = current_feature_slice.to(torch.int)
@@ -153,28 +179,24 @@ def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_
    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
-                                                                                   value_range=(0, 1),
-                                                                                   default_value=0.5),
+        embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent",
+                                                                              value_range=(0.56,),
+                                                                              default_value=0.56),
+        max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim",
+                                                                                 value_range=(100,),
+                                                                                 default_value=100),
+        embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor",
+                                                                                     value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5),
+                                                                                     default_value=1,
+                                                                                     ),
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()
        if dataset_properties is not None:
-            for i in range(len(dataset_properties['categorical_columns'])
-                           if isinstance(dataset_properties['categorical_columns'], List) else 0):
-                # currently as we dont have information about the embedding columns
-                # we search for more dimensions than necessary. This can be solved by
-                # not having `min_unique_values_for_embedding` as a hyperparameter and
-                # instead passing it as a parameter to the feature validator, which
-                # allows us to pass embed_columns to the dataset properties.
-                # TODO: test the trade off
-                # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding`
-                # in one custom transformer. this will also allow users to use this transformer
-                # outside the pipeline
-                ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
-                                                                       value_range=dimension_reduction.value_range,
-                                                                       default_value=dimension_reduction.default_value,
-                                                                       log=dimension_reduction.log)
-                add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter)
+            if len(dataset_properties['categorical_columns']) > 0:
+                add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter)
+                add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter)
+                add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter)
+
        return cs

    @staticmethod
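For reference, the resulting space can be reproduced with plain ConfigSpace, under the assumption (which autoPyTorch's `add_hyperparameter` helper appears to satisfy) that a single-element `value_range` collapses to a constant; the names and values mirror the defaults above:

```python
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, Constant

# embed_exponent and max_embedding_dim have single-value ranges, so they act
# as constants; only embedding_size_factor is actually searched over.
cs = ConfigurationSpace()
cs.add_hyperparameters([
    Constant('embed_exponent', 0.56),
    Constant('max_embedding_dim', 100),
    CategoricalHyperparameter('embedding_size_factor',
                              choices=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
                                       1.1, 1.2, 1.3, 1.4, 1.5],
                              default_value=1.0),
])
print(cs.sample_configuration())
```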