
Commit 8e15eec

Reg cocktails apt1.0+reg cocktails pytorch embedding reduced (#454)
* reduce number of hyperparameters for pytorch embedding
* remove todos for the preprocessing PR, and apply suggestion from code review
* remove unwanted exclude in test
1 parent d58dd9d commit 8e15eec

File tree (5 files changed: +55 −37 lines)

autoPyTorch/api/base_task.py
autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py
test/test_pipeline/components/setup/test_setup_networks.py
test/test_pipeline/test_tabular_regression.py


autoPyTorch/api/base_task.py

Lines changed: 4 additions & 0 deletions
@@ -273,6 +273,10 @@ def build_pipeline(
     ) -> BasePipeline:
         """
         Build pipeline according to current task
+        and for the passed dataset properties
+
+        Args:
+            dataset_properties (Dict[str, Any]):
                 Characteristics of the dataset to guide the pipeline
                 choices of components
             include_components (Optional[Dict[str, Any]]):

autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py

Lines changed: 46 additions & 24 deletions
@@ -4,6 +4,8 @@
 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
     UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
+    CategoricalHyperparameter
 )

 import numpy as np
@@ -16,6 +18,34 @@
 from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


+def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
+    """
+    Returns a list of embedding sizes for each categorical variable.
+    Selects these adaptively based on the training dataset.
+    Note: Assumes there is at least one embedded feature.
+    Args:
+        config (Dict[str, Any]):
+            contains the hyperparameters required to calculate the `num_output_dimensions`
+        num_categs_per_feature (List[int]):
+            list containing the number of categories for each feature that is to be embedded,
+            0 if the column is not an embed column
+    Returns:
+        List[int]:
+            list containing the output embedding size for each column,
+            1 if the column is not an embed column
+    """
+
+    max_embedding_dim = config['max_embedding_dim']
+    embed_exponent = config['embed_exponent']
+    size_factor = config['embedding_size_factor']
+    num_output_dimensions = [int(size_factor * max(
+        2,
+        min(max_embedding_dim,
+            1.6 * num_categories**embed_exponent)))
+        if num_categories > 0 else 1 for num_categories in num_categs_per_feature]
+    return num_output_dimensions
+
+
 class _LearnedEntityEmbedding(nn.Module):
     """ Learned entity embedding module for categorical features"""

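To make the new sizing rule concrete: a minimal sketch (not part of the commit) that reproduces the arithmetic of get_num_output_dimensions with the default hyperparameter values; the category counts below are invented for illustration.

config = {'max_embedding_dim': 100, 'embed_exponent': 0.56, 'embedding_size_factor': 1.0}

def embed_size(num_categories: int) -> int:
    if num_categories == 0:  # not an embed column
        return 1
    return int(config['embedding_size_factor'] * max(
        2, min(config['max_embedding_dim'],
               1.6 * num_categories ** config['embed_exponent'])))

print([embed_size(n) for n in (0, 3, 50, 10000)])  # -> [1, 2, 14, 100]

The cap only bites for very high-cardinality columns (10000 categories would otherwise get 278 dimensions), while small columns are floored at 2 dimensions.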

@@ -35,9 +65,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n

         self.num_embed_features = self.num_categories_per_col[self.embed_features]

-        self.num_output_dimensions = [1] * num_features_excl_embed
-        self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
-                                           enumerate(self.num_embed_features)])
+        self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col)

         self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)

@@ -78,12 +106,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # before passing it through the model
         concat_seq = []

-        x_pointer = 0
         layer_pointer = 0
         for x_pointer, embed in enumerate(self.embed_features):
             current_feature_slice = x[:, x_pointer]
             if not embed:
-                x_pointer += 1
                 concat_seq.append(current_feature_slice.view(-1, 1))
                 continue
             current_feature_slice = current_feature_slice.to(torch.int)
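Aside (not in the diff): the removed x_pointer lines were dead bookkeeping, since enumerate reassigns x_pointer on every iteration and the manual increment never carried over. A self-contained sketch of the surviving pattern; the toy tensor and embed mask are invented for illustration.

import torch

x = torch.tensor([[0.0, 2.0, 1.5]])    # one row, three feature columns
embed_features = [False, True, False]  # only column 1 is embedded

for x_pointer, embed in enumerate(embed_features):
    current_feature_slice = x[:, x_pointer]
    if not embed:
        # enumerate already advances x_pointer; no manual `x_pointer += 1` needed
        print(x_pointer, 'passthrough', current_feature_slice.view(-1, 1))
        continue
    print(x_pointer, 'embed', current_feature_slice.to(torch.int))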
@@ -153,28 +179,24 @@ def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_
     @staticmethod
     def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
-                                                                                   value_range=(0, 1),
-                                                                                   default_value=0.5),
+        embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent",
+                                                                              value_range=(0.56,),
+                                                                              default_value=0.56),
+        max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim",
+                                                                                 value_range=(100,),
+                                                                                 default_value=100),
+        embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor",
+                                                                                     value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5),
+                                                                                     default_value=1,
+                                                                                     ),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
         if dataset_properties is not None:
-            for i in range(len(dataset_properties['categorical_columns'])
-                           if isinstance(dataset_properties['categorical_columns'], List) else 0):
-                # currently as we dont have information about the embedding columns
-                # we search for more dimensions than necessary. This can be solved by
-                # not having `min_unique_values_for_embedding` as a hyperparameter and
-                # instead passing it as a parameter to the feature validator, which
-                # allows us to pass embed_columns to the dataset properties.
-                # TODO: test the trade off
-                # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding`
-                # in one custom transformer. this will also allow users to use this transformer
-                # outside the pipeline
-                ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
-                                                                       value_range=dimension_reduction.value_range,
-                                                                       default_value=dimension_reduction.default_value,
-                                                                       log=dimension_reduction.log)
-                add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter)
+            if len(dataset_properties['categorical_columns']) > 0:
+                add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter)
+                add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter)
+                add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter)
+
         return cs

     @staticmethod
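To see the reduced space end to end, a hedged sketch: it assumes the component class in this file is importable as below and that a dict with a non-empty 'categorical_columns' entry is enough for get_hyperparameter_search_space.

# Assumed import path, following this file's location in the repo
from autoPyTorch.pipeline.components.setup.network_embedding.LearnedEntityEmbedding import (
    LearnedEntityEmbedding
)

# made-up minimal dataset properties; two categorical columns
cs = LearnedEntityEmbedding.get_hyperparameter_search_space(
    dataset_properties={'categorical_columns': [0, 1]}
)
print(cs)  # expected: embed_exponent, embedding_size_factor, max_embedding_dim

Compared with the removed per-column dimension_reduction_i hyperparameters, the space now has a fixed size of three, regardless of how many categorical columns the dataset has.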

test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py

Lines changed: 0 additions & 2 deletions
@@ -13,8 +13,6 @@
 )


-# TODO: fix in preprocessing PR
-# @pytest.mark.skip("Skipping tests as preprocessing is not finalised")
 @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only',
                                                     'classification_categorical_only',
                                                     'classification_numerical_and_categorical'], indirect=True)

test/test_pipeline/components/setup/test_setup_networks.py

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@ def head(request):
     return request.param


-# TODO: add 'LearnedEntityEmbedding' after preprocessing dix
 @pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding'])
 def embedding(request):
     return request.param

test/test_pipeline/test_tabular_regression.py

Lines changed: 5 additions & 10 deletions
@@ -61,11 +61,9 @@ def test_pipeline_fit(self, fit_dictionary_tabular):
         """This test makes sure that the pipeline is able to fit
         given random combinations of hyperparameters across the pipeline"""
         # TODO: fix issue where adversarial also works for regression
-        # TODO: Fix issue with learned entity embedding after preprocessing PR
         pipeline = TabularRegressionPipeline(
             dataset_properties=fit_dictionary_tabular['dataset_properties'],
-            exclude={'trainer': ['AdversarialTrainer'],
-                     'network_embedding': ['LearnedEntityEmbedding']})
+            exclude={'trainer': ['AdversarialTrainer']})
         cs = pipeline.get_hyperparameter_search_space()

         config = cs.sample_configuration()
@@ -91,8 +89,7 @@ def test_pipeline_predict(self, fit_dictionary_tabular):
         X = fit_dictionary_tabular['X_train'].copy()
         pipeline = TabularRegressionPipeline(
             dataset_properties=fit_dictionary_tabular['dataset_properties'],
-            exclude={'trainer': ['AdversarialTrainer'],
-                     'network_embedding': ['LearnedEntityEmbedding']})
+            exclude={'trainer': ['AdversarialTrainer']})

         cs = pipeline.get_hyperparameter_search_space()
         config = cs.sample_configuration()
@@ -121,8 +118,7 @@ def test_pipeline_transform(self, fit_dictionary_tabular):

         pipeline = TabularRegressionPipeline(
             dataset_properties=fit_dictionary_tabular['dataset_properties'],
-            exclude={'trainer': ['AdversarialTrainer'],
-                     'network_embedding': ['LearnedEntityEmbedding']})
+            exclude={'trainer': ['AdversarialTrainer']})
         cs = pipeline.get_hyperparameter_search_space()
         config = cs.sample_configuration()
         pipeline.set_hyperparameters(config)
@@ -139,11 +135,10 @@ def test_pipeline_transform(self, fit_dictionary_tabular):
         assert fit_dictionary_tabular.items() <= transformed_fit_dictionary_tabular.items()

         # Then the pipeline should have added the following keys
-        # Removing 'imputer', 'encoder', 'scaler', these will be
-        # TODO: added back after a PR fixing preprocessing
         expected_keys = {'tabular_transformer', 'preprocess_transforms', 'network',
                          'optimizer', 'lr_scheduler', 'train_data_loader',
-                         'val_data_loader', 'run_summary', 'feature_preprocessor'}
+                         'val_data_loader', 'run_summary', 'feature_preprocessor',
+                         'imputer', 'encoder', 'scaler'}
         assert expected_keys.issubset(set(transformed_fit_dictionary_tabular.keys()))

         # Then we need to have transformations being created.
