Commit 1d9ddc0

embedding compatible for time series tasks
1 parent c9f1ca7 commit 1d9ddc0

File tree

3 files changed: +38 -24 lines changed

autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py

Lines changed: 26 additions & 19 deletions

@@ -72,34 +72,45 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n
         self.ee_layers = self._create_ee_layers()
 
-    def get_partial_models(self, subset_features: List[int]) -> "_LearnedEntityEmbedding":
+    def get_partial_models(self,
+                           n_excl_embed_features: int,
+                           idx_embed_feat_partial: List[int]) -> "_LearnedEntityEmbedding":
         """
         Extract a partial model that only works on a subset of the data that ought to be passed to the embedding
         network. This function is implemented for time series forecasting tasks, where the known future features are
         only a subset of the past features.
 
         Args:
-            subset_features (List[int]):
-                a set of indices identifying which features will pass through the partial model
+            n_excl_embed_features (int):
+                number of unembedded features
+            idx_embed_feat_partial (List[int]):
+                a set of indices identifying which embedded features will be inherited by the partial model
 
         Returns:
             partial_model (_LearnedEntityEmbedding)
                 a new partial model
         """
-        num_input_features = self.num_categories_per_col[subset_features]
-        num_features_excl_embed = sum([sf < self.num_features_excl_embed for sf in subset_features])
+        n_partial_features = n_excl_embed_features + len(idx_embed_feat_partial)
 
-        num_output_dimensions = [self.num_output_dimensions[sf] for sf in subset_features]
-        embed_features = [self.embed_features[sf] for sf in subset_features]
+        num_categories_per_col = np.zeros(n_partial_features, dtype=np.int16)
+        num_output_dimensions = [1] * n_partial_features
 
         ee_layers = []
-        ee_layer_tracker = 0
-        for sf in subset_features:
-            if self.embed_features[sf]:
-                ee_layers.append(self.ee_layers[ee_layer_tracker])
-                ee_layer_tracker += 1
+        for idx, idx_embed in enumerate(idx_embed_feat_partial):
+            idx_raw = self.num_features_excl_embed + idx_embed
+            n_embed = self.num_categories_per_col[idx_raw]
+            n_output = self.num_output_dimensions[idx_raw]
+
+            idx_new = n_excl_embed_features + idx
+            num_categories_per_col[idx_new] = n_embed
+            num_output_dimensions[idx_new] = n_output
+
+            ee_layers.append(self.ee_layers[idx_embed])
 
         ee_layers = nn.ModuleList(ee_layers)
 
-        return PartialLearnedEntityEmbedding(num_input_features, num_features_excl_embed, embed_features,
+        embed_features = num_categories_per_col > 0
+
+        return PartialLearnedEntityEmbedding(num_categories_per_col, n_excl_embed_features, embed_features,
                                              num_output_dimensions, ee_layers)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:

@@ -108,10 +119,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         concat_seq = []
 
         layer_pointer = 0
-        # Time series tasks need to add targets to the embeddings. However, the target information is not recorded
-        # by autoPyTorch's embeddings. Therefore, we need to add the targets parts to `concat_seq` manually, which is
-        # the last few dimensions of the input x
-        # we assign x_pointer to 0 beforehand to avoid the case that self.embed_features has 0 length
+        # the embedding network is only applied to the last few feature columns, as flagged by self.embed_features
         x_pointer = 0
         for x_pointer, embed in enumerate(self.embed_features):
             if not embed:

@@ -121,9 +129,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             current_feature_slice = x[..., x_pointer]
             current_feature_slice = current_feature_slice.to(torch.int)
             concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice))
-            layer_pointer += 1
 
-        concat_seq.append(x[..., x_pointer:])
+            layer_pointer += 1
 
         return torch.cat(concat_seq, dim=-1)
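
The reworked get_partial_models no longer slices an arbitrary subset_features list: it assumes the partial model keeps n_excl_embed_features unembedded columns up front and inherits the already-trained embedding layers of the selected embedded features, while the reworked forward appends each column inside the loop instead of concatenating a trailing slice. A minimal usage sketch follows; the cardinalities are hypothetical and `config` is assumed to come from the pipeline's embedding hyperparameter configuration, neither is taken from this commit.

import numpy as np

# Hypothetical setup: 3 unembedded (numerical) columns followed by 4 embedded
# categorical columns with the cardinalities below; columns with cardinality 0
# bypass the embedding network.
full_embedding = _LearnedEntityEmbedding(
    config=config,  # assumed: embedding hyperparameter config from the search space
    num_categories_per_col=np.array([0, 0, 0, 5, 12, 7, 3]),
    num_features_excl_embed=3,
)

# Future-known features: 1 unembedded column plus embedded features 0 and 2
# (the columns with cardinalities 5 and 7). The partial model reuses their
# already-fitted embedding layers.
future_embedding = full_embedding.get_partial_models(
    n_excl_embed_features=1,
    idx_embed_feat_partial=[0, 2],
)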

autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from ConfigSpace.configuration_space import ConfigurationSpace

@@ -12,7 +12,7 @@
 
 class _NoEmbedding(nn.Module):
-    def get_partial_models(self, subset_features: List[int]) -> "_NoEmbedding":
+    def get_partial_models(self, **kwargs: Any) -> "_NoEmbedding":
         return self
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
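
Switching _NoEmbedding.get_partial_models to **kwargs keeps the two embedding variants call-compatible under the new signature: a caller can request a partial model without checking which module it holds. A sketch of such a call site, with a hypothetical `embedding` variable:

# Works whether `embedding` is a _NoEmbedding (returns itself, ignoring the
# arguments) or a _LearnedEntityEmbedding (builds a true partial model).
partial_embedding = embedding.get_partial_models(
    n_excl_embed_features=1,
    idx_embed_feat_partial=[0, 2],
)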

autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py

Lines changed: 10 additions & 3 deletions

@@ -20,6 +20,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None):
         self.embedding: Optional[nn.Module] = None
         self.random_state = random_state
         self.feature_shapes: Dict[str, int] = {}
+        self.embed_features_idx: Optional[Tuple] = None
 
     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

@@ -30,22 +31,28 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
             num_features_excl_embed=num_features_excl_embed
         )
         if "feature_shapes" in X['dataset_properties']:
+            n_features_embedded = len(num_categories_per_col) - num_features_excl_embed
             if num_output_features is not None:
                 feature_shapes = X['dataset_properties']['feature_shapes']
                 # forecasting tasks
                 feature_names = X['dataset_properties']['feature_names']
-                for idx_cat, n_output_cat in enumerate(num_output_features[num_features_excl_embed:]):
-                    cat_feature_name = feature_names[idx_cat + num_features_excl_embed]
-                    feature_shapes[cat_feature_name] = n_output_cat
+                n_features_all = len(feature_names)
+                embed_features_idx = tuple(range(n_features_all - n_features_embedded, n_features_all))
+                for idx, n_output_embedded in zip(embed_features_idx, num_output_features[-n_features_embedded:]):
+                    feat_name = feature_names[idx]
+                    feature_shapes[feat_name] = n_output_embedded
+                self.embed_features_idx = embed_features_idx
                 self.feature_shapes = feature_shapes
             else:
                 self.feature_shapes = X['dataset_properties']['feature_shapes']
+                self.embed_features_idx = []
         return self
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         X.update({'network_embedding': self.embedding})
         if "feature_shapes" in X['dataset_properties']:
             X['dataset_properties'].update({"feature_shapes": self.feature_shapes})
+            X['embed_features_idx'] = self.embed_features_idx
         return X
 
     def build_embedding(self,
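
With these changes, fit records which trailing feature columns are embedded and transform publishes that index tuple alongside the fitted embedding, so downstream forecasting components can map embedded columns back to feature names. A rough sketch of what a later pipeline step might read; `embedding_component` is a hypothetical name for the fitted embedding setup component:

# `X` is the pipeline's fit dictionary after the embedding component ran.
X = embedding_component.transform(X)

network_embedding = X['network_embedding']    # the fitted embedding nn.Module
embed_features_idx = X['embed_features_idx']  # e.g. (3, 4, 5, 6): trailing embedded columns
feature_shapes = X['dataset_properties']['feature_shapes']  # feature name -> output width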
