Skip to content

Commit c2ffc02

Browse files
committed
adapt new pipeline for time-series tasks
1 parent 2839caa commit c2ffc02

File tree

7 files changed

+50
-5
lines changed

7 files changed

+50
-5
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from typing import Any, Dict, Optional
2+
3+
import numpy as np
4+
5+
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import (
6+
ColumnSplitter
7+
)
8+
from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import \
9+
autoPyTorchTimeSeriesPreprocessingComponent
10+
11+
12+
class TimeSeriesColumnSplitter(ColumnSplitter, autoPyTorchTimeSeriesPreprocessingComponent):
    """Column splitter for time-series pipelines.

    Behaves like the tabular ``ColumnSplitter`` (each categorical column is
    routed to either the embedding or the encoding path, controlled by
    ``min_categories_for_embedding``), but additionally snapshots the raw
    per-column category counts during ``fit`` so that downstream components
    can still read them after encoding has rewritten ``dataset_properties``.
    """

    def __init__(
        self,
        min_categories_for_embedding: float = 5,
        random_state: Optional[np.random.RandomState] = None
    ):
        # All actual splitting logic lives in the tabular parent class.
        super().__init__(min_categories_for_embedding, random_state)
        # Populated in fit(); holds the pre-encoding category counts.
        self.num_categories_per_col_encoded = None

    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'TimeSeriesColumnSplitter':
        """Fit the tabular splitter, then record the raw category counts."""
        super().fit(X, y)
        dataset_properties = X['dataset_properties']
        self.num_categories_per_col_encoded = dataset_properties['num_categories_per_col']
        return self

    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        """Run the tabular transform and expose the saved category counts."""
        transformed = super().transform(X)
        transformed['dataset_properties']['num_categories_per_col_encoded'] = self.num_categories_per_col_encoded
        return transformed

autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/column_spliting/__init__.py

Whitespace-only changes.

autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,13 @@ def __init__(self,
1919
def fit(self, X: Dict[str, Any], y: Any = None) -> TimeSeriesBaseEncoder:
2020
OneHotEncoder.fit(self, X, y)
2121
categorical_columns = X['dataset_properties']['categorical_columns']
22-
num_categories_per_col = X['dataset_properties']['num_categories_per_col']
22+
if 'num_categories_per_col_encoded' in X['dataset_properties']:
23+
num_categories_per_col = X['dataset_properties']['num_categories_per_col_encoded']
24+
else:
25+
num_categories_per_col = X['dataset_properties']['num_categories_per_col']
2326
feature_names = X['dataset_properties']['feature_names']
2427
feature_shapes = X['dataset_properties']['feature_shapes']
2528

26-
if len(num_categories_per_col) == 0:
27-
num_categories_per_col = [len(cat) for cat in self.preprocessor['categorical'].categories] # type: ignore
2829
for i, cat_column in enumerate(categorical_columns):
2930
feature_shapes[feature_names[cat_column]] = num_categories_per_col[i]
3031
self.feature_shapes = feature_shapes

autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44

55
from autoPyTorch.pipeline.components.base_component import (
66
ThirdPartyComponents, autoPyTorchComponent, find_components)
7-
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import \
8-
EncoderChoice
7+
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice
98
from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import \
109
TimeSeriesBaseEncoder
1110

autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
5656
categorical_columns = X['dataset_properties']['categorical_columns']
5757

5858
# resort feature_names
59+
# Previously, the categorical features were sorted before the numerical features. However,
60+
# after preprocessing, the numerical features are placed first.
5961
new_feature_names = [feature_names[num_col] for num_col in numerical_columns]
6062
new_feature_names += [feature_names[cat_col] for cat_col in categorical_columns]
6163
if set(feature_names) != set(new_feature_names):

autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarr
7070
number of categories for categorical columns and
7171
0 for numerical columns
7272
"""
73+
if X['dataset_properties']['target_type'] == 'time_series_forecasting' \
74+
and X['dataset_properties'].get('uni_variant', False):
75+
# For uni_variant time series forecasting tasks, we don't have the related information for embeddings
76+
return 0, np.asarray([])
77+
7378
num_cols = X['shape_after_preprocessing']
7479
# only works for 2D(rows, features) tabular data
7580
num_features_excl_embed = num_cols[0] - len(X['embed_columns'])

autoPyTorch/pipeline/time_series_forecasting.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import (
2727
TimeSeriesFeatureTransformer
2828
)
29+
from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.column_spliting.ColumnSplitter import (
30+
TimeSeriesColumnSplitter
31+
)
2932
from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import TimeSeriesEncoderChoice
3033
from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import (
3134
TimeSeriesFeatureImputer,
@@ -333,6 +336,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L
333336
if not default_dataset_properties.get("uni_variant", False):
334337
steps.extend([("impute", TimeSeriesFeatureImputer(random_state=self.random_state)),
335338
("scaler", BaseScaler(random_state=self.random_state)),
339+
("column_splitter", TimeSeriesColumnSplitter(random_state=self.random_state)),
336340
('feature_encoding', TimeSeriesEncoderChoice(default_dataset_properties,
337341
random_state=self.random_state)),
338342
("time_series_transformer", TimeSeriesFeatureTransformer(random_state=self.random_state)),

0 commit comments

Comments
 (0)