-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[enhancement] Add Scaler module (#50)
* Add quantification notebook * Add support for more darts regression models * Revert "Add quantification notebook" This reverts commit 348deb2. * fix docstring * rename notebook * Add Standard Scaler to experiment * Move scaling to a separate module * Add scaling level and support for multiple yhats * Raise with raise_if" * Add docstrings to Scaler * Import Scaler in __init__ * Test per_dataset and per_time_series scaling * Update custom pipeline tutorial * Fix typing * Streamline the choice of scaling level
- Loading branch information
Showing
5 changed files
with
506 additions
and
139 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import logging | ||
import os | ||
import pathlib | ||
|
||
import pandas as pd | ||
from sklearn.preprocessing import StandardScaler | ||
|
||
from tot.benchmark import SimpleBenchmark | ||
from tot.datasets.dataset import Dataset | ||
from tot.models.models_neuralprophet import NeuralProphetModel | ||
|
||
# --- Test configuration: logging, data paths, and model hyperparameters ---

# Quiet the test logger (and its parent) so benchmark runs don't spam output.
log = logging.getLogger("tot.test")
log.setLevel("WARNING")
log.parent.setLevel("WARNING")

# Resolve dataset and log directories relative to the repository root
# (this file is assumed to live one level below the root, e.g. in tests/).
DIR = pathlib.Path(__file__).parent.parent.absolute()
DATA_DIR = os.path.join(DIR, "datasets")
AIR_FILE = os.path.join(DATA_DIR, "air_passengers.csv")
ERCOT_FILE = os.path.join(DATA_DIR, "ercot_load_reduced.csv")
SAVE_DIR = os.path.join(DIR, "tests", "test-logs")
# exist_ok makes this idempotent and race-free, replacing the isdir check.
os.makedirs(SAVE_DIR, exist_ok=True)

# Prophet is an optional dependency; tests that need it should check
# `_prophet_installed` before using `Prophet`.
try:
    from prophet import Prophet

    _prophet_installed = True
except ImportError:
    Prophet = None
    _prophet_installed = False

# Small values keep CI runs fast.
NROWS = 128  # rows read per dataset
EPOCHS = 2
BATCH_SIZE = 64
LR = 1.0
ERCOT_REGIONS = ["NORTH", "EAST", "FAR_WEST"]

PLOT = False
|
||
|
||
def test_scaling_per_dataset():
    """Benchmark NeuralProphet with a StandardScaler fitted once per dataset."""
    # Build a reduced ERCOT frame: the first NROWS rows of each selected region.
    raw_ercot = pd.read_csv(ERCOT_FILE)
    region_frames = [
        raw_ercot[raw_ercot["ID"] == region].iloc[:NROWS].copy(deep=True)
        for region in ERCOT_REGIONS
    ]
    ercot_df = pd.concat(region_frames, ignore_index=True)
    air_passengers_df = pd.read_csv(AIR_FILE, nrows=NROWS)

    dataset_list = [
        Dataset(
            df=air_passengers_df,
            name="air_passengers",
            freq="MS",
            seasonality_mode="multiplicative",
        ),
        Dataset(
            df=ercot_df,
            name="ercot",
            freq="H",
        ),
    ]
    model_classes_and_params = [
        (
            NeuralProphetModel,
            {
                "scaler": StandardScaler(),
                "scaling_level": "per_dataset",
                "n_lags": 5,
                "n_forecasts": 3,
                "learning_rate": 0.1,
                # Disable NeuralProphet's built-in normalization so only the
                # external Scaler affects the data.
                "normalize": "off",
            },
        ),
    ]
    log.debug("{}".format(model_classes_and_params))

    benchmark = SimpleBenchmark(
        model_classes_and_params=model_classes_and_params,
        datasets=dataset_list,
        metrics=["MAE"],
        test_percentage=0.25,
    )
    results_train, results_test = benchmark.run()

    log.debug("{}".format(results_test))
||
|
||
def test_scaling_per_time_series():
    """Benchmark NeuralProphet with a StandardScaler fitted separately per time series."""
    # Collect the first NROWS rows of each ERCOT region, then concatenate once.
    ercot_source = pd.read_csv(ERCOT_FILE)
    region_chunks = []
    for region in ERCOT_REGIONS:
        chunk = ercot_source[ercot_source["ID"] == region].iloc[:NROWS]
        region_chunks.append(chunk.copy(deep=True))
    ercot_df = pd.concat(region_chunks, ignore_index=True)
    air_passengers_df = pd.read_csv(AIR_FILE, nrows=NROWS)

    dataset_list = [
        Dataset(
            df=air_passengers_df,
            name="air_passengers",
            freq="MS",
            seasonality_mode="multiplicative",
        ),
        Dataset(
            df=ercot_df,
            name="ercot",
            freq="H",
        ),
    ]
    model_classes_and_params = [
        (
            NeuralProphetModel,
            {
                "scaler": StandardScaler(),
                # Local scaling: one fit per ID column.
                "scaling_level": "per_time_series",
                "n_lags": 5,
                "n_forecasts": 3,
                "learning_rate": 0.1,
                # Avoid double-normalizing on top of the external Scaler.
                "normalize": "off",
            },
        ),
    ]
    log.debug("{}".format(model_classes_and_params))

    benchmark = SimpleBenchmark(
        model_classes_and_params=model_classes_and_params,
        datasets=dataset_list,
        metrics=["MAE"],
        test_percentage=0.25,
    )
    results_train, results_test = benchmark.run()

    log.debug("{}".format(results_test))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .scaler import Scaler # noqa: F401 to evade flake8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
from dataclasses import dataclass | ||
from typing import Callable, Tuple | ||
|
||
import pandas as pd | ||
|
||
from tot.error_utils import raise_if | ||
|
||
|
||
def _pivot(df, col_name): | ||
return df.pivot(index="ds", columns="ID", values=col_name).rename_axis(columns=None).reset_index() | ||
|
||
|
||
def _melt(df, IDs, col_name): | ||
return pd.melt(df, id_vars="ds", value_vars=IDs, var_name="ID", value_name=col_name) | ||
|
||
|
||
SCALING_LEVELS = ["per_dataset", "per_time_series"] | ||
|
||
|
||
@dataclass
class Scaler:
    """
    A scaling module allowing to perform transform and inverse_transform operations on the time series data.

    Supports transformers from the `sklearn.preprocessing` package and other scalers implementing `fit`,
    `transform` and `inverse_transform` methods. See: https://scikit-learn.org/stable/modules/preprocessing.html
    `scaling_level` specifies global ("per_dataset") or local scaling ("per_time_series").

    Examples
    --------
    >>> from sklearn.preprocessing import StandardScaler
    >>> scaler = Scaler(transformer=StandardScaler(), scaling_level="per_dataset")
    >>> df_train, df_test = scaler.transform(df_train, df_test)
    >>> fcst_train, fcst_test = scaler.inverse_transform(fcst_train, fcst_test)
    """

    # Transformer object exposing fit/transform/inverse_transform (validated in __post_init__).
    transformer: object
    # One of SCALING_LEVELS: "per_dataset" or "per_time_series".
    scaling_level: str

    def __post_init__(self):
        # Fail fast at construction: validate the transformer API and the scaling level
        # so misconfiguration surfaces immediately rather than mid-benchmark.
        is_transformer_valid = (
            callable(getattr(self.transformer, "fit", None))
            and callable(getattr(self.transformer, "transform", None))
            and callable(getattr(self.transformer, "inverse_transform", None))
        )
        raise_if(
            not is_transformer_valid,
            "Transformer provided to the Scaler must implement fit, transform and " "inverse_transform methods",
        )
        raise_if(
            self.scaling_level not in SCALING_LEVELS,
            "Invalid scaling level. Available levels: `per_dataset`, " "`per_time_series`",
        )

    def _scale_per_series(self, df: pd.DataFrame, fit: bool = False) -> pd.DataFrame:
        """
        Applies `transform` per series. Fits the `transformer` if `fit` set to True. First, pivot is performed
        on the dataframe so that unique `ID`s become columns, then the transformation is applied to the df's
        values (each ID column is treated as a separate feature). Data is returned in its original long format.

        Parameters:
        -----------
        df : pd.DataFrame
            dataframe containing column ``ds``, ``y``, and ``ID`` with data
            (an ``ID`` column is required here — this path pivots on it)
        fit : bool
            if set to True Scaler is fitted on data from `df`

        Returns:
        --------
        pd.DataFrame
            dataframe containing column ``ds``, ``y``, and ``ID`` with transformed data
        """
        IDs = df["ID"].unique()
        df_pivot = _pivot(df, "y")

        if fit:
            self.transformer.fit(df_pivot[IDs])
        df_pivot[IDs] = self.transformer.transform(df_pivot[IDs])

        return _melt(df_pivot, IDs, "y")

    def _scale(self, df: pd.DataFrame, fit: bool = False) -> pd.DataFrame:
        """
        Applies `transform` on `y` column in `df` (in place, as a single feature). Fits the `transformer`
        if `fit` set to True.

        Parameters:
        -----------
        df : pd.DataFrame
            dataframe containing column ``ds``, ``y``, and optionally ``ID`` with data
        fit : bool
            if set to True Scaler is fitted with data from `df`

        Returns:
        --------
        pd.DataFrame
            dataframe containing column ``ds``, ``y``, and optionally ``ID`` with transformed data
        """
        if fit:
            self.transformer.fit(df["y"].values.reshape(-1, 1))
        df["y"] = self.transformer.transform(df["y"].values.reshape(-1, 1))
        return df

    def _rescale_per_series(self, df: pd.DataFrame, col_name: str) -> pd.DataFrame:
        """
        Applies `inverse_transform` per series. First, pivot is performed on the dataframe so that unique `ID`s
        become columns, then inverse transformation is applied. Data is returned in its original long format
        (a new dataframe; the input is not modified).

        Parameters:
        -----------
        df : pd.DataFrame
            dataframe containing column ``ds``, ``y``, [``yhat<i>``], and ``ID`` with data
        col_name : str
            name of the column, on which the operation is applied

        Returns:
        --------
        pd.DataFrame
            dataframe containing column ``ds``, ``y``, [``yhat<i>``], and ``ID`` with rescaled data
        """
        IDs = df["ID"].unique()
        df_pivot = _pivot(df, col_name)

        df_pivot[IDs] = self.transformer.inverse_transform(df_pivot[IDs])

        return _melt(df_pivot, IDs, col_name)

    def _rescale(self, df: pd.DataFrame, col_name: str) -> pd.DataFrame:
        """
        Applies `inverse_transform` on column `col_name` in `df` (in place).

        Parameters:
        -----------
        df : pd.DataFrame
            dataframe containing column ``ds``, ``y``, [``yhat<i>``], and optionally ``ID`` with data
        col_name : str
            name of the column, on which the operation is applied

        Returns:
        --------
        pd.DataFrame
            dataframe containing column ``ds``, ``y``, [``yhat<i>``], and optionally ``ID`` with rescaled data
        """
        df[col_name] = self.transformer.inverse_transform(df[col_name].values.reshape(-1, 1))
        return df

    def _inverse_transform(self, df: pd.DataFrame, rescale_method: Callable) -> pd.DataFrame:
        """
        Applies rescaling on the `df`. First, rescaling is performed on the `y` column to create the main df.
        Then, the operation is repeated on all `yhat<i>` columns and results are updated in the main df. The
        proper `rescale` implementation is chosen by the caller based on scaling level.

        Parameters:
        -----------
        df : pd.DataFrame
            dataframe containing column ``ds``, ``y``, [``yhat<i>``] and optionally ``ID`` with data
        rescale_method : Callable
            either `_rescale` or `_rescale_per_series`, taking `(df, col_name)` and returning a dataframe

        Returns:
        --------
        pd.DataFrame
            dataframe containing column ``ds``, ``y``, [``yhat<i>``] and optionally ``ID`` with rescaled data
        """
        result = rescale_method(df, "y")

        # Rescale every forecast column and copy it into the result frame
        # (per-series rescaling returns a new frame, so columns are merged here).
        yhats = [col for col in df.columns if "yhat" in col]
        for yhat in yhats:
            result[yhat] = rescale_method(df, yhat)[yhat]

        return result

    def transform(self, df_train: pd.DataFrame, df_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Applies `transform` on the dataframes. The transformer is fit on the `df_train` only, then applied
        to both frames; the inputs are copied and not modified.

        Parameters:
        -----------
        df_train : pd.DataFrame
            dataframe containing column ``ds``, ``y``, and optionally ``ID`` with train data
        df_test : pd.DataFrame
            dataframe containing column ``ds``, ``y``, and optionally ``ID`` with test data

        Returns:
        --------
        pd.DataFrame
            dataframe containing column ``ds``, ``y``, and optionally ``ID`` with scaled train data
        pd.DataFrame
            dataframe containing column ``ds``, ``y``, and optionally ``ID`` with scaled test data
        """
        df_train = df_train.copy()
        df_test = df_test.copy()
        if self.scaling_level == "per_time_series":
            return self._scale_per_series(df_train, fit=True), self._scale_per_series(df_test)

        return self._scale(df_train, fit=True), self._scale(df_test)

    def inverse_transform(self, df_train: pd.DataFrame, df_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Applies `inverse_transform` on the dataframes, undoing a previous `transform` on ``y`` and all
        ``yhat<i>`` columns; the inputs are copied and not modified.

        Parameters:
        -----------
        df_train : pd.DataFrame
            dataframe containing column ``ds``, ``y``,[``yhat<i>``], and optionally ``ID`` with train results
        df_test : pd.DataFrame
            dataframe containing column ``ds``, ``y``,[``yhat<i>``], and optionally ``ID`` with test results

        Returns:
        --------
        pd.DataFrame
            dataframe containing column ``ds``, ``y``,[``yhat<i>``], and optionally ``ID`` with rescaled train results
        pd.DataFrame
            dataframe containing column ``ds``, ``y``,[``yhat<i>``], and optionally ``ID`` with rescaled test results
        """
        df_train = df_train.copy()
        df_test = df_test.copy()
        if self.scaling_level == "per_time_series":
            rescale_method = self._rescale_per_series
        else:
            rescale_method = self._rescale
        return self._inverse_transform(df_train, rescale_method), self._inverse_transform(df_test, rescale_method)
Oops, something went wrong.