diff --git a/datasets/__init__.py b/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datasets/data.py b/datasets/data.py new file mode 100644 index 0000000..f73410d --- /dev/null +++ b/datasets/data.py @@ -0,0 +1,818 @@ +from typing import Optional +import os +from multiprocessing import Pool, cpu_count +import glob +import re +import logging +from itertools import repeat, chain + +import numpy as np +import pandas as pd +from tqdm import tqdm +from sktime.utils import load_data +from sklearn import preprocessing +from datetime import datetime + +import matplotlib.pyplot as plt +from datasets import utils + +logger = logging.getLogger('__main__') + +class Normalizer(object): + """ + Normalizes dataframe across ALL contained rows (time steps). Different from per-sample normalization. + """ + + def __init__(self, norm_type, mean=None, std=None, min_val=None, max_val=None): + """ + Args: + norm_type: choose from: + "standardization", "minmax": normalizes dataframe across ALL contained rows (time steps) + "per_sample_std", "per_sample_minmax": normalizes each sample separately (i.e. across only its own rows) + mean, std, min_val, max_val: optional (num_feat,) Series of pre-computed values + """ + + self.norm_type = norm_type + self.mean = mean + self.std = std + self.min_val = min_val + self.max_val = max_val + self.types = None + self.labelEncoder = {} + + def normalize(self, df): + """ + Args: + df: input dataframe + Returns: + df: normalized dataframe + """ + # logger.info(" Normalize : shape {} \n columns {}", .format(df.shape, df.head())) + self.types = df.dtypes + # if any feature in Numeric then done label-encoding; + charcolumns = df.columns[self.types == "object"] + for cols in charcolumns: + self.labelEncoder[cols] = preprocessing.LabelEncoder().fit(df[cols]) + df[cols] = self.labelEncoder[cols].transform(df[cols]) + + if self.norm_type == "standardization": + if self.mean is None: + self.mean = df.mean() + self.std = df.std() + return (df - self.mean) / (self.std + np.finfo(float).eps) + + elif self.norm_type == "minmax": + if self.max_val is None: + self.max_val = df.max() + self.min_val = df.min() + logger.info("normalize: \nmax {} \n min{} ".format(self.max_val, self.min_val)) + return (df - self.min_val) / (self.max_val - self.min_val + np.finfo(float).eps) + + elif self.norm_type == "per_sample_std": + grouped = df.groupby(by=df.index) + self.groupedidx = grouped.indices + self.groupedmean = grouped.transform('mean') + self.groupedstddev = grouped.transform('std') + return (df - self.groupedmean) / self.groupedstddev + + elif self.norm_type == "per_sample_minmax": + grouped = df.groupby(by=df.index) + self.groupedidx = grouped.indices + self.groupedmin_vals = grouped.transform('min') + self.groupedmax_vals = grouped.transform('max') + return (df - self.groupedmin_vals) / (self.groupedmax_vals - self.groupedmin_vals + np.finfo(float).eps) + + else: + raise (NameError(f'Normalize method "{self.norm_type}" not implemented')) + + def inverse_normalize(self, df:pd.DataFrame=None): + """reverses the transformation done with the method that was used normalize function above. 
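# Illustrative usage sketch for the Normalizer class above (editor's example, not code from this
# patch). The toy dataframe and its column names are hypothetical; only Normalizer itself is real.
import pandas as pd
from datasets.data import Normalizer

toy = pd.DataFrame({"current": [1.0, 2.0, 3.0], "voltage": [10.0, 20.0, 30.0]})
norm = Normalizer("minmax")
scaled = norm.normalize(toy.copy())   # each column rescaled to roughly [0, 1]
# inverse_normalize(scaled) is intended to map the values back to the original scale.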
+
+        Args:
+            df: dataframe with (a subset of) the same column names that were passed to `normalize`
+        """
+        # invert the numeric scaling first, then reverse the label encoding of the character columns
+        if self.norm_type == "standardization":
+            assert set(self.mean.index).issuperset(set(df.columns))
+            means = self.mean[df.columns]
+            stds = self.std[df.columns]
+            dfout = pd.DataFrame(df * (stds + np.finfo(float).eps) + means)
+
+        elif self.norm_type == "minmax":
+            assert set(self.max_val.index).issuperset(set(df.columns))
+            minvals = self.min_val[df.columns]
+            maxvals = self.max_val[df.columns]
+            dfout = pd.DataFrame((maxvals - minvals + np.finfo(float).eps) * df + minvals)
+
+        elif self.norm_type == "per_sample_std":
+            idx = df.index
+            means = self.groupedmean.loc[idx, df.columns]
+            stds = self.groupedstddev.loc[idx, df.columns]
+            dfout = pd.DataFrame(df * stds + means)
+
+        elif self.norm_type == "per_sample_minmax":
+            idx = df.index
+            min_vals = self.groupedmin_vals.loc[idx, df.columns]
+            max_vals = self.groupedmax_vals.loc[idx, df.columns]
+            dfout = pd.DataFrame(df * (max_vals - min_vals + np.finfo(float).eps) + min_vals)
+
+        else:
+            raise (NameError(f'Normalize method "{self.norm_type}" not implemented'))
+
+        dfout.columns = df.columns
+        charcolumns = [col for col in dfout.columns if self.types[col] == "object"]
+        for cols in charcolumns:
+            # label codes come back as floats after inverse scaling; round to valid integer codes first
+            dfout[cols] = self.labelEncoder[cols].inverse_transform(dfout[cols].round().astype(int))
+
+        return dfout
+
+
+def interpolate_missing(y):
+    """
+    Replaces NaN values in pd.Series `y` using linear interpolation
+    """
+    if y.isna().any():
+        y = y.interpolate(method='linear', limit_direction='both')
+    return y
+
+
+def subsample(y, limit=256, factor=2):
+    """
+    If a given Series is longer than `limit`, returns a sequence subsampled by the specified integer factor
+    """
+    if len(y) > limit:
+        return y[::factor].reset_index(drop=True)
+    return y
+
+
+class BaseData(object):
+
+    def set_num_processes(self, n_proc):
+
+        if (n_proc is None) or (n_proc <= 0):
+            self.n_proc = cpu_count()  # max(1, cpu_count() - 1)
+        else:
+            self.n_proc = min(n_proc, cpu_count())
+
+
+class WeldData(BaseData):
+    """
+    Dataset class for welding dataset.
+    Attributes:
+        all_df: dataframe indexed by ID, with multiple rows corresponding to the same index (sample).
+            Each row is a time step; each column contains either metadata (e.g. timestamp) or a feature.
+        feature_df: contains the subset of columns of `all_df` which correspond to selected features
+        feature_names: names of columns contained in `feature_df` (same as feature_df.columns)
+        all_IDs: IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() )
+        max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used.
+ (Moreover, script argument overrides this attribute) + """ + + def __init__(self, root_dir, file_list=None, pattern=None, n_proc=1, limit_size=None, config=None): + + self.set_num_processes(n_proc=n_proc) + + self.all_df = self.load_all(root_dir, file_list=file_list, pattern=pattern) + self.all_df = self.all_df.sort_values(by=['weld_record_index']) # datasets is presorted + # TODO: There is a single ID that causes the model output to become nan - not clear why + self.all_df = self.all_df[self.all_df['weld_record_index'] != 920397] # exclude particular ID + self.all_df = self.all_df.set_index('weld_record_index') + self.all_IDs = self.all_df.index.unique() # all sample (session) IDs + self.max_seq_len = 66 + if limit_size is not None: + if limit_size > 1: + limit_size = int(limit_size) + else: # interpret as proportion if in (0, 1] + limit_size = int(limit_size * len(self.all_IDs)) + self.all_IDs = self.all_IDs[:limit_size] + self.all_df = self.all_df.loc[self.all_IDs] + + self.feature_names = ['wire_feed_speed', 'current', 'voltage', 'motor_current', 'power'] + self.feature_df = self.all_df[self.feature_names] + + def load_all(self, root_dir, file_list=None, pattern=None): + """ + Loads datasets from csv files contained in `root_dir` into a dataframe, optionally choosing from `pattern` + Args: + root_dir: directory containing all individual .csv files + file_list: optionally, provide a list of file paths within `root_dir` to consider. + Otherwise, entire `root_dir` contents will be used. + pattern: optionally, apply regex string to select subset of files + Returns: + all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files + """ + # each file name corresponds to another date. Also tools (A, B) and others. + + # Select paths for training and evaluation + if file_list is None: + data_paths = glob.glob(os.path.join(root_dir, '*')) # list of all paths + else: + data_paths = [os.path.join(root_dir, p) for p in file_list] + if len(data_paths) == 0: + raise Exception('No files found using: {}'.format(os.path.join(root_dir, '*'))) + + if pattern is None: + # by default evaluate on + selected_paths = data_paths + else: + selected_paths = list(filter(lambda x: re.search(pattern, x), data_paths)) + + input_paths = [p for p in selected_paths if os.path.isfile(p) and p.endswith('.csv')] + if len(input_paths) == 0: + raise Exception("No .csv files found using pattern: '{}'".format(pattern)) + + if self.n_proc > 1: + # Load in parallel + _n_proc = min(self.n_proc, len(input_paths)) # no more than file_names needed here + logger.info("Loading {} datasets files using {} parallel processes ...".format(len(input_paths), _n_proc)) + with Pool(processes=_n_proc) as pool: + all_df = pd.concat(pool.map(WeldData.load_single, input_paths)) + else: # read 1 file at a time + all_df = pd.concat(WeldData.load_single(path) for path in input_paths) + + return all_df + + @staticmethod + def load_single(filepath): + df = WeldData.read_data(filepath) + df = WeldData.select_columns(df) + num_nan = df.isna().sum().sum() + if num_nan > 0: + logger.warning("{} nan values in {} will be replaced by 0".format(num_nan, filepath)) + df = df.fillna(0) + + return df + + @staticmethod + def read_data(filepath): + """Reads a single .csv, which typically contains a day of datasets of various weld sessions. 
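# Illustrative construction sketch for WeldData (editor's example, not code from this patch).
# The directory path and the 'train' filename pattern are assumptions.
# from datasets.data import WeldData
# data = WeldData(root_dir="/path/to/weld_csvs", pattern="train", n_proc=4)
# data.feature_df                # long dataframe of the 5 selected features, indexed by weld_record_index
# data.feature_df.loc[some_id]   # one (seq_len, 5) sample, where some_id is any value in data.all_IDs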
+ """ + df = pd.read_csv(filepath) + return df + + @staticmethod + def select_columns(df): + """""" + df = df.rename(columns={"per_energy": "power"}) + # Sometimes 'diff_time' is not measured correctly (is 0), and power ('per_energy') becomes infinite + is_error = df['power'] > 1e16 + df.loc[is_error, 'power'] = df.loc[is_error, 'true_energy'] / df['diff_time'].median() + + df['weld_record_index'] = df['weld_record_index'].astype(int) + keep_cols = ['weld_record_index', 'wire_feed_speed', 'current', 'voltage', 'motor_current', 'power'] + df = df[keep_cols] + + return df + + +class TSRegressionArchive(BaseData): + """ + Dataset class for datasets included in: + 1) the Time Series Regression Archive (www.timeseriesregression.org), or + 2) the Time Series Classification Archive (www.timeseriesclassification.com) + Attributes: + all_df: (num_samples * seq_len, num_columns) dataframe indexed by integer indices, with multiple rows corresponding to the same index (sample). + Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature. + feature_df: (num_samples * seq_len, feat_dim) dataframe; contains the subset of columns of `all_df` which correspond to selected features + feature_names: names of columns contained in `feature_df` (same as feature_df.columns) + all_IDs: (num_samples,) series of IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() ) + labels_df: (num_samples, num_labels) pd.DataFrame of label(s) for each sample + max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used. + (Moreover, script argument overrides this attribute) + """ + + def __init__(self, root_dir, file_list=None, pattern=None, n_proc=1, limit_size=None, config=None): + + #self.set_num_processes(n_proc=n_proc) + + self.config = config + + self.all_df, self.labels_df = self.load_all(root_dir, file_list=file_list, pattern=pattern) + self.all_IDs = self.all_df.index.unique() # all sample IDs (integer indices 0 ... num_samples-1) + + if limit_size is not None: + if limit_size > 1: + limit_size = int(limit_size) + else: # interpret as proportion if in (0, 1] + limit_size = int(limit_size * len(self.all_IDs)) + self.all_IDs = self.all_IDs[:limit_size] + self.all_df = self.all_df.loc[self.all_IDs] + + # use all features + self.feature_names = self.all_df.columns + self.feature_df = self.all_df + + def load_all(self, root_dir, file_list=None, pattern=None): + """ + Loads datasets from csv files contained in `root_dir` into a dataframe, optionally choosing from `pattern` + Args: + root_dir: directory containing all individual .csv files + file_list: optionally, provide a list of file paths within `root_dir` to consider. + Otherwise, entire `root_dir` contents will be used. 
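# Illustrative construction sketch for TSRegressionArchive (editor's example, not code from this
# patch). The archive path and file name are hypothetical; the config keys 'task' and
# 'subsample_factor' are the ones read by load_single below.
# config = {"task": "classification", "subsample_factor": None}
# data = TSRegressionArchive("/path/to/archive", file_list=["SomeDataset_TRAIN.ts"], config=config)
# data.feature_df   # (num_samples * seq_len, feat_dim) dataframe indexed by integer sample IDs
# data.labels_df    # (num_samples, 1) integer class codes (or floats for regression)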
+ pattern: optionally, apply regex string to select subset of files + Returns: + all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files + labels_df: dataframe containing label(s) for each sample + """ + + # Select paths for training and evaluation + if file_list is None: + data_paths = glob.glob(os.path.join(root_dir, '*')) # list of all paths + else: + data_paths = [os.path.join(root_dir, p) for p in file_list] + if len(data_paths) == 0: + raise Exception('No files found using: {}'.format(os.path.join(root_dir, '*'))) + + if pattern is None: + # by default evaluate on + selected_paths = data_paths + else: + selected_paths = list(filter(lambda x: re.search(pattern, x), data_paths)) + + input_paths = [p for p in selected_paths if os.path.isfile(p) and p.endswith('.ts')] + if len(input_paths) == 0: + raise Exception("No .ts files found using pattern: '{}'".format(pattern)) + + all_df, labels_df = self.load_single(input_paths[0]) # a single file contains dataset + + return all_df, labels_df + + def load_single(self, filepath): + + # Every row of the returned df corresponds to a sample; + # every column is a pd.Series indexed by timestamp and corresponds to a different dimension (feature) + if self.config['task'] == 'regression': + df, labels = utils.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True, replace_missing_vals_with='NaN') + labels_df = pd.DataFrame(labels, dtype=np.float32) + elif self.config['task'] == 'classification': + df, labels = load_data.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True, replace_missing_vals_with='NaN') + labels = pd.Series(labels, dtype="category") + self.class_names = labels.cat.categories + labels_df = pd.DataFrame(labels.cat.codes, dtype=np.int8) # int8-32 gives an error when using nn.CrossEntropyLoss + else: # e.g. 
imputation + try: + data = load_data.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True, + replace_missing_vals_with='NaN') + if isinstance(data, tuple): + df, labels = data + else: + df = data + except: + df, _ = utils.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True, + replace_missing_vals_with='NaN') + labels_df = None + + lengths = df.applymap(lambda x: len(x)).values # (num_samples, num_dimensions) array containing the length of each series + horiz_diffs = np.abs(lengths - np.expand_dims(lengths[:, 0], -1)) + + # most general check: len(np.unique(lengths.values)) > 1: # returns array of unique lengths of sequences + if np.sum(horiz_diffs) > 0: # if any row (sample) has varying length across dimensions + logger.warning("Not all time series dimensions have same length - will attempt to fix by subsampling first dimension...") + df = df.applymap(subsample) # TODO: this addresses a very specific case (PPGDalia) + + if self.config['subsample_factor']: + df = df.applymap(lambda x: subsample(x, limit=0, factor=self.config['subsample_factor'])) + + lengths = df.applymap(lambda x: len(x)).values + vert_diffs = np.abs(lengths - np.expand_dims(lengths[0, :], 0)) + if np.sum(vert_diffs) > 0: # if any column (dimension) has varying length across samples + self.max_seq_len = int(np.max(lengths[:, 0])) + logger.warning("Not all samples have same length: maximum length set to {}".format(self.max_seq_len)) + else: + self.max_seq_len = lengths[0, 0] + + # First create a (seq_len, feat_dim) dataframe for each sample, indexed by a single integer ("ID" of the sample) + # Then concatenate into a (num_samples * seq_len, feat_dim) dataframe, with multiple rows corresponding to the + # sample index (i.e. the same scheme as all datasets in this project) + df = pd.concat((pd.DataFrame({col: df.loc[row, col] for col in df.columns}).reset_index(drop=True).set_index( + pd.Series(lengths[row, 0]*[row])) for row in range(df.shape[0])), axis=0) + + # Replace NaN values + grp = df.groupby(by=df.index) + df = grp.transform(interpolate_missing) + + return df, labels_df + + +class PMUData(BaseData): + """ + Dataset class for Phasor Measurement Unit dataset. + Attributes: + all_df: dataframe indexed by ID, with multiple rows corresponding to the same index (sample). + Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature. + feature_df: contains the subset of columns of `all_df` which correspond to selected features + feature_names: names of columns contained in `feature_df` (same as feature_df.columns) + all_IDs: IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() ) + max_seq_len: maximum sequence (time series) length (optional). Used only if script argument `max_seq_len` is not + defined. 
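# Editor's illustration (not code from this patch) of the storage scheme produced above and used by
# every dataset class in this module: each sample is a (seq_len, feat_dim) block, and all blocks are
# stacked into one long dataframe whose repeated index is the sample ID. Values here are made up.
import pandas as pd

sample_0 = pd.DataFrame({"f1": [0.1, 0.2, 0.3], "f2": [1.0, 1.1, 1.2]}, index=[0, 0, 0])
sample_1 = pd.DataFrame({"f1": [0.5, 0.6], "f2": [2.0, 2.1]}, index=[1, 1])
all_df = pd.concat([sample_0, sample_1])   # (num_samples * seq_len, feat_dim)
one_sample = all_df.loc[1].values          # (2, 2) array holding sample ID 1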
+ """ + + def __init__(self, root_dir, file_list=None, pattern=None, n_proc=1, limit_size=None, config=None): + + self.set_num_processes(n_proc=n_proc) + + self.all_df = self.load_all(root_dir, file_list=file_list, pattern=pattern) + + if config['data_window_len'] is not None: + self.max_seq_len = config['data_window_len'] + # construct sample IDs: 0, 0, ..., 0, 1, 1, ..., 1, 2, ..., (num_whole_samples - 1) + # num_whole_samples = len(self.all_df) // self.max_seq_len # commented code is for more general IDs + # IDs = list(chain.from_iterable(map(lambda x: repeat(x, self.max_seq_len), range(num_whole_samples + 1)))) + # IDs = IDs[:len(self.all_df)] # either last sample is completely superfluous, or it has to be shortened + IDs = [i // self.max_seq_len for i in range(self.all_df.shape[0])] + self.all_df.insert(loc=0, column='ExID', value=IDs) + else: + # self.all_df = self.all_df.sort_values(by=['ExID']) # dataset is presorted + self.max_seq_len = 30 + + self.all_df = self.all_df.set_index('ExID') + # rename columns + self.all_df.columns = [re.sub(r'\d+', str(i//3), col_name) for i, col_name in enumerate(self.all_df.columns[:])] + #self.all_df.columns = ["_".join(col_name.split(" ")[:-1]) for col_name in self.all_df.columns[:]] + self.all_IDs = self.all_df.index.unique() # all sample (session) IDs + + if limit_size is not None: + if limit_size > 1: + limit_size = int(limit_size) + else: # interpret as proportion if in (0, 1] + limit_size = int(limit_size * len(self.all_IDs)) + self.all_IDs = self.all_IDs[:limit_size] + self.all_df = self.all_df.loc[self.all_IDs] + + self.feature_names = self.all_df.columns # all columns are used as features + self.feature_df = self.all_df[self.feature_names] + + def load_all(self, root_dir, file_list=None, pattern=None): + """ + Loads datasets from csv files contained in `root_dir` into a dataframe, optionally choosing from `pattern` + Args: + root_dir: directory containing all individual .csv files + file_list: optionally, provide a list of file paths within `root_dir` to consider. + Otherwise, entire `root_dir` contents will be used. 
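# Editor's illustration (not code from this patch) of the window-ID construction in PMUData.__init__:
# consecutive rows are assigned to fixed-length windows by integer division of the row position.
max_seq_len = 4
ids = [i // max_seq_len for i in range(10)]
# ids == [0, 0, 0, 0, 1, 1, 1, 1, 2, 2]; the last window may be shorter than max_seq_len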
+ pattern: optionally, apply regex string to select subset of files + Returns: + all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files + """ + + # Select paths for training and evaluation + if file_list is None: + data_paths = glob.glob(os.path.join(root_dir, '*')) # list of all paths + else: + data_paths = [os.path.join(root_dir, p) for p in file_list] + if len(data_paths) == 0: + raise Exception('No files found using: {}'.format(os.path.join(root_dir, '*'))) + + if pattern is None: + # by default evaluate on + selected_paths = data_paths + else: + selected_paths = list(filter(lambda x: re.search(pattern, x), data_paths)) + + input_paths = [p for p in selected_paths if os.path.isfile(p) and p.endswith('.csv')] + if len(input_paths) == 0: + raise Exception("No .csv files found using pattern: '{}'".format(pattern)) + + if self.n_proc > 1: + # Load in parallel + _n_proc = min(self.n_proc, len(input_paths)) # no more than file_names needed here + logger.info("Loading {} datasets files using {} parallel processes ...".format(len(input_paths), _n_proc)) + with Pool(processes=_n_proc) as pool: + all_df = pd.concat(pool.map(PMUData.load_single, input_paths)) + else: # read 1 file at a time + all_df = pd.concat(PMUData.load_single(path) for path in input_paths) + + return all_df + + @staticmethod + def load_single(filepath): + df = PMUData.read_data(filepath) + #df = PMUData.select_columns(df) + num_nan = df.isna().sum().sum() + if num_nan > 0: + logger.warning("{} nan values in {} will be replaced by 0".format(num_nan, filepath)) + df = df.fillna(0) + + return df + + @staticmethod + def read_data(filepath): + """Reads a single .csv, which typically contains a day of datasets of various weld sessions. + """ + df = pd.read_csv(filepath) + return df + + +## create a class of surveillance data +class WasteWaterData(BaseData): + """ + Dataset class for Machine dataset. + Attributes: + all_df: dataframe indexed by ID, with multiple rows corresponding to the same index (sample). + Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature. + + """ + def __init__(self, root_dir, file_list=None, pattern=None, n_proc=1, limit_size=None, + navalue = None, config=None): + + self.config = config + self.wideData =None + self.wideDataFinal = None + self.feature_df = None + self.set_num_processes(n_proc=n_proc) + self.Normalizer = None + all_df = None + + try: + # print(" ---- init 0: ", root_dir ) + all_df = self.load_all(root_dir, file_list=file_list, pattern=pattern) + + wwdata, self.demodata = self.processData(all_df) + + except Exception as e: + print(" __init__ 1:") + print(all_df.head()) + + raise + + try: + self.all_df = self.cleanData(wwdata) + self.all_df = self.all_df.set_index('days') + self.all_IDs = self.all_df.index.unique() # all sample (session) IDs + self.navalue = navalue + self.max_seq_len = 66 + if limit_size is not None: + if limit_size > 1: + limit_size = int(limit_size) + else: # interpret as proportion if in (0, 1] + limit_size = int(limit_size * len(self.all_IDs)) + self.all_IDs = self.all_IDs[:limit_size] + all_df = all_df.loc[self.all_IDs] + + except Exception as e: + print(" __init__ 2:") + print(wwdata.head()) + raise + + """ + sets variable that are features, uses regex feature match to create the list + of features that are needed to generate the sequence. 
+ + feature_df: contains the subset of columns of `all_df` which correspond to selected features + feature_names: names of columns contained in `feature_df` (same as feature_df.columns) + all_IDs: IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() ) + max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used. + (Moreover, script argument overrides this attribute) + """ + feature_names = ['geoLat','geoLong','phureg', 'sewershedPop', + 'mQ', 'mN1', 'mN2', 'mBiomarker'] + + self.feature_df = (self.all_df.loc[self.all_df.phureg.isin(['Central East','Toronto','Central West']).values,feature_names] + .reset_index().sort_values(['days','phureg','geoLat','geoLong']).set_index('days')) + self.feature_names = feature_names + + # replacing the missing with -1 in mQ, mN1, mN2, and mBiomarker + self.feature_df.loc[:,['mQ', 'mN1', 'mN2', 'mBiomarker']] = self.feature_df.loc[:,['mQ', 'mN1', 'mN2', 'mBiomarker']].fillna(np.nan, inplace=True) + logger.info("data shape is {}\n names: {}\n".format(self.feature_df.shape,self.feature_names)) + + def load_all(self, root_dir, file_list=None, pattern=None): + + # read in MECP data + if file_list is None: + data_paths = glob.glob(os.path.join(root_dir,'*')) # list of all paths + else: + data_paths = [os.path.join(root_dir, p) for p in file_list] + + if pattern is None: + # by default evaluate on + selected_paths = data_paths + else: + selected_paths = list(filter(lambda x: re.search(pattern, x), data_paths)) + + input_paths = [p for p in selected_paths if os.path.isfile(p) and p.endswith('.csv')] + print(" load_all .... ", data_paths,input_paths) + + if len(input_paths) == 0: + raise Exception(f"No .csv files found using pattern: {pattern}") + + if self.n_proc >1: + #do this....if you really have big data + logger.info(" ----------------------------YTD ---------------------\n") + all_df = pd.concat(WasteWaterData.load_single(path) for path in input_paths) + else: + logger.info(" ----------------- concatenating-----------------------------") + all_df = pd.concat(WasteWaterData.load_single(path) for path in input_paths) + + return all_df + + def set_data(self, phureg=None, vars= None, scale=True): + """ creates the wide data from the dataset for slice based on the 'phureg' and 'vars', + output is data frame which is stored in parameter 'X' with column names having the latitude + and longitude information prepended with 'its vars' e.g. var1__ + + Args: + phureg (_type_, optional): one of the regions that you want to consider e.g. GTA. Defaults to None. + vars (_type_, optional): These are the variable obtained from the self.wwdata . Defaults to None. 
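# The comment above says missing mQ/mN1/mN2/mBiomarker values are replaced with a sentinel; a minimal
# hedged sketch of that fill (editor's example; -1 as the sentinel is an assumption, and note that
# fillna(..., inplace=True) returns None, so its result should not be assigned back):
# cols = ["mQ", "mN1", "mN2", "mBiomarker"]
# self.feature_df[cols] = self.feature_df[cols].fillna(-1)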
+ normalize (_str_) : if the data has to be normalized before being transformed into wider format + + """ + if scale: + self.Normalizer = Normalizer('minmax') + wideDatalist = WasteWaterData.Transform(self.all_df, phureg=phureg, varlist=vars, + normalizer=self.Normalizer) + else : + wideDatalist = WasteWaterData.Transform(self.all_df, phureg=phureg, varlist=vars) + self.wideData = pd.concat(wideDatalist, axis=1, ignore_index=False) + self.X = self.wideData + + def append_data(self, data:pd.DataFrame=None,axis=1): + """ + This function lets one add another dataset to the already created wide dataset in wide format + using the index of days, note that indexes can be retrived on wideData + Note that the object doesn't remember any new data that was added already if you want to + add more data then you have to merge the data outside of the function and then call this function + to append it to the root "wideData". + + Args: + data (pd.DataFrame, optional): Data frame with row index same as the row index + of the wideData. Defaults to None. + + """ + self.X = pd.concat( [self.wideData, data], axis=axis,join='inner') + + def processData(self,data:pd.DataFrame=None): + """_summary_ + Args: + data (_type_, optional): _description_. Defaults to None. + Returns: + _type_: _description_ + """ + + data["date"] = data.sampleDate.astype('datetime64') + data.rename(columns={"publicHealthRegion":"phureg", "publicHealthUnit":"phu"}, inplace=True) + logger.info("data shape is {}".format(data.shape)) + + self.idxcols = ['phureg','phu','geoLat','geoLong'] + self.origcols = ['mQ', 'mN1', 'mN2', 'mN1N2', 'mBiomarker'] + self.mavars = ['mN1_7dma', 'mN2_7dma', 'mN1N2_7dma','mN1E_7dma', 'nmN1_7dma', 'nmN2_7dma','nmN1N2_7dma', + 'nmE_7dma', 'nmN1E_7dma', 'qmN1N2', 'nqmN1N2','qmE', 'qmN1E', 'nqmE', 'nqmN1E','nmE', 'nmN1E', + 'mE', 'mN1E'] + + self.othervars = ['date','sewershedPop', 'sewershedPop2020', 'siteID', 'nmN1', 'nmN2', + 'nmN1N2', 'qmN1', 'qmN2', 'qmN1N2', 'nqmN1', 'nqmN2','nqmN1N2'] + + self.demovars = ['moh_cbed','moh_cbed_7dma', 'moh_abed', 'moh_cbrd', 'moh_cbrd_7dma', + 'moh_covidHospitalization', 'moh_covidHosp_7dma', + 'moh_vaccine_atLeastTwoDoses'] + + # keep only required columns; + dat3 = data[ self.idxcols + self.origcols + self.mavars + self.othervars + self.demovars ].copy() + + dat3 = dat3[dat3.date >= datetime.strptime("01/04/2021","%d/%m/%Y")] + dat3_1 = dat3[self.idxcols + self.demovars].copy() + dat3 = dat3[ self.idxcols + self.othervars + self.origcols ].copy() + + return dat3,dat3_1 + + @staticmethod + def load_single(filepath): + print(f'Load_single: {filepath}\n') + df = pd.read_csv(filepath, low_memory=False) + # df = WasteWaterDataClass.select_columns(df) + num_nan = df.isna().sum().sum() + if num_nan > 0: + logger.warning("{} nan values in {} will be replaced with 0".format(num_nan,filepath)) + + return df + + @staticmethod + def Transform(df, phureg = 'gta_region', varlist=["nmN1"], index=None, + normalizer:Normalizer = None ): + """Normalization routine for the data we have and it can be done by phu-region and variable list + + Args: + df (_type_): _description_ + phureg (str, optional): _description_. Defaults to 'gta_region'. + varlist (list, optional): List of variables to used for normalization. Defaults to ["nmN1"]. + index (_type_, optional): _description_. Defaults to None. + normalizer (Normalizer, optional): _description_. Defaults to None. 
+ + Returns: + _type_: _description_ + """ + if index is not None : + df = df.set_index(index) + + if not isinstance(phureg,list): + phureg = [phureg] + + df = df[df["phureg"].isin(phureg)].copy() + + #Extract columns to build model + df = df[["geoLat", "geoLong"]+ varlist] + + if normalizer is not None: + # print(f'\t Normalizing using {normalizer.norm_type}...\n') + df[varlist] = normalizer.normalize(df[varlist].copy()) + + #Group the data into a multivariate time series grouped by geoLat and geoLong + df_var = [] + for i,vars in enumerate(varlist): + df_var.append(df.groupby(["geoLat", "geoLong"])[vars].apply(list).reset_index(drop=False)) + + #Extract geoLat and geoLong to make labels for time series, then drop the columns + names = [ str(a[0]) + '_' + str(a[1]) for a in list(zip(df_var[0]["geoLat"], df_var[0]["geoLong"])) ] + + # replace the colum names prepended with the latitude and longitude. + for i in range(len(df_var)): + df_var[i].drop(columns=["geoLat", "geoLong"], inplace=True) + + #List -> columns format + for i, vars in enumerate(varlist): + df_var[i] = pd.DataFrame(df_var[i][vars].tolist(), index=df_var[i].index).transpose() + df_var[i].columns = [ vars + '_' + name for name in names] + + return df_var + + def cleanData(self, df:pd.DataFrame): + + t1 = df[self.origcols].isna().apply(sum,axis=1) + df3 = df[t1 <=30].sort_values(self.idxcols +['date']) + + df3['days'] = (df3.date - pd.to_datetime('2021-04-01')).dt.days + + global_max = max(df3.days) + global_min = min(df3.days) + # function to check contiguousness of the days if not add missing line in between + def checknadd_missingdays(XX :pd.DataFrame = None ): + assert XX.days.unique().shape[0] == XX.shape[0], "====Days are not unique here====" + daysdf = pd.DataFrame({'days': range(global_min,global_max)}).reset_index(drop=True) + XX = daysdf.merge(XX, how = 'left', on='days') + return XX + + logger.info('adding missing dates in the data') + df4 = (df3.groupby(self.idxcols) + .apply(lambda x: checknadd_missingdays(x.drop(columns=self.idxcols))) + .reset_index().drop(columns=['date','level_4']) + ) + + # impute the sewershedpop if missing from 2020 after backfill and forward fill grouping by + # region/PHU latitude and longitude. + logger.info("imputing the seweshed population") + df4['sewershedPop'] = (df4.groupby(self.idxcols)['sewershedPop'] + .apply(lambda x: x.ffill().bfill()) + .reset_index(drop=True) + ).values + df4['sewershedPop2020'] = (df4.groupby(self.idxcols)['sewershedPop2020'] + .apply(lambda x: x.ffill().bfill()) + .reset_index(drop=True) + ).values + df4.loc[df4.sewershedPop.isna(),'sewershedPop'] = df4.loc[df4.sewershedPop.isna(),'sewershedPop2020'] + + # dropping lat long where there less than 10% non-null. 
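# Editor's illustration (not code from this patch) of what checknadd_missingdays above does for one
# monitoring site: merge onto a contiguous range of day numbers so gaps become explicit NaN rows.
import pandas as pd

site = pd.DataFrame({"days": [0, 1, 4], "mN1": [0.2, 0.3, 0.1]})
full_days = pd.DataFrame({"days": range(0, 6)})
site_full = full_days.merge(site, how="left", on="days")   # days 2, 3 and 5 appear with NaN mN1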
+ print("Dropping Latitude and Longitude with less than 10% non-missing in all columns \n\ + of the original values....") + _X = (df4[self.idxcols + self.origcols ].groupby(['geoLat','geoLong']) + .apply(lambda x : x.notnull().mean()).drop(columns=self.idxcols) ) + drop_rows = (_X<.1).all(axis=1) + drop_lat_long = _X[drop_rows].reset_index().iloc[:,:2] + + + logger.info('dropping following columns :\n\t {} ...'.format(drop_lat_long)) + df4 = df4[~df4.geoLat.isin(drop_lat_long.geoLat) & ~df4.geoLong.isin(drop_lat_long.geoLong)] + + # print( '--------------restricting length of latitude/Longitude ----------------' ) + df4.geoLat = df4.geoLat.round(4) + df4.geoLong = df4.geoLong.round(4) + logger.info( '------------ done ...-------------') + return df4 + + @staticmethod + def mytsplot(df:pd.DataFrame = None, labels = None, xlabel=None, color='rgmbckyte', ylog=False, + title="", ylim=None): + if labels is None : + labels = df.columns.values + n = df.shape[1] + maxvalue = df.max().values.max() + ymin = max(df.min().values.min(), 8e-6) + fig = plt.figure(figsize=(20,12),dpi=120) + ax = fig.add_subplot() + ax.plot(df, linewidth=1.5 ) + hdl = ax.get_legend_handles_labels() + if ylim is not None: + ax.set_ylim(ymin=ylim[0],ymax=ylim[1]) + else: + ax.set_ylim(ymin=ymin, ymax=maxvalue) + if ylog: + ax.set_yscale('log') + fig.legend(hdl,labels=labels,bbox_to_anchor=[1.2,0.6], loc='center right') + ax.set_title(title) + + +data_factory = {'weld': WeldData, + 'tsra': TSRegressionArchive, + 'pmu': PMUData, + 'wastewater': WasteWaterData} diff --git a/datasets/dataset.py b/datasets/dataset.py new file mode 100644 index 0000000..d84a736 --- /dev/null +++ b/datasets/dataset.py @@ -0,0 +1,324 @@ +import numpy as np +from torch.utils.data import Dataset +import torch +import logging + +logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) + + +class ImputationDataset(Dataset): + """Dynamically computes missingness (noise) mask for each sample""" + + def __init__(self, data, indices, mean_mask_length=3, masking_ratio=0.15, + mode='separate', distribution='geometric', exclude_feats=None): + super(ImputationDataset, self).__init__() + + self.data = data # this is a subclass of the BaseData class in data.py + self.IDs = indices # list of data IDs, but also mapping between integer index and ID + self.feature_df = self.data.feature_df.loc[self.IDs] + + logger.info(" Creating ImputatDataset w/data \n{} ".format(self.feature_df.head())) + + self.masking_ratio = masking_ratio + self.mean_mask_length = mean_mask_length + self.mode = mode + self.distribution = distribution + self.exclude_feats = exclude_feats + + def __getitem__(self, ind): + """ + For a given integer index, returns the corresponding (seq_length, feat_dim) array and a noise mask of same shape + Args: + ind: integer index of sample in dataset + Returns: + X: (seq_length, feat_dim) tensor of the multivariate time series corresponding to a sample + mask: (seq_length, feat_dim) boolean tensor: 0s mask and predict, 1s: unaffected input + ID: ID of sample + """ + + X = self.feature_df.loc[self.IDs[ind]].values # (seq_length, feat_dim) array + mask = noise_mask(X, self.masking_ratio, self.mean_mask_length, self.mode, + self.distribution, self.exclude_feats) # (seq_length, feat_dim) boolean array + + + + data_tr = torch.from_numpy(X.astype(float)) + mask_tr = torch.from_numpy(mask) + # logger.info("__getitem__ \n X type {}\n Mask type {}".format( + # np.array(data_tr)[:5,:] 
,mask_tr.dtype)) + + return data_tr, mask_tr, self.IDs[ind] + + def update(self): + self.mean_mask_length = min(20, self.mean_mask_length + 1) + self.masking_ratio = min(1, self.masking_ratio + 0.05) + + def __len__(self): + return len(self.IDs) + + +class TransductionDataset(Dataset): + + def __init__(self, data, indices, mask_feats, start_hint=0.0, end_hint=0.0): + super(TransductionDataset, self).__init__() + + self.data = data # this is a subclass of the BaseData class in data.py + self.IDs = indices # list of data IDs, but also mapping between integer index and ID + self.feature_df = self.data.feature_df.loc[self.IDs] + + self.mask_feats = mask_feats # list/array of indices corresponding to features to be masked + self.start_hint = start_hint # proportion at beginning of time series which will not be masked + self.end_hint = end_hint # end_hint: proportion at the end of time series which will not be masked + + def __getitem__(self, ind): + """ + For a given integer index, returns the corresponding (seq_length, feat_dim) array and a noise mask of same shape + Args: + ind: integer index of sample in dataset + Returns: + X: (seq_length, feat_dim) tensor of the multivariate time series corresponding to a sample + mask: (seq_length, feat_dim) boolean tensor: 0s mask and predict, 1s: unaffected input + ID: ID of sample + """ + + X = self.feature_df.loc[self.IDs[ind]].values # (seq_length, feat_dim) array + mask = transduct_mask(X, self.mask_feats, self.start_hint, + self.end_hint) # (seq_length, feat_dim) boolean array + + return torch.from_numpy(X), torch.from_numpy(mask), self.IDs[ind] + + def update(self): + self.start_hint = max(0, self.start_hint - 0.1) + self.end_hint = max(0, self.end_hint - 0.1) + + def __len__(self): + return len(self.IDs) + + +def collate_superv(data, max_len=None): + """Build mini-batch tensors from a list of (X, mask) tuples. Mask input. Create + Args: + data: len(batch_size) list of tuples (X, y). + - X: torch tensor of shape (seq_length, feat_dim); variable seq_length. + - y: torch tensor of shape (num_labels,) : class indices or numerical targets + (for classification or regression, respectively). num_labels > 1 for multi-task models + max_len: global fixed sequence length. Used for architectures requiring fixed length input, + where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are padded with 0s + Returns: + X: (batch_size, padded_length, feat_dim) torch tensor of masked features (input) + targets: (batch_size, padded_length, feat_dim) torch tensor of unmasked features (output) + target_masks: (batch_size, padded_length, feat_dim) boolean torch tensor + 0 indicates masked values to be predicted, 1 indicates unaffected/"active" feature values + padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position, 0 means padding + """ + + batch_size = len(data) + features, labels, IDs = zip(*data) + + # Stack and pad features and masks (convert 2D to 3D tensors, i.e. 
add batch dimension) + lengths = [X.shape[0] for X in features] # original sequence length for each time series + if max_len is None: + max_len = max(lengths) + X = torch.zeros(batch_size, max_len, features[0].shape[-1]) # (batch_size, padded_length, feat_dim) + for i in range(batch_size): + end = min(lengths[i], max_len) + X[i, :end, :] = features[i][:end, :] + + targets = torch.stack(labels, dim=0) # (batch_size, num_labels) + + padding_masks = padding_mask(torch.tensor(lengths, dtype=torch.int16), + max_len=max_len) # (batch_size, padded_length) boolean tensor, "1" means keep + + return X, targets, padding_masks, IDs + + +class ClassiregressionDataset(Dataset): + + def __init__(self, data, indices): + super(ClassiregressionDataset, self).__init__() + + self.data = data # this is a subclass of the BaseData class in data.py + self.IDs = indices # list of data IDs, but also mapping between integer index and ID + self.feature_df = self.data.feature_df.loc[self.IDs] + + self.labels_df = self.data.labels_df.loc[self.IDs] + + def __getitem__(self, ind): + """ + For a given integer index, returns the corresponding (seq_length, feat_dim) array and a noise mask of same shape + Args: + ind: integer index of sample in dataset + Returns: + X: (seq_length, feat_dim) tensor of the multivariate time series corresponding to a sample + y: (num_labels,) tensor of labels (num_labels > 1 for multi-task models) for each sample + ID: ID of sample + """ + + X = self.feature_df.loc[self.IDs[ind]].values # (seq_length, feat_dim) array + y = self.labels_df.loc[self.IDs[ind]].values # (num_labels,) array + + return torch.from_numpy(X), torch.from_numpy(y), self.IDs[ind] + + def __len__(self): + return len(self.IDs) + + +def transduct_mask(X, mask_feats, start_hint=0.0, end_hint=0.0): + """ + Creates a boolean mask of the same shape as X, with 0s at places where a feature should be masked. + Args: + X: (seq_length, feat_dim) numpy array of features corresponding to a single sample + mask_feats: list/array of indices corresponding to features to be masked + start_hint: + end_hint: proportion at the end of time series which will not be masked + + Returns: + boolean numpy array with the same shape as X, with 0s at places where a feature should be masked + """ + + mask = np.ones(X.shape, dtype=bool) + start_ind = int(start_hint * X.shape[0]) + end_ind = max(start_ind, int((1 - end_hint) * X.shape[0])) + mask[start_ind:end_ind, mask_feats] = 0 + + return mask + + +def compensate_masking(X, mask): + """ + Compensate feature vectors after masking values, in a way that the matrix product W @ X would not be affected on average. + If p is the proportion of unmasked (active) elements, X' = X / p = X * feat_dim/num_active + Args: + X: (batch_size, seq_length, feat_dim) torch tensor + mask: (batch_size, seq_length, feat_dim) torch tensor: 0s means mask and predict, 1s: unaffected (active) input + Returns: + (batch_size, seq_length, feat_dim) compensated features + """ + + # number of unmasked elements of feature vector for each time step + num_active = torch.sum(mask, dim=-1).unsqueeze(-1) # (batch_size, seq_length, 1) + # to avoid division by 0, set the minimum to 1 + num_active = torch.max(num_active, torch.ones(num_active.shape, dtype=torch.int16)) # (batch_size, seq_length, 1) + return X.shape[-1] * X / num_active + + +def collate_unsuperv(data, max_len=None, mask_compensation=False): + """Build mini-batch tensors from a list of (X, mask) tuples. Mask input. Create + Args: + data: len(batch_size) list of tuples (X, mask). 
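# Editor's wiring sketch (not code from this patch): how these datasets and collate functions are
# typically combined with a torch DataLoader. `my_data` stands for any BaseData subclass instance
# and `train_indices` for a list of its sample IDs; both are assumptions.
# from functools import partial
# from torch.utils.data import DataLoader
#
# train_set = ImputationDataset(my_data, train_indices, masking_ratio=0.15, mean_mask_length=3)
# loader = DataLoader(train_set, batch_size=32, shuffle=True,
#                     collate_fn=partial(collate_unsuperv, max_len=my_data.max_seq_len))
# X, targets, target_masks, padding_masks, IDs = next(iter(loader))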
+ - X: torch tensor of shape (seq_length, feat_dim); variable seq_length. + - mask: boolean torch tensor of shape (seq_length, feat_dim); variable seq_length. + max_len: global fixed sequence length. Used for architectures requiring fixed length input, + where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are padded with 0s + Returns: + X: (batch_size, padded_length, feat_dim) torch tensor of masked features (input) + targets: (batch_size, padded_length, feat_dim) torch tensor of unmasked features (output) + target_masks: (batch_size, padded_length, feat_dim) boolean torch tensor + 0 indicates masked values to be predicted, 1 indicates unaffected/"active" feature values + padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position, 0 ignore (padding) + """ + + batch_size = len(data) + features, masks, IDs = zip(*data) + + # Stack and pad features and masks (convert 2D to 3D tensors, i.e. add batch dimension) + lengths = [X.shape[0] for X in features] # original sequence length for each time series + if max_len is None: + max_len = max(lengths) + X = torch.zeros(batch_size, max_len, features[0].shape[-1]) # (batch_size, padded_length, feat_dim) + target_masks = torch.zeros_like(X, + dtype=torch.bool) # (batch_size, padded_length, feat_dim) masks related to objective + for i in range(batch_size): + end = min(lengths[i], max_len) + X[i, :end, :] = features[i][:end, :] + target_masks[i, :end, :] = masks[i][:end, :] + + targets = X.clone() + X = X * target_masks # mask input + if mask_compensation: + X = compensate_masking(X, target_masks) + + padding_masks = padding_mask(torch.tensor(lengths, dtype=torch.int16), max_len=max_len) # (batch_size, padded_length) boolean tensor, "1" means keep + target_masks = ~target_masks # inverse logic: 0 now means ignore, 1 means predict + return X, targets, target_masks, padding_masks, IDs + + +def noise_mask(X, masking_ratio, lm=3, mode='separate', distribution='geometric', exclude_feats=None): + """ + Creates a random boolean mask of the same shape as X, with 0s at places where a feature should be masked. + Args: + X: (seq_length, feat_dim) numpy array of features corresponding to a single sample + masking_ratio: proportion of seq_length to be masked. At each time step, will also be the proportion of + feat_dim that will be masked on average + lm: average length of masking subsequences (streaks of 0s). Used only when `distribution` is 'geometric'. + mode: whether each variable should be masked separately ('separate'), or all variables at a certain positions + should be masked concurrently ('concurrent') + distribution: whether each mask sequence element is sampled independently at random, or whether + sampling follows a markov chain (and thus is stateful), resulting in geometric distributions of + masked squences of a desired mean length `lm` + exclude_feats: iterable of indices corresponding to features to be excluded from masking (i.e. 
to remain all 1s) + + Returns: + boolean numpy array with the same shape as X, with 0s at places where a feature should be masked + """ + if exclude_feats is not None: + exclude_feats = set(exclude_feats) + + if distribution == 'geometric': # stateful (Markov chain) + if mode == 'separate': # each variable (feature) is independent + mask = np.ones(X.shape, dtype=bool) + for m in range(X.shape[1]): # feature dimension + if exclude_feats is None or m not in exclude_feats: + mask[:, m] = geom_noise_mask_single(X.shape[0], lm, masking_ratio) # time dimension + else: # replicate across feature dimension (mask all variables at the same positions concurrently) + mask = np.tile(np.expand_dims(geom_noise_mask_single(X.shape[0], lm, masking_ratio), 1), X.shape[1]) + else: # each position is independent Bernoulli with p = 1 - masking_ratio + if mode == 'separate': + mask = np.random.choice(np.array([True, False]), size=X.shape, replace=True, + p=(1 - masking_ratio, masking_ratio)) + else: + mask = np.tile(np.random.choice(np.array([True, False]), size=(X.shape[0], 1), replace=True, + p=(1 - masking_ratio, masking_ratio)), X.shape[1]) + + return mask + + +def geom_noise_mask_single(L, lm, masking_ratio): + """ + Randomly create a boolean mask of length `L`, consisting of subsequences of average length lm, masking with 0s a `masking_ratio` + proportion of the sequence L. The length of masking subsequences and intervals follow a geometric distribution. + Args: + L: length of mask and sequence to be masked + lm: average length of masking subsequences (streaks of 0s) + masking_ratio: proportion of L to be masked + + Returns: + (L,) boolean numpy array intended to mask ('drop') with 0s a sequence of length L + """ + keep_mask = np.ones(L, dtype=bool) + p_m = 1 / lm # probability of each masking sequence stopping. parameter of geometric distribution. + p_u = p_m * masking_ratio / (1 - masking_ratio) # probability of each unmasked sequence stopping. parameter of geometric distribution. + p = [p_m, p_u] + + # Start in state 0 with masking_ratio probability + state = int(np.random.rand() > masking_ratio) # state 0 means masking, 1 means not masking + for i in range(L): + keep_mask[i] = state # here it happens that state and masking value corresponding to state are identical + if np.random.rand() < p[state]: + state = 1 - state + + return keep_mask + + +def padding_mask(lengths, max_len=None): + """ + Used to mask padded positions: creates a (batch_size, max_len) boolean mask from a tensor of sequence lengths, + where 1 means keep element at this position (time step) + """ + batch_size = lengths.numel() + max_len = max_len or lengths.max_val() # trick works because of overloading of 'or' operator for non-boolean types + return (torch.arange(0, max_len, device=lengths.device) + .type_as(lengths) + .repeat(batch_size, 1) + .lt(lengths.unsqueeze(1))) diff --git a/datasets/datasplit.py b/datasets/datasplit.py new file mode 100644 index 0000000..5041fbd --- /dev/null +++ b/datasets/datasplit.py @@ -0,0 +1,189 @@ +import numpy as np +from sklearn import model_selection + + +def split_dataset(data_indices, validation_method, n_splits, validation_ratio, test_set_ratio=0, + test_indices=None, + random_seed=1337, labels=None): + """ + Splits dataset (i.e. the global datasets indices) into a test set and a training/validation set. + The training/validation set is used to produce `n_splits` different configurations/splits of indices. 
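# Editor's demo (not code from this patch) for the masking utilities defined above in
# datasets/dataset.py: the geometric mask keeps roughly (1 - masking_ratio) of the positions,
# in contiguous runs of average length lm.
import numpy as np
from datasets.dataset import geom_noise_mask_single

np.random.seed(0)
mask = geom_noise_mask_single(L=1000, lm=3, masking_ratio=0.15)
print(mask.mean())   # fraction of kept positions, close to 0.85 on average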
+ + Returns: + test_indices: numpy array containing the global datasets indices corresponding to the test set + (empty if test_set_ratio is 0 or None) + train_indices: iterable of `n_splits` (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's training set + val_indices: iterable of `n_splits` (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's validation set + """ + + # Set aside test set, if explicitly defined + if test_indices is not None: + data_indices = np.array([ind for ind in data_indices if ind not in set(test_indices)]) # to keep initial order + + datasplitter = DataSplitter.factory(validation_method, data_indices, labels) # DataSplitter object + + # Set aside a random partition of all data as a test set + if test_indices is None: + if test_set_ratio: # only if test set not explicitly defined + datasplitter.split_testset(test_ratio=test_set_ratio, random_state=random_seed) + test_indices = datasplitter.test_indices + else: + test_indices = [] + # Split train / validation sets + datasplitter.split_validation(n_splits, validation_ratio, random_state=random_seed) + + return datasplitter.train_indices, datasplitter.val_indices, test_indices + + +class DataSplitter(object): + """Factory class, constructing subclasses based on feature type""" + + def __init__(self, data_indices, data_labels=None): + """data_indices = train_val_indices | test_indices""" + + self.data_indices = data_indices # global datasets indices + self.data_labels = data_labels # global raw datasets labels + self.train_val_indices = np.copy(self.data_indices) # global non-test indices (training and validation) + self.test_indices = [] # global test indices + + if data_labels is not None: + self.train_val_labels = np.copy( + self.data_labels) # global non-test labels (includes training and validation) + self.test_labels = [] # global test labels # TODO: maybe not needed + + @staticmethod + def factory(split_type, *args, **kwargs): + if split_type == "StratifiedShuffleSplit": + return StratifiedShuffleSplitter(*args, **kwargs) + if split_type == "ShuffleSplit": + return ShuffleSplitter(*args, **kwargs) + else: + raise ValueError("DataSplitter for '{}' does not exist".format(split_type)) + + def split_testset(self, test_ratio, random_state=1337): + """ + Input: + test_ratio: ratio of test set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + test_indices: numpy array containing the global datasets indices corresponding to the test set + test_labels: numpy array containing the labels corresponding to the test set + """ + + raise NotImplementedError("Please override function in child class") + + def split_validation(self): + """ + Returns: + train_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's training set + val_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's validation set + """ + + raise NotImplementedError("Please override function in child class") + + +class StratifiedShuffleSplitter(DataSplitter): + """ + Returns randomized shuffled folds, which preserve the class proportions of samples in each fold. 
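# Editor's usage sketch (not code from this patch): splitting 100 sample IDs with the helper above.
# 'ShuffleSplit' needs no labels; for 'StratifiedShuffleSplit' a `labels` array must also be passed.
import numpy as np
from datasets.datasplit import split_dataset

ids = np.arange(100)
train_folds, val_folds, test_ids = split_dataset(ids, validation_method="ShuffleSplit", n_splits=3,
                                                 validation_ratio=0.2, test_set_ratio=0.1)
# train_folds[0] and val_folds[0] hold the global IDs of the first fold; test_ids is held out once.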
Differs from k-fold + in that not all samples are evaluated, and samples may be shared across validation sets, + which becomes more probable proportionally to validation_ratio/n_splits. + """ + + def split_testset(self, test_ratio, random_state=1337): + """ + Input: + test_ratio: ratio of test set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + test_indices: numpy array containing the global datasets indices corresponding to the test set + test_labels: numpy array containing the labels corresponding to the test set + """ + + splitter = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=random_state) + # get local indices, i.e. indices in [0, len(data_labels)) + train_val_indices, test_indices = next(splitter.split(X=np.zeros(len(self.data_indices)), y=self.data_labels)) + # return global datasets indices and labels + self.train_val_indices, self.train_val_labels = self.data_indices[train_val_indices], self.data_labels[train_val_indices] + self.test_indices, self.test_labels = self.data_indices[test_indices], self.data_labels[test_indices] + + return + + def split_validation(self, n_splits, validation_ratio, random_state=1337): + """ + Input: + n_splits: number of different, randomized and independent from one-another folds + validation_ratio: ratio of validation set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + train_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's training set + val_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's validation set + """ + + splitter = model_selection.StratifiedShuffleSplit(n_splits=n_splits, test_size=validation_ratio, + random_state=random_state) + # get local indices, i.e. indices in [0, len(train_val_labels)), per fold + train_indices, val_indices = zip(*splitter.split(X=np.zeros(len(self.train_val_labels)), y=self.train_val_labels)) + # return global datasets indices per fold + self.train_indices = [self.train_val_indices[fold_indices] for fold_indices in train_indices] + self.val_indices = [self.train_val_indices[fold_indices] for fold_indices in val_indices] + + return + + +class ShuffleSplitter(DataSplitter): + """ + Returns randomized shuffled folds without requiring or taking into account the sample labels. Differs from k-fold + in that not all samples are evaluated, and samples may be shared across validation sets, + which becomes more probable proportionally to validation_ratio/n_splits. + """ + + def split_testset(self, test_ratio, random_state=1337): + """ + Input: + test_ratio: ratio of test set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + test_indices: numpy array containing the global datasets indices corresponding to the test set + test_labels: numpy array containing the labels corresponding to the test set + """ + + splitter = model_selection.ShuffleSplit(n_splits=1, test_size=test_ratio, random_state=random_state) + # get local indices, i.e. 
indices in [0, len(data_indices)) + train_val_indices, test_indices = next(splitter.split(X=np.zeros(len(self.data_indices)))) + # return global datasets indices and labels + self.train_val_indices = self.data_indices[train_val_indices] + self.test_indices = self.data_indices[test_indices] + if self.data_labels is not None: + self.train_val_labels = self.data_labels[train_val_indices] + self.test_labels = self.data_labels[test_indices] + + return + + def split_validation(self, n_splits, validation_ratio, random_state=1337): + """ + Input: + n_splits: number of different, randomized and independent from one-another folds + validation_ratio: ratio of validation set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + train_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's training set + val_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's validation set + """ + + splitter = model_selection.ShuffleSplit(n_splits=n_splits, test_size=validation_ratio, + random_state=random_state) + # get local indices, i.e. indices in [0, len(train_val_labels)), per fold + train_indices, val_indices = zip(*splitter.split(X=np.zeros(len(self.train_val_indices)))) + # return global datasets indices per fold + self.train_indices = [self.train_val_indices[fold_indices] for fold_indices in train_indices] + self.val_indices = [self.train_val_indices[fold_indices] for fold_indices in val_indices] + + return diff --git a/datasets/utils.py b/datasets/utils.py new file mode 100644 index 0000000..0db0d35 --- /dev/null +++ b/datasets/utils.py @@ -0,0 +1,605 @@ +""" +Code to load Time Series Regression datasets. From: +https://github.com/ChangWeiTan/TSRegression/blob/master/utils +""" + +import numpy as np +import pandas as pd +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from tqdm import tqdm + +regression_datasets = ["AustraliaRainfall", + "HouseholdPowerConsumption1", + "HouseholdPowerConsumption2", + "BeijingPM25Quality", + "BeijingPM10Quality", + "Covid3Month", + "LiveFuelMoistureContent", + "FloodModeling1", + "FloodModeling2", + "FloodModeling3", + "AppliancesEnergy", + "BenzeneConcentration", + "NewsHeadlineSentiment", + "NewsTitleSentiment", + "BIDMC32RR", + "BIDMC32HR", + "BIDMC32SpO2", + "IEEEPPG", + "PPGDalia"] + + +def uniform_scaling(data, max_len): + """ + This is a function to scale the time series uniformly + :param data: + :param max_len: + :return: + """ + seq_len = len(data) + scaled_data = [data[int(j * seq_len / max_len)] for j in range(max_len)] + + return scaled_data + + +# The following code is adapted from the python package sktime to read .ts file. +class TsFileParseException(Exception): + """ + Should be raised when parsing a .ts file and the format is incorrect. + """ + pass + + +def load_from_tsfile_to_dataframe(full_file_path_and_name, return_separate_X_and_y=True, + replace_missing_vals_with='NaN'): + """Loads data from a .ts file into a Pandas DataFrame. + + Parameters + ---------- + full_file_path_and_name: str + The full pathname of the .ts file to read. + return_separate_X_and_y: bool + true if X and Y values should be returned as separate Data Frames (X) and a numpy array (y), false otherwise. 
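# Editor's usage sketch (not code from this patch) for the loader defined here; the file path is
# hypothetical.
# from datasets.utils import load_from_tsfile_to_dataframe
# X, y = load_from_tsfile_to_dataframe("path/to/AppliancesEnergy_TRAIN.ts",
#                                      return_separate_X_and_y=True,
#                                      replace_missing_vals_with="NaN")
# X.iloc[0, 0]   # a pd.Series holding dimension 0 of sample 0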
+ This is only relevant for data that + replace_missing_vals_with: str + The value that missing values in the text file should be replaced with prior to parsing. + + Returns + ------- + DataFrame, ndarray + If return_separate_X_and_y then a tuple containing a DataFrame and a numpy array containing the relevant time-series and corresponding class values. + DataFrame + If not return_separate_X_and_y then a single DataFrame containing all time-series and (if relevant) a column "class_vals" the associated class values. + """ + + # Initialize flags and variables used when parsing the file + metadata_started = False + data_started = False + + has_problem_name_tag = False + has_timestamps_tag = False + has_univariate_tag = False + has_class_labels_tag = False + has_target_labels_tag = False + has_data_tag = False + + previous_timestamp_was_float = None + previous_timestamp_was_int = None + previous_timestamp_was_timestamp = None + num_dimensions = None + is_first_case = True + instance_list = [] + class_val_list = [] + line_num = 0 + target_labels = False + + # Parse the file + # print(full_file_path_and_name) + with open(full_file_path_and_name, 'r', encoding='utf-8') as file: + for line in tqdm(file): + # print(".", end='') + # Strip white space from start/end of line and change to lowercase for use below + line = line.strip().lower() + # Empty lines are valid at any point in a file + if line: + # Check if this line contains metadata + # Please note that even though metadata is stored in this function it is not currently published externally + if line.startswith("@problemname"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + + if token_len == 1: + raise TsFileParseException("problemname tag requires an associated value") + + problem_name = line[len("@problemname") + 1:] + has_problem_name_tag = True + metadata_started = True + elif line.startswith("@timestamps"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + + if token_len != 2: + raise TsFileParseException("timestamps tag requires an associated Boolean value") + elif tokens[1] == "true": + timestamps = True + elif tokens[1] == "false": + timestamps = False + else: + raise TsFileParseException("invalid timestamps value") + has_timestamps_tag = True + metadata_started = True + elif line.startswith("@univariate"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + if token_len != 2: + raise TsFileParseException("univariate tag requires an associated Boolean value") + elif tokens[1] == "true": + univariate = True + elif tokens[1] == "false": + univariate = False + else: + raise TsFileParseException("invalid univariate value") + + has_univariate_tag = True + metadata_started = True + elif line.startswith("@classlabel"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + + if token_len == 1: + raise TsFileParseException("classlabel tag requires an associated 
Boolean value") + + if tokens[1] == "true": + class_labels = True + elif tokens[1] == "false": + class_labels = False + else: + raise TsFileParseException("invalid classLabel value") + + # Check if we have any associated class values + if token_len == 2 and class_labels: + raise TsFileParseException("if the classlabel tag is true then class values must be supplied") + + has_class_labels_tag = True + class_label_list = [token.strip() for token in tokens[2:]] + metadata_started = True + elif line.startswith("@targetlabel"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + + if token_len == 1: + raise TsFileParseException("targetlabel tag requires an associated Boolean value") + + if tokens[1] == "true": + target_labels = True + elif tokens[1] == "false": + target_labels = False + else: + raise TsFileParseException("invalid targetLabel value") + + has_target_labels_tag = True + class_val_list = [] + metadata_started = True + # Check if this line contains the start of data + elif line.startswith("@data"): + if line != "@data": + raise TsFileParseException("data tag should not have an associated value") + + if data_started and not metadata_started: + raise TsFileParseException("metadata must come before data") + else: + has_data_tag = True + data_started = True + # If the 'data tag has been found then metadata has been parsed and data can be loaded + elif data_started: + # Check that a full set of metadata has been provided + incomplete_regression_meta_data = not has_problem_name_tag or not has_timestamps_tag or not has_univariate_tag or not has_target_labels_tag or not has_data_tag + incomplete_classification_meta_data = not has_problem_name_tag or not has_timestamps_tag or not has_univariate_tag or not has_class_labels_tag or not has_data_tag + if incomplete_regression_meta_data and incomplete_classification_meta_data: + raise TsFileParseException("a full set of metadata has not been provided before the data") + + # Replace any missing values with the value specified + line = line.replace("?", replace_missing_vals_with) + + # Check if we dealing with data that has timestamps + if timestamps: + # We're dealing with timestamps so cannot just split line on ':' as timestamps may contain one + has_another_value = False + has_another_dimension = False + + timestamps_for_dimension = [] + values_for_dimension = [] + + this_line_num_dimensions = 0 + line_len = len(line) + char_num = 0 + + while char_num < line_len: + # Move through any spaces + while char_num < line_len and str.isspace(line[char_num]): + char_num += 1 + + # See if there is any more data to read in or if we should validate that read thus far + + if char_num < line_len: + + # See if we have an empty dimension (i.e. 
no values) + if line[char_num] == ":": + if len(instance_list) < (this_line_num_dimensions + 1): + instance_list.append([]) + + instance_list[this_line_num_dimensions].append(pd.Series()) + this_line_num_dimensions += 1 + + has_another_value = False + has_another_dimension = True + + timestamps_for_dimension = [] + values_for_dimension = [] + + char_num += 1 + else: + # Check if we have reached a class label + if line[char_num] != "(" and target_labels: + class_val = line[char_num:].strip() + + # if class_val not in class_val_list: + # raise TsFileParseException( + # "the class value '" + class_val + "' on line " + str( + # line_num + 1) + " is not valid") + + class_val_list.append(float(class_val)) + char_num = line_len + + has_another_value = False + has_another_dimension = False + + timestamps_for_dimension = [] + values_for_dimension = [] + + else: + + # Read in the data contained within the next tuple + + if line[char_num] != "(" and not target_labels: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " does not start with a '('") + + char_num += 1 + tuple_data = "" + + while char_num < line_len and line[char_num] != ")": + tuple_data += line[char_num] + char_num += 1 + + if char_num >= line_len or line[char_num] != ")": + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " does not end with a ')'") + + # Read in any spaces immediately after the current tuple + + char_num += 1 + + while char_num < line_len and str.isspace(line[char_num]): + char_num += 1 + + # Check if there is another value or dimension to process after this tuple + + if char_num >= line_len: + has_another_value = False + has_another_dimension = False + + elif line[char_num] == ",": + has_another_value = True + has_another_dimension = False + + elif line[char_num] == ":": + has_another_value = False + has_another_dimension = True + + char_num += 1 + + # Get the numeric value for the tuple by reading from the end of the tuple data backwards to the last comma + + last_comma_index = tuple_data.rfind(',') + + if last_comma_index == -1: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains a tuple that has no comma inside of it") + + try: + value = tuple_data[last_comma_index + 1:] + value = float(value) + + except ValueError: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains a tuple that does not have a valid numeric value") + + # Check the type of timestamp that we have + + timestamp = tuple_data[0: last_comma_index] + + try: + timestamp = int(timestamp) + timestamp_is_int = True + timestamp_is_timestamp = False + except ValueError: + timestamp_is_int = False + + if not timestamp_is_int: + try: + timestamp = float(timestamp) + timestamp_is_float = True + timestamp_is_timestamp = False + except ValueError: + timestamp_is_float = False + + if not timestamp_is_int and not timestamp_is_float: + try: + timestamp = timestamp.strip() + timestamp_is_timestamp = True + except ValueError: + timestamp_is_timestamp = False + + # Make sure that the timestamps in the file (not just this dimension or case) are consistent + + if not timestamp_is_timestamp and not timestamp_is_int and not timestamp_is_float: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains a tuple 
that has an invalid timestamp '" + timestamp + "'") + + if previous_timestamp_was_float is not None and previous_timestamp_was_float and not timestamp_is_float: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains tuples where the timestamp format is inconsistent") + + if previous_timestamp_was_int is not None and previous_timestamp_was_int and not timestamp_is_int: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains tuples where the timestamp format is inconsistent") + + if previous_timestamp_was_timestamp is not None and previous_timestamp_was_timestamp and not timestamp_is_timestamp: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains tuples where the timestamp format is inconsistent") + + # Store the values + + timestamps_for_dimension += [timestamp] + values_for_dimension += [value] + + # If this was our first tuple then we store the type of timestamp we had + + if previous_timestamp_was_timestamp is None and timestamp_is_timestamp: + previous_timestamp_was_timestamp = True + previous_timestamp_was_int = False + previous_timestamp_was_float = False + + if previous_timestamp_was_int is None and timestamp_is_int: + previous_timestamp_was_timestamp = False + previous_timestamp_was_int = True + previous_timestamp_was_float = False + + if previous_timestamp_was_float is None and timestamp_is_float: + previous_timestamp_was_timestamp = False + previous_timestamp_was_int = False + previous_timestamp_was_float = True + + # See if we should add the data for this dimension + + if not has_another_value: + if len(instance_list) < (this_line_num_dimensions + 1): + instance_list.append([]) + + if timestamp_is_timestamp: + timestamps_for_dimension = pd.DatetimeIndex(timestamps_for_dimension) + + instance_list[this_line_num_dimensions].append( + pd.Series(index=timestamps_for_dimension, data=values_for_dimension)) + this_line_num_dimensions += 1 + + timestamps_for_dimension = [] + values_for_dimension = [] + + elif has_another_value: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " ends with a ',' that is not followed by another tuple") + + elif has_another_dimension and target_labels: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " ends with a ':' while it should list a class value") + + elif has_another_dimension and not target_labels: + if len(instance_list) < (this_line_num_dimensions + 1): + instance_list.append([]) + + instance_list[this_line_num_dimensions].append(pd.Series(dtype=np.float32)) + this_line_num_dimensions += 1 + num_dimensions = this_line_num_dimensions + + # If this is the 1st line of data we have seen then note the dimensions + + if not has_another_value and not has_another_dimension: + if num_dimensions is None: + num_dimensions = this_line_num_dimensions + + if num_dimensions != this_line_num_dimensions: + raise TsFileParseException("line " + str( + line_num + 1) + " does not have the same number of dimensions as the previous line of data") + + # Check that we are not expecting some more data, and if not, store that processed above + + if has_another_value: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " ends with a ',' that is 
not followed by another tuple") + + elif has_another_dimension and target_labels: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " ends with a ':' while it should list a class value") + + elif has_another_dimension and not target_labels: + if len(instance_list) < (this_line_num_dimensions + 1): + instance_list.append([]) + + instance_list[this_line_num_dimensions].append(pd.Series()) + this_line_num_dimensions += 1 + num_dimensions = this_line_num_dimensions + + # If this is the 1st line of data we have seen then note the dimensions + + if not has_another_value and num_dimensions != this_line_num_dimensions: + raise TsFileParseException("line " + str( + line_num + 1) + " does not have the same number of dimensions as the previous line of data") + + # Check if we should have class values, and if so that they are contained in those listed in the metadata + + if target_labels and len(class_val_list) == 0: + raise TsFileParseException("the cases have no associated class values") + else: + dimensions = line.split(":") + # If first row then note the number of dimensions (that must be the same for all cases) + if is_first_case: + num_dimensions = len(dimensions) + + if target_labels: + num_dimensions -= 1 + + for dim in range(0, num_dimensions): + instance_list.append([]) + is_first_case = False + + # See how many dimensions that the case whose data in represented in this line has + this_line_num_dimensions = len(dimensions) + + if target_labels: + this_line_num_dimensions -= 1 + + # All dimensions should be included for all series, even if they are empty + if this_line_num_dimensions != num_dimensions: + raise TsFileParseException("inconsistent number of dimensions. Expecting " + str( + num_dimensions) + " but have read " + str(this_line_num_dimensions)) + + # Process the data for each dimension + for dim in range(0, num_dimensions): + dimension = dimensions[dim].strip() + + if dimension: + data_series = dimension.split(",") + data_series = [float(i) for i in data_series] + instance_list[dim].append(pd.Series(data_series)) + else: + instance_list[dim].append(pd.Series()) + + if target_labels: + class_val_list.append(float(dimensions[num_dimensions].strip())) + + line_num += 1 + + # Check that the file was not empty + if line_num: + # Check that the file contained both metadata and data + complete_regression_meta_data = has_problem_name_tag and has_timestamps_tag and has_univariate_tag and has_target_labels_tag and has_data_tag + complete_classification_meta_data = has_problem_name_tag and has_timestamps_tag and has_univariate_tag and has_class_labels_tag and has_data_tag + + if metadata_started and not complete_regression_meta_data and not complete_classification_meta_data: + raise TsFileParseException("metadata incomplete") + elif metadata_started and not data_started: + raise TsFileParseException("file contained metadata but no data") + elif metadata_started and data_started and len(instance_list) == 0: + raise TsFileParseException("file contained metadata but no data") + + # Create a DataFrame from the data parsed above + data = pd.DataFrame(dtype=np.float32) + + for dim in range(0, num_dimensions): + data['dim_' + str(dim)] = instance_list[dim] + + # Check if we should return any associated class labels separately + + if target_labels: + if return_separate_X_and_y: + return data, np.asarray(class_val_list) + else: + data['class_vals'] = pd.Series(class_val_list) + return data + else: + return data + else: + raise 
TsFileParseException("empty file") + + +def process_data(X, min_len, normalise=None): + """ + This is a function to process the data, i.e. convert dataframe to numpy array + :param X: + :param min_len: + :param normalise: + :return: + """ + tmp = [] + for i in tqdm(range(len(X))): + _x = X.iloc[i, :].copy(deep=True) + + # 1. find the maximum length of each dimension + all_len = [len(y) for y in _x] + max_len = max(all_len) + + # 2. adjust the length of each dimension + _y = [] + for y in _x: + # 2.1 fill missing values + if y.isnull().any(): + y = y.interpolate(method='linear', limit_direction='both') + + # 2.2. if length of each dimension is different, uniformly scale the shorter ones to the max length + if len(y) < max_len: + y = uniform_scaling(y, max_len) + _y.append(y) + _y = np.array(np.transpose(_y)) + + # 3. adjust the length of the series, chop of the longer series + _y = _y[:min_len, :] + + # 4. normalise the series + if normalise == "standard": + scaler = StandardScaler().fit(_y) + _y = scaler.transform(_y) + if normalise == "minmax": + scaler = MinMaxScaler().fit(_y) + _y = scaler.transform(_y) + + tmp.append(_y) + X = np.array(tmp) + return X \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..f9a5777 --- /dev/null +++ b/main.py @@ -0,0 +1,307 @@ +""" +Written by George Zerveas + +If you use any part of the code in this repository, please consider citing the following paper: +George Zerveas et al. A Transformer-based Framework for Multivariate Time Series Representation Learning, in +Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '21), August 14--18, 2021 +""" + +import logging + +logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) + +logger.info("Loading packages ...") +import os +import sys +import time +import pickle +import json + +# 3rd party packages +from tqdm import tqdm +import torch +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +# Project modules +from options import Options +from running import setup, pipeline_factory, validate, check_progress, NEG_METRICS +from utils import utils +from datasets.data import data_factory, Normalizer +from datasets.datasplit import split_dataset +from models.ts_transformer import model_factory +from models.loss import get_loss_module +from optimizers import get_optimizer + + +def main(config): + + total_epoch_time = 0 + total_eval_time = 0 + + total_start_time = time.time() + + # Add file logging besides stdout + file_handler = logging.FileHandler(os.path.join(config['output_dir'], 'output.log')) + logger.addHandler(file_handler) + + logger.info('Running:\n{}\n'.format(' '.join(sys.argv))) # command used to run + + if config['seed'] is not None: + torch.manual_seed(config['seed']) + + device = torch.device('cuda' if (torch.cuda.is_available() and config['gpu'] != '-1') else 'cpu') + logger.info("Using device: {}".format(device)) + if device == 'cuda': + logger.info("Device index: {}".format(torch.cuda.current_device())) + + # Build data + logger.info("Loading and preprocessing data ...") + data_class = data_factory[config['data_class']] + my_data = data_class(config['data_dir'], pattern=config['pattern'], n_proc=config['n_proc'], limit_size=config['limit_size'], config=config) + feat_dim = my_data.feature_df.shape[1] # dimensionality of data features + if config['task'] == 'classification': + validation_method = 
'StratifiedShuffleSplit' + labels = my_data.labels_df.values.flatten() + else: + validation_method = 'ShuffleSplit' + labels = None + + # Split dataset + test_data = my_data + test_indices = None # will be converted to empty list in `split_dataset`, if also test_set_ratio == 0 + val_data = my_data + val_indices = [] + if config['test_pattern']: # used if test data come from different files / file patterns + test_data = data_class(config['data_dir'], pattern=config['test_pattern'], n_proc=-1, config=config) + test_indices = test_data.all_IDs + if config['test_from']: # load test IDs directly from file, if available, otherwise use `test_set_ratio`. Can work together with `test_pattern` + test_indices = list(set([line.rstrip() for line in open(config['test_from']).readlines()])) + try: + test_indices = [int(ind) for ind in test_indices] # integer indices + except ValueError: + pass # in case indices are non-integers + logger.info("Loaded {} test IDs from file: '{}'".format(len(test_indices), config['test_from'])) + if config['val_pattern']: # used if val data come from different files / file patterns + val_data = data_class(config['data_dir'], pattern=config['val_pattern'], n_proc=-1, config=config) + val_indices = val_data.all_IDs + + # Note: currently a validation set must exist, either with `val_pattern` or `val_ratio` + # Using a `val_pattern` means that `val_ratio` == 0 and `test_ratio` == 0 + if config['val_ratio'] > 0: + train_indices, val_indices, test_indices = split_dataset(data_indices=my_data.all_IDs, + validation_method=validation_method, + n_splits=1, + validation_ratio=config['val_ratio'], + test_set_ratio=config['test_ratio'], # used only if test_indices not explicitly specified + test_indices=test_indices, + random_seed=1337, + labels=labels) + train_indices = train_indices[0] # `split_dataset` returns a list of indices *per fold/split* + val_indices = val_indices[0] # `split_dataset` returns a list of indices *per fold/split* + else: + train_indices = my_data.all_IDs + if test_indices is None: + test_indices = [] + + logger.info("{} samples may be used for training".format(len(train_indices))) + logger.info("{} samples will be used for validation".format(len(val_indices))) + logger.info("{} samples will be used for testing".format(len(test_indices))) + + with open(os.path.join(config['output_dir'], 'data_indices.json'), 'w') as f: + try: + json.dump({'train_indices': list(map(int, train_indices)), + 'val_indices': list(map(int, val_indices)), + 'test_indices': list(map(int, test_indices))}, f, indent=4) + except ValueError: # in case indices are non-integers + json.dump({'train_indices': list(train_indices), + 'val_indices': list(val_indices), + 'test_indices': list(test_indices)}, f, indent=4) + + # Pre-process features + normalizer = None + if config['norm_from']: + with open(config['norm_from'], 'rb') as f: + norm_dict = pickle.load(f) + normalizer = Normalizer(**norm_dict) + elif config['normalization'] is not None: + normalizer = Normalizer(config['normalization']) + my_data.feature_df.loc[train_indices] = normalizer.normalize(my_data.feature_df.loc[train_indices]) + if not config['normalization'].startswith('per_sample'): + # get normalizing values from training set and store for future use + norm_dict = normalizer.__dict__ + with open(os.path.join(config['output_dir'], 'normalization.pickle'), 'wb') as f: + pickle.dump(norm_dict, f, pickle.HIGHEST_PROTOCOL) + if normalizer is not None: + if len(val_indices): + val_data.feature_df.loc[val_indices] = 
normalizer.normalize(val_data.feature_df.loc[val_indices]) + if len(test_indices): + test_data.feature_df.loc[test_indices] = normalizer.normalize(test_data.feature_df.loc[test_indices]) + + # Create model + logger.info("Creating model ...") + model = model_factory(config, my_data) + + if config['freeze']: + for name, param in model.named_parameters(): + if name.startswith('output_layer'): + param.requires_grad = True + else: + param.requires_grad = False + + logger.info("Model:\n{}".format(model)) + logger.info("Total number of parameters: {}".format(utils.count_parameters(model))) + logger.info("Trainable parameters: {}".format(utils.count_parameters(model, trainable=True))) + + + # Initialize optimizer + + if config['global_reg']: + weight_decay = config['l2_reg'] + output_reg = None + else: + weight_decay = 0 + output_reg = config['l2_reg'] + + optim_class = get_optimizer(config['optimizer']) + optimizer = optim_class(model.parameters(), lr=config['lr'], weight_decay=weight_decay) + + start_epoch = 0 + lr_step = 0 # current step index of `lr_step` + lr = config['lr'] # current learning step + # Load model and optimizer state + if args.load_model: + model, optimizer, start_epoch = utils.load_model(model, config['load_model'], optimizer, config['resume'], + config['change_output'], + config['lr'], + config['lr_step'], + config['lr_factor']) + model.to(device) + + loss_module = get_loss_module(config) + + if config['test_only'] == 'testset': # Only evaluate and skip training + dataset_class, collate_fn, runner_class = pipeline_factory(config) + test_dataset = dataset_class(test_data, test_indices) + + test_loader = DataLoader(dataset=test_dataset, + batch_size=config['batch_size'], + shuffle=False, + num_workers=config['num_workers'], + pin_memory=True, + collate_fn=lambda x: collate_fn(x, max_len=model.max_len)) + test_evaluator = runner_class(model, test_loader, device, loss_module, + print_interval=config['print_interval'], console=config['console']) + aggr_metrics_test, per_batch_test = test_evaluator.evaluate(keep_all=True) + print_str = 'Test Summary: ' + for k, v in aggr_metrics_test.items(): + print_str += '{}: {:8f} | '.format(k, v) + logger.info(print_str) + return + + # Initialize data generators + dataset_class, collate_fn, runner_class = pipeline_factory(config) + val_dataset = dataset_class(val_data, val_indices) + + val_loader = DataLoader(dataset=val_dataset, + batch_size=config['batch_size'], + shuffle=False, + num_workers=config['num_workers'], + pin_memory=True, + collate_fn=lambda x: collate_fn(x, max_len=model.max_len)) + + train_dataset = dataset_class(my_data, train_indices) + + train_loader = DataLoader(dataset=train_dataset, + batch_size=config['batch_size'], + shuffle=True, + num_workers=config['num_workers'], + pin_memory=True, + collate_fn=lambda x: collate_fn(x, max_len=model.max_len)) + + trainer = runner_class(model, train_loader, device, loss_module, optimizer, l2_reg=output_reg, + print_interval=config['print_interval'], console=config['console']) + val_evaluator = runner_class(model, val_loader, device, loss_module, + print_interval=config['print_interval'], console=config['console']) + + tensorboard_writer = SummaryWriter(config['tensorboard_dir']) + + best_value = 1e16 if config['key_metric'] in NEG_METRICS else -1e16 # initialize with +inf or -inf depending on key metric + metrics = [] # (for validation) list of lists: for each epoch, stores metrics like loss, ... 
+ best_metrics = {} + + # Evaluate on validation before training + aggr_metrics_val, best_metrics, best_value = validate(val_evaluator, tensorboard_writer, config, best_metrics, + best_value, epoch=0) + metrics_names, metrics_values = zip(*aggr_metrics_val.items()) + metrics.append(list(metrics_values)) + + logger.info('Starting training...') + for epoch in tqdm(range(start_epoch + 1, config["epochs"] + 1), desc='Training Epoch', leave=False): + mark = epoch if config['save_all'] else 'last' + epoch_start_time = time.time() + aggr_metrics_train = trainer.train_epoch(epoch) # dictionary of aggregate epoch metrics + epoch_runtime = time.time() - epoch_start_time + print() + print_str = 'Epoch {} Training Summary: '.format(epoch) + for k, v in aggr_metrics_train.items(): + tensorboard_writer.add_scalar('{}/train'.format(k), v, epoch) + print_str += '{}: {:8f} | '.format(k, v) + logger.info(print_str) + logger.info("Epoch runtime: {} hours, {} minutes, {} seconds\n".format(*utils.readable_time(epoch_runtime))) + total_epoch_time += epoch_runtime + avg_epoch_time = total_epoch_time / (epoch - start_epoch) + avg_batch_time = avg_epoch_time / len(train_loader) + avg_sample_time = avg_epoch_time / len(train_dataset) + logger.info("Avg epoch train. time: {} hours, {} minutes, {} seconds".format(*utils.readable_time(avg_epoch_time))) + logger.info("Avg batch train. time: {} seconds".format(avg_batch_time)) + logger.info("Avg sample train. time: {} seconds".format(avg_sample_time)) + + # evaluate if first or last epoch or at specified interval + if (epoch == config["epochs"]) or (epoch == start_epoch + 1) or (epoch % config['val_interval'] == 0): + aggr_metrics_val, best_metrics, best_value = validate(val_evaluator, tensorboard_writer, config, + best_metrics, best_value, epoch) + metrics_names, metrics_values = zip(*aggr_metrics_val.items()) + metrics.append(list(metrics_values)) + + utils.save_model(os.path.join(config['save_dir'], 'model_{}.pth'.format(mark)), epoch, model, optimizer) + + # Learning rate scheduling + if epoch == config['lr_step'][lr_step]: + utils.save_model(os.path.join(config['save_dir'], 'model_{}.pth'.format(epoch)), epoch, model, optimizer) + lr = lr * config['lr_factor'][lr_step] + if lr_step < len(config['lr_step']) - 1: # so that this index does not get out of bounds + lr_step += 1 + logger.info('Learning rate updated to: {}'.format(lr)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + # Difficulty scheduling + if config['harden'] and check_progress(epoch): + train_loader.dataset.update() + val_loader.dataset.update() + + # Export evolution of metrics over epochs + header = metrics_names + metrics_filepath = os.path.join(config["output_dir"], "metrics_" + config["experiment_name"] + ".xls") + book = utils.export_performance_metrics(metrics_filepath, metrics, header, sheet_name="metrics") + + # Export record metrics to a file accumulating records from all experiments + utils.register_record(config["records_file"], config["initial_timestamp"], config["experiment_name"], + best_metrics, aggr_metrics_val, comment=config['comment']) + + logger.info('Best {} was {}. 
Other metrics: {}'.format(config['key_metric'], best_value, best_metrics)) + logger.info('All Done!') + + total_runtime = time.time() - total_start_time + logger.info("Total runtime: {} hours, {} minutes, {} seconds\n".format(*utils.readable_time(total_runtime))) + + return best_value + + +if __name__ == '__main__': + + args = Options().parse() # `argsparse` object + config = setup(args) # configuration dictionary + main(config) diff --git a/models/loss.py b/models/loss.py new file mode 100644 index 0000000..f57c68e --- /dev/null +++ b/models/loss.py @@ -0,0 +1,74 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + +def get_loss_module(config): + + task = config['task'] + + if (task == "imputation") or (task == "transduction"): + return MaskedMSELoss(reduction='none') # outputs loss for each batch element + + if task == "classification": + return NoFussCrossEntropyLoss(reduction='none') # outputs loss for each batch sample + + if task == "regression": + return nn.MSELoss(reduction='none') # outputs loss for each batch sample + + else: + raise ValueError("Loss module for task '{}' does not exist".format(task)) + + +def l2_reg_loss(model): + """Returns the squared L2 norm of output layer of given model""" + + for name, param in model.named_parameters(): + if name == 'output_layer.weight': + return torch.sum(torch.square(param)) + + +class NoFussCrossEntropyLoss(nn.CrossEntropyLoss): + """ + pytorch's CrossEntropyLoss is fussy: 1) needs Long (int64) targets only, and 2) only 1D. + This function satisfies these requirements + """ + + def forward(self, inp, target): + return F.cross_entropy(inp, target.long().squeeze(), weight=self.weight, + ignore_index=self.ignore_index, reduction=self.reduction) + + +class MaskedMSELoss(nn.Module): + """ Masked MSE Loss + """ + + def __init__(self, reduction: str = 'mean'): + + super().__init__() + + self.reduction = reduction + self.mse_loss = nn.MSELoss(reduction=self.reduction) + + def forward(self, + y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: + """Compute the loss between a target value and a prediction. + + Args: + y_pred: Estimated values + y_true: Target values + mask: boolean tensor with 0s at places where values should be ignored and 1s where they should be considered + + Returns + ------- + if reduction == 'none': + (num_active,) Loss for each active batch element as a tensor with gradient attached. + if reduction == 'mean': + scalar mean loss over batch as a tensor with gradient attached. 
+ """ + + # for this particular loss, one may also elementwise multiply y_pred and y_true with the inverted mask + masked_pred = torch.masked_select(y_pred, mask) + masked_true = torch.masked_select(y_true, mask) + + return self.mse_loss(masked_pred, masked_true) diff --git a/models/ts_transformer.py b/models/ts_transformer.py new file mode 100644 index 0000000..025bf3d --- /dev/null +++ b/models/ts_transformer.py @@ -0,0 +1,314 @@ +from typing import Optional, Any +import math + +import torch +from torch import nn, Tensor +from torch.nn import functional as F +from torch.nn.modules import MultiheadAttention, Linear, Dropout, BatchNorm1d, TransformerEncoderLayer + + +def model_factory(config, data): + task = config['task'] + feat_dim = data.feature_df.shape[1] # dimensionality of data features + # data windowing is used when samples don't have a predefined length or the length is too long + max_seq_len = config['data_window_len'] if config['data_window_len'] is not None else config['max_seq_len'] + if max_seq_len is None: + try: + max_seq_len = data.max_seq_len + except AttributeError as x: + print("Data class does not define a maximum sequence length, so it must be defined with the script argument `max_seq_len`") + raise x + + if (task == "imputation") or (task == "transduction"): + if config['model'] == 'LINEAR': + return DummyTSTransformerEncoder(feat_dim, max_seq_len, config['d_model'], config['num_heads'], + config['num_layers'], config['dim_feedforward'], dropout=config['dropout'], + pos_encoding=config['pos_encoding'], activation=config['activation'], + norm=config['normalization_layer'], freeze=config['freeze']) + elif config['model'] == 'transformer': + return TSTransformerEncoder(feat_dim, max_seq_len, config['d_model'], config['num_heads'], + config['num_layers'], config['dim_feedforward'], dropout=config['dropout'], + pos_encoding=config['pos_encoding'], activation=config['activation'], + norm=config['normalization_layer'], freeze=config['freeze']) + + if (task == "classification") or (task == "regression"): + num_labels = len(data.class_names) if task == "classification" else data.labels_df.shape[1] # dimensionality of labels + if config['model'] == 'LINEAR': + return DummyTSTransformerEncoderClassiregressor(feat_dim, max_seq_len, config['d_model'], + config['num_heads'], + config['num_layers'], config['dim_feedforward'], + num_classes=num_labels, + dropout=config['dropout'], pos_encoding=config['pos_encoding'], + activation=config['activation'], + norm=config['normalization_layer'], freeze=config['freeze']) + elif config['model'] == 'transformer': + return TSTransformerEncoderClassiregressor(feat_dim, max_seq_len, config['d_model'], + config['num_heads'], + config['num_layers'], config['dim_feedforward'], + num_classes=num_labels, + dropout=config['dropout'], pos_encoding=config['pos_encoding'], + activation=config['activation'], + norm=config['normalization_layer'], freeze=config['freeze']) + else: + raise ValueError("Model class for task '{}' does not exist".format(task)) + + +def _get_activation_fn(activation): + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + raise ValueError("activation should be relu/gelu, not {}".format(activation)) + + +# From https://github.com/pytorch/examples/blob/master/word_language_model/model.py +class FixedPositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. 
The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=1024). + """ + + def __init__(self, d_model, dropout=0.1, max_len=1024, scale_factor=1.0): + super(FixedPositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) # positional encoding + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = scale_factor * pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) # this stores the variable in the state_dict (used for non-trainable variables) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + """ + + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + +class LearnablePositionalEncoding(nn.Module): + + def __init__(self, d_model, dropout=0.1, max_len=1024): + super(LearnablePositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + # Each position gets its own embedding + # Since indices are always 0 ... max_len, we don't have to do a look-up + self.pe = nn.Parameter(torch.empty(max_len, 1, d_model)) # requires_grad automatically set to True + nn.init.uniform_(self.pe, -0.02, 0.02) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + """ + + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + +def get_pos_encoder(pos_encoding): + if pos_encoding == "learnable": + return LearnablePositionalEncoding + elif pos_encoding == "fixed": + return FixedPositionalEncoding + + raise NotImplementedError("pos_encoding should be 'learnable'/'fixed', not '{}'".format(pos_encoding)) + + +class TransformerBatchNormEncoderLayer(nn.modules.Module): + r"""This transformer encoder layer block is made up of self-attn and feedforward network. + It differs from TransformerEncoderLayer in torch/nn/modules/transformer.py in that it replaces LayerNorm + with BatchNorm. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of intermediate layer, relu or gelu (default=relu). 
+ """ + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): + super(TransformerBatchNormEncoderLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model) + + self.norm1 = BatchNorm1d(d_model, eps=1e-5) # normalizes each feature across batch samples and time steps + self.norm2 = BatchNorm1d(d_model, eps=1e-5) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerBatchNormEncoderLayer, self).__setstate__(state) + + def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + + Shape: + see the docs in Transformer class. + """ + src2 = self.self_attn(src, src, src, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) # (seq_len, batch_size, d_model) + src = src.permute(1, 2, 0) # (batch_size, d_model, seq_len) + # src = src.reshape([src.shape[0], -1]) # (batch_size, seq_length * d_model) + src = self.norm1(src) + src = src.permute(2, 0, 1) # restore (seq_len, batch_size, d_model) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) # (seq_len, batch_size, d_model) + src = src.permute(1, 2, 0) # (batch_size, d_model, seq_len) + src = self.norm2(src) + src = src.permute(2, 0, 1) # restore (seq_len, batch_size, d_model) + return src + + +class TSTransformerEncoder(nn.Module): + + def __init__(self, feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward, dropout=0.1, + pos_encoding='fixed', activation='gelu', norm='BatchNorm', freeze=False): + super(TSTransformerEncoder, self).__init__() + + self.max_len = max_len + self.d_model = d_model + self.n_heads = n_heads + + self.project_inp = nn.Linear(feat_dim, d_model) + self.pos_enc = get_pos_encoder(pos_encoding)(d_model, dropout=dropout*(1.0 - freeze), max_len=max_len) + + if norm == 'LayerNorm': + encoder_layer = TransformerEncoderLayer(d_model, self.n_heads, dim_feedforward, dropout*(1.0 - freeze), activation=activation) + else: + encoder_layer = TransformerBatchNormEncoderLayer(d_model, self.n_heads, dim_feedforward, dropout*(1.0 - freeze), activation=activation) + + self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers) + + self.output_layer = nn.Linear(d_model, feat_dim) + + self.act = _get_activation_fn(activation) + + self.dropout1 = nn.Dropout(dropout) + + self.feat_dim = feat_dim + + def forward(self, X, padding_masks): + """ + Args: + X: (batch_size, seq_length, feat_dim) torch tensor of masked features (input) + padding_masks: (batch_size, seq_length) boolean tensor, 1 means keep vector at this position, 0 means padding + Returns: + output: (batch_size, seq_length, feat_dim) + """ + + # permute because pytorch convention for transformers is [seq_length, batch_size, feat_dim]. 
padding_masks [batch_size, seq_length] + inp = X.permute(1, 0, 2) + inp = self.project_inp(inp) * math.sqrt( + self.d_model) # [seq_length, batch_size, d_model] project input vectors to d_model dimensional space + inp = self.pos_enc(inp) # add positional encoding + # NOTE: logic for padding masks is reversed to comply with definition in MultiHeadAttention, TransformerEncoderLayer + output = self.transformer_encoder(inp, src_key_padding_mask=~padding_masks) # (seq_length, batch_size, d_model) + output = self.act(output) # the output transformer encoder/decoder embeddings don't include non-linearity + output = output.permute(1, 0, 2) # (batch_size, seq_length, d_model) + output = self.dropout1(output) + # Most probably defining a Linear(d_model,feat_dim) vectorizes the operation over (seq_length, batch_size). + output = self.output_layer(output) # (batch_size, seq_length, feat_dim) + + return output + + +class TSTransformerEncoderClassiregressor(nn.Module): + """ + Simplest classifier/regressor. Can be either regressor or classifier because the output does not include + softmax. Concatenates final layer embeddings and uses 0s to ignore padding embeddings in final output layer. + """ + + def __init__(self, feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward, num_classes, + dropout=0.1, pos_encoding='fixed', activation='gelu', norm='BatchNorm', freeze=False): + super(TSTransformerEncoderClassiregressor, self).__init__() + + self.max_len = max_len + self.d_model = d_model + self.n_heads = n_heads + + self.project_inp = nn.Linear(feat_dim, d_model) + self.pos_enc = get_pos_encoder(pos_encoding)(d_model, dropout=dropout*(1.0 - freeze), max_len=max_len) + + if norm == 'LayerNorm': + encoder_layer = TransformerEncoderLayer(d_model, self.n_heads, dim_feedforward, dropout*(1.0 - freeze), activation=activation) + else: + encoder_layer = TransformerBatchNormEncoderLayer(d_model, self.n_heads, dim_feedforward, dropout*(1.0 - freeze), activation=activation) + + self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers) + + self.act = _get_activation_fn(activation) + + self.dropout1 = nn.Dropout(dropout) + + self.feat_dim = feat_dim + self.num_classes = num_classes + self.output_layer = self.build_output_module(d_model, max_len, num_classes) + + def build_output_module(self, d_model, max_len, num_classes): + output_layer = nn.Linear(d_model * max_len, num_classes) + # no softmax (or log softmax), because CrossEntropyLoss does this internally. If probabilities are needed, + # add F.log_softmax and use NLLLoss + return output_layer + + def forward(self, X, padding_masks): + """ + Args: + X: (batch_size, seq_length, feat_dim) torch tensor of masked features (input) + padding_masks: (batch_size, seq_length) boolean tensor, 1 means keep vector at this position, 0 means padding + Returns: + output: (batch_size, num_classes) + """ + + # permute because pytorch convention for transformers is [seq_length, batch_size, feat_dim]. 
padding_masks [batch_size, feat_dim] + inp = X.permute(1, 0, 2) + inp = self.project_inp(inp) * math.sqrt( + self.d_model) # [seq_length, batch_size, d_model] project input vectors to d_model dimensional space + inp = self.pos_enc(inp) # add positional encoding + # NOTE: logic for padding masks is reversed to comply with definition in MultiHeadAttention, TransformerEncoderLayer + output = self.transformer_encoder(inp, src_key_padding_mask=~padding_masks) # (seq_length, batch_size, d_model) + output = self.act(output) # the output transformer encoder/decoder embeddings don't include non-linearity + output = output.permute(1, 0, 2) # (batch_size, seq_length, d_model) + output = self.dropout1(output) + + # Output + output = output * padding_masks.unsqueeze(-1) # zero-out padding embeddings + output = output.reshape(output.shape[0], -1) # (batch_size, seq_length * d_model) + output = self.output_layer(output) # (batch_size, num_classes) + + return output + diff --git a/optimizers.py b/optimizers.py new file mode 100644 index 0000000..ed667b6 --- /dev/null +++ b/optimizers.py @@ -0,0 +1,259 @@ +import math +import torch +from torch.optim.optimizer import Optimizer + + +def get_optimizer(name): + + if name == "Adam": + return torch.optim.Adam + elif name == "RAdam": + return RAdam + + +# from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam/radam.py +class RAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + for param in params: + if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): + param['buffer'] = [[None, None, None] for _ in range(10)] + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + buffer=[[None, None, None] for _ in range(10)]) + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + buffered = group['buffer'][int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - 
beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt( + (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( + N_sma_max - 2)) / (1 - beta1 ** state['step']) + elif self.degenerated_to_sgd: + step_size = 1.0 / (1 - beta1 ** state['step']) + else: + step_size = -1 + buffered[2] = step_size + + # more conservative since it's an approximated value + if N_sma >= 5: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) + p.data.copy_(p_data_fp32) + elif step_size > 0: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + p_data_fp32.add_(-step_size * group['lr'], exp_avg) + p.data.copy_(p_data_fp32) + + return loss + + +class PlainRAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + + super(PlainRAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(PlainRAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + + # more conservative since it's an approximated value + if N_sma >= 5: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + step_size = group['lr'] * math.sqrt( + (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( + N_sma_max - 2)) / (1 - beta1 ** state['step']) + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + p.data.copy_(p_data_fp32) + elif self.degenerated_to_sgd: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + step_size = group['lr'] / (1 - beta1 ** state['step']) + p_data_fp32.add_(-step_size, exp_avg) + p.data.copy_(p_data_fp32) + + return loss + + 
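A minimal usage sketch for the optimizer factory above (not part of the patch; the toy model, input tensor and hyperparameter values are illustrative assumptions): get_optimizer returns an optimizer class, which is then instantiated on the model's parameters.

import torch
from torch import nn
from optimizers import get_optimizer  # assumes this module is saved as optimizers.py, as in the diff above

# Toy model, used here only to provide some parameters to optimize.
toy_model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))

optim_class = get_optimizer("Adam")  # returns torch.optim.Adam; "RAdam" would return the RAdam class above
optimizer = optim_class(toy_model.parameters(), lr=1e-3, weight_decay=1e-4)

# One dummy update step.
loss = toy_model(torch.randn(4, 8)).sum()
optimizer.zero_grad()
loss.backward()
optimizer.step()

main.py builds its optimizer the same way: optim_class = get_optimizer(config['optimizer']) followed by optim_class(model.parameters(), lr=config['lr'], weight_decay=weight_decay).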
+class AdamW(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, warmup=warmup) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + denom = exp_avg_sq.sqrt().add_(group['eps']) + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + if group['warmup'] > state['step']: + scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] + else: + scheduled_lr = group['lr'] + + step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1 + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) + + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + + p.data.copy_(p_data_fp32) + + return loss diff --git a/options.py b/options.py new file mode 100644 index 0000000..9061b2b --- /dev/null +++ b/options.py @@ -0,0 +1,185 @@ +import argparse + + +class Options(object): + + def __init__(self): + + # Handle command line arguments + self.parser = argparse.ArgumentParser( + description='Run a complete training pipeline. Optionally, a JSON configuration file can be used, to overwrite command-line arguments.') + + ## Run from config file + self.parser.add_argument('--config', dest='config_filepath', + help='Configuration .json file (optional). Overwrites existing command-line args!') + + ## Run from command-line arguments + # I/O + self.parser.add_argument('--output_dir', default='./output', + help='Root output directory. Must exist. 
Time-stamped directories will be created inside.') + self.parser.add_argument('--data_dir', default='./data', + help='Data directory') + self.parser.add_argument('--load_model', + help='Path to pre-trained model.') + self.parser.add_argument('--resume', action='store_true', + help='If set, will load `starting_epoch` and state of optimizer, besides model weights.') + self.parser.add_argument('--change_output', action='store_true', + help='Whether the loaded model will be fine-tuned on a different task (necessitating a different output layer)') + self.parser.add_argument('--save_all', action='store_true', + help='If set, will save model weights (and optimizer state) for every epoch; otherwise just latest') + self.parser.add_argument('--name', dest='experiment_name', default='', + help='A string identifier/name for the experiment to be run - it will be appended to the output directory name, before the timestamp') + self.parser.add_argument('--comment', type=str, default='', help='A comment/description of the experiment') + self.parser.add_argument('--no_timestamp', action='store_true', + help='If set, a timestamp will not be appended to the output directory name') + self.parser.add_argument('--records_file', default='./records.xls', + help='Excel file keeping all records of experiments') + # System + self.parser.add_argument('--console', action='store_true', + help="Optimize printout for console output; otherwise for file") + self.parser.add_argument('--print_interval', type=int, default=1, + help='Print batch info every this many batches') + self.parser.add_argument('--gpu', type=str, default='0', + help='GPU index, -1 for CPU') + self.parser.add_argument('--n_proc', type=int, default=-1, + help='Number of processes for data loading/preprocessing. By default, equals num. of available cores.') + self.parser.add_argument('--num_workers', type=int, default=0, + help='dataloader threads. 0 for single-thread.') + self.parser.add_argument('--seed', + help='Seed used for splitting sets. None by default, set to an integer for reproducibility') + # Dataset + self.parser.add_argument('--limit_size', type=float, default=None, + help="Limit dataset to specified smaller random sample, e.g. for rapid debugging purposes. " + "If in [0,1], it will be interpreted as a proportion of the dataset, " + "otherwise as an integer absolute number of samples") + self.parser.add_argument('--test_only', choices={'testset', 'fold_transduction'}, + help='If set, no training will take place; instead, trained model will be loaded and evaluated on test set') + self.parser.add_argument('--data_class', type=str, default='weld', + help="Which type of data should be processed.") + self.parser.add_argument('--labels', type=str, + help="In case a dataset contains several labels (multi-task), " + "which type of labels should be used in regression or classification, i.e. name of column(s).") + self.parser.add_argument('--test_from', + help='If given, will read test IDs from specified text file containing sample IDs one in each row') + self.parser.add_argument('--test_ratio', type=float, default=0, + help="Set aside this proportion of the dataset as a test set") + self.parser.add_argument('--val_ratio', type=float, default=0.2, + help="Proportion of the dataset to be used as a validation set") + self.parser.add_argument('--pattern', type=str, + help='Regex pattern used to select files contained in `data_dir`. 
If None, all data will be used.') + self.parser.add_argument('--val_pattern', type=str, + help="""Regex pattern used to select files contained in `data_dir` exclusively for the validation set. + If None, a positive `val_ratio` will be used to reserve part of the common data set.""") + self.parser.add_argument('--test_pattern', type=str, + help="""Regex pattern used to select files contained in `data_dir` exclusively for the test set. + If None, `test_ratio`, if specified, will be used to reserve part of the common data set.""") + self.parser.add_argument('--normalization', + choices={'standardization', 'minmax', 'per_sample_std', 'per_sample_minmax'}, + default='standardization', + help='If specified, will apply normalization on the input features of a dataset.') + self.parser.add_argument('--norm_from', + help="""If given, will read normalization values (e.g. mean, std, min, max) from specified pickle file. + The columns correspond to features, rows correspond to mean, std or min, max.""") + self.parser.add_argument('--subsample_factor', type=int, + help='Sub-sampling factor used for long sequences: keep every kth sample') + # Training process + self.parser.add_argument('--task', choices={"imputation", "transduction", "classification", "regression"}, + default="imputation", + help=("Training objective/task: imputation of masked values,\n" + " transduction of features to other features,\n" + " classification of entire time series,\n" + " regression of scalar(s) for entire time series")) + self.parser.add_argument('--masking_ratio', type=float, default=0.15, + help='Imputation: mask this proportion of each variable') + self.parser.add_argument('--mean_mask_length', type=float, default=3, + help="Imputation: the desired mean length of masked segments. Used only when `mask_distribution` is 'geometric'.") + self.parser.add_argument('--mask_mode', choices={'separate', 'concurrent'}, default='separate', + help=("Imputation: whether each variable should be masked separately " + "or all variables at a certain positions should be masked concurrently")) + self.parser.add_argument('--mask_distribution', choices={'geometric', 'bernoulli'}, default='geometric', + help=("Imputation: whether each mask sequence element is sampled independently at random" + "or whether sampling follows a markov chain (stateful), resulting in " + "geometric distributions of masked squences of a desired mean_mask_length")) + self.parser.add_argument('--exclude_feats', type=str, default=None, + help='Imputation: Comma separated string of indices corresponding to features to be excluded from masking') + self.parser.add_argument('--mask_feats', type=str, default='0, 1', + help='Transduction: Comma separated string of indices corresponding to features to be masked') + self.parser.add_argument('--start_hint', type=float, default=0.0, + help='Transduction: proportion at the beginning of time series which will not be masked') + self.parser.add_argument('--end_hint', type=float, default=0.0, + help='Transduction: proportion at the end of time series which will not be masked') + self.parser.add_argument('--harden', action='store_true', + help='Makes training objective progressively harder, by masking more of the input') + + self.parser.add_argument('--epochs', type=int, default=400, + help='Number of training epochs') + self.parser.add_argument('--val_interval', type=int, default=2, + help='Evaluate on validation set every this many epochs. 
Must be >= 1.') + self.parser.add_argument('--optimizer', choices={"Adam", "RAdam"}, default="Adam", help="Optimizer") + self.parser.add_argument('--lr', type=float, default=1e-3, + help='learning rate (default holds for batch size 64)') + self.parser.add_argument('--lr_step', type=str, default='1000000', + help='Comma separated string of epochs when to reduce learning rate by a factor of 10.' + ' The default is a large value, meaning that the learning rate will not change.') + self.parser.add_argument('--lr_factor', type=str, default='0.1', + help=("Comma separated string of multiplicative factors to be applied to lr " + "at corresponding steps specified in `lr_step`. If a single value is provided, " + "it will be replicated to match the number of steps in `lr_step`.")) + self.parser.add_argument('--batch_size', type=int, default=64, + help='Training batch size') + self.parser.add_argument('--l2_reg', type=float, default=0, + help='L2 weight regularization parameter') + self.parser.add_argument('--global_reg', action='store_true', + help='If set, L2 regularization will be applied to all weights instead of only the output layer') + self.parser.add_argument('--key_metric', choices={'loss', 'accuracy', 'precision'}, default='loss', + help='Metric used for defining best epoch') + self.parser.add_argument('--freeze', action='store_true', + help='If set, freezes all layer parameters except for the output layer. Also removes dropout except before the output layer') + + # Model + self.parser.add_argument('--model', choices={"transformer", "LINEAR"}, default="transformer", + help="Model class") + self.parser.add_argument('--max_seq_len', type=int, + help="""Maximum input sequence length. Determines size of transformer layers. + If not provided, then the value defined inside the data class will be used.""") + self.parser.add_argument('--data_window_len', type=int, + help="""Used instead of the `max_seq_len`, when the data samples must be + segmented into windows. 
Determines maximum input sequence length + (size of transformer layers).""") + self.parser.add_argument('--d_model', type=int, default=64, + help='Internal dimension of transformer embeddings') + self.parser.add_argument('--dim_feedforward', type=int, default=256, + help='Dimension of dense feedforward part of transformer layer') + self.parser.add_argument('--num_heads', type=int, default=8, + help='Number of multi-headed attention heads') + self.parser.add_argument('--num_layers', type=int, default=3, + help='Number of transformer encoder layers (blocks)') + self.parser.add_argument('--dropout', type=float, default=0.1, + help='Dropout applied to most transformer encoder layers') + self.parser.add_argument('--pos_encoding', choices={'fixed', 'learnable'}, default='fixed', + help='Internal dimension of transformer embeddings') + self.parser.add_argument('--activation', choices={'relu', 'gelu'}, default='gelu', + help='Activation to be used in transformer encoder') + self.parser.add_argument('--normalization_layer', choices={'BatchNorm', 'LayerNorm'}, default='BatchNorm', + help='Normalization layer to be used internally in transformer encoder') + + def parse(self): + + args = self.parser.parse_args() + + args.lr_step = [int(i) for i in args.lr_step.split(',')] + args.lr_factor = [float(i) for i in args.lr_factor.split(',')] + if (len(args.lr_step) > 1) and (len(args.lr_factor) == 1): + args.lr_factor = len(args.lr_step) * args.lr_factor # replicate + assert len(args.lr_step) == len( + args.lr_factor), "You must specify as many values in `lr_step` as in `lr_factors`" + + if args.exclude_feats is not None: + args.exclude_feats = [int(i) for i in args.exclude_feats.split(',')] + args.mask_feats = [int(i) for i in args.mask_feats.split(',')] + + if args.val_pattern is not None: + args.val_ratio = 0 + args.test_ratio = 0 + + return args diff --git a/running.py b/running.py new file mode 100644 index 0000000..6231360 --- /dev/null +++ b/running.py @@ -0,0 +1,500 @@ +import logging +import sys +import os +import traceback +import json +from datetime import datetime +import string +import random +from collections import OrderedDict +import time +import pickle +from functools import partial + +import ipdb +import torch +from torch.utils.data import DataLoader +import numpy as np +import sklearn + +from utils import utils, analysis +from models.loss import l2_reg_loss +from datasets.dataset import ImputationDataset, TransductionDataset, ClassiregressionDataset, collate_unsuperv, collate_superv + + +logger = logging.getLogger('__main__') + +NEG_METRICS = {'loss'} # metrics for which "better" is less + +val_times = {"total_time": 0, "count": 0} + + +def pipeline_factory(config): + """For the task specified in the configuration returns the corresponding combination of + Dataset class, collate function and Runner class.""" + + task = config['task'] + + if task == "imputation": + return partial(ImputationDataset, mean_mask_length=config['mean_mask_length'], + masking_ratio=config['masking_ratio'], mode=config['mask_mode'], + distribution=config['mask_distribution'], exclude_feats=config['exclude_feats']),\ + collate_unsuperv, UnsupervisedRunner + if task == "transduction": + return partial(TransductionDataset, mask_feats=config['mask_feats'], + start_hint=config['start_hint'], end_hint=config['end_hint']), collate_unsuperv, UnsupervisedRunner + if (task == "classification") or (task == "regression"): + return ClassiregressionDataset, collate_superv, SupervisedRunner + else: + raise 
NotImplementedError("Task '{}' not implemented".format(task)) + + +def setup(args): + """Prepare training session: read configuration from file (takes precedence), create directories. + Input: + args: arguments object from argparse + Returns: + config: configuration dictionary + """ + + config = args.__dict__ # configuration dictionary + + if args.config_filepath is not None: + logger.info("Reading configuration ...") + try: # dictionary containing the entire configuration settings in a hierarchical fashion + config.update(utils.load_config(args.config_filepath)) + except: + logger.critical("Failed to load configuration file. Check JSON syntax and verify that files exist") + traceback.print_exc() + sys.exit(1) + + # Create output directory + initial_timestamp = datetime.now() + output_dir = config['output_dir'] + if not os.path.isdir(output_dir): + raise IOError( + "Root directory '{}', where the directory of the experiment will be created, must exist".format(output_dir)) + + output_dir = os.path.join(output_dir, config['experiment_name']) + + formatted_timestamp = initial_timestamp.strftime("%Y-%m-%d_%H-%M-%S") + config['initial_timestamp'] = formatted_timestamp + if (not config['no_timestamp']) or (len(config['experiment_name']) == 0): + rand_suffix = "".join(random.choices(string.ascii_letters + string.digits, k=3)) + output_dir += "_" + formatted_timestamp + "_" + rand_suffix + config['output_dir'] = output_dir + config['save_dir'] = os.path.join(output_dir, 'checkpoints') + config['pred_dir'] = os.path.join(output_dir, 'predictions') + config['tensorboard_dir'] = os.path.join(output_dir, 'tb_summaries') + utils.create_dirs([config['save_dir'], config['pred_dir'], config['tensorboard_dir']]) + + # Save configuration as a (pretty) json file + with open(os.path.join(output_dir, 'configuration.json'), 'w') as fp: + json.dump(config, fp, indent=4, sort_keys=True) + + logger.info("Stored configuration file in '{}'".format(output_dir)) + + return config + + +def fold_evaluate(dataset, model, device, loss_module, target_feats, config, dataset_name): + + allfolds = {'target_feats': target_feats, # list of len(num_folds), each element: list of target feature integer indices + 'predictions': [], # list of len(num_folds), each element: (num_samples, seq_len, feat_dim) prediction per sample + 'targets': [], # list of len(num_folds), each element: (num_samples, seq_len, feat_dim) target/original input per sample + 'target_masks': [], # list of len(num_folds), each element: (num_samples, seq_len, feat_dim) boolean mask per sample + 'metrics': [], # list of len(num_folds), each element: (num_samples, num_metrics) metric per sample + 'IDs': []} # list of len(num_folds), each element: (num_samples,) ID per sample + + for i, tgt_feats in enumerate(target_feats): + + dataset.mask_feats = tgt_feats # set the transduction target features + + loader = DataLoader(dataset=dataset, + batch_size=config['batch_size'], + shuffle=False, + num_workers=config['num_workers'], + pin_memory=True, + collate_fn=lambda x: collate_unsuperv(x, max_len=config['max_seq_len'])) + + evaluator = UnsupervisedRunner(model, loader, device, loss_module, + print_interval=config['print_interval'], console=config['console']) + + logger.info("Evaluating {} set, fold: {}, target features: {}".format(dataset_name, i, tgt_feats)) + aggr_metrics, per_batch = evaluate(evaluator) + + metrics_array = convert_metrics_per_batch_to_per_sample(per_batch['metrics'], per_batch['target_masks']) + metrics_array = np.concatenate(metrics_array, axis=0) 
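+        # After concatenation, metrics_array has shape (num_samples, num_metrics): each row is the
+        # metric for one sample, obtained by summing the per-element losses over its active (masked) positions.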
+ allfolds['metrics'].append(metrics_array) + allfolds['predictions'].append(np.concatenate(per_batch['predictions'], axis=0)) + allfolds['targets'].append(np.concatenate(per_batch['targets'], axis=0)) + allfolds['target_masks'].append(np.concatenate(per_batch['target_masks'], axis=0)) + allfolds['IDs'].append(np.concatenate(per_batch['IDs'], axis=0)) + + metrics_mean = np.mean(metrics_array, axis=0) + metrics_std = np.std(metrics_array, axis=0) + for m, metric_name in enumerate(list(aggr_metrics.items())[1:]): + logger.info("{}:: Mean: {:.3f}, std: {:.3f}".format(metric_name, metrics_mean[m], metrics_std[m])) + + pred_filepath = os.path.join(config['pred_dir'], dataset_name + '_fold_transduction_predictions.pickle') + logger.info("Serializing predictions into {} ... ".format(pred_filepath)) + with open(pred_filepath, 'wb') as f: + pickle.dump(allfolds, f, pickle.HIGHEST_PROTOCOL) + + +def convert_metrics_per_batch_to_per_sample(metrics, target_masks): + """ + Args: + metrics: list of len(num_batches), each element: list of len(num_metrics), each element: (num_active_in_batch,) metric per element + target_masks: list of len(num_batches), each element: (batch_size, seq_len, feat_dim) boolean mask: 1s active, 0s ignore + Returns: + metrics_array = list of len(num_batches), each element: (batch_size, num_metrics) metric per sample + """ + metrics_array = [] + for b, batch_target_masks in enumerate(target_masks): + num_active_per_sample = np.sum(batch_target_masks, axis=(1, 2)) + batch_metrics = np.stack(metrics[b], axis=1) # (num_active_in_batch, num_metrics) + ind = 0 + metrics_per_sample = np.zeros((len(num_active_per_sample), batch_metrics.shape[1])) # (batch_size, num_metrics) + for n, num_active in enumerate(num_active_per_sample): + new_ind = ind + num_active + metrics_per_sample[n, :] = np.sum(batch_metrics[ind:new_ind, :], axis=0) + ind = new_ind + metrics_array.append(metrics_per_sample) + return metrics_array + + +def evaluate(evaluator): + """Perform a single, one-off evaluation on an evaluator object (initialized with a dataset)""" + + eval_start_time = time.time() + with torch.no_grad(): + aggr_metrics, per_batch = evaluator.evaluate(epoch_num=None, keep_all=True) + eval_runtime = time.time() - eval_start_time + print() + print_str = 'Evaluation Summary: ' + for k, v in aggr_metrics.items(): + if v is not None: + print_str += '{}: {:8f} | '.format(k, v) + logger.info(print_str) + logger.info("Evaluation runtime: {} hours, {} minutes, {} seconds\n".format(*utils.readable_time(eval_runtime))) + + return aggr_metrics, per_batch + + +def validate(val_evaluator, tensorboard_writer, config, best_metrics, best_value, epoch): + """Run an evaluation on the validation set while logging metrics, and handle outcome""" + + logger.info("Evaluating on validation set ...") + eval_start_time = time.time() + with torch.no_grad(): + aggr_metrics, per_batch = val_evaluator.evaluate(epoch, keep_all=True) + eval_runtime = time.time() - eval_start_time + logger.info("Validation runtime: {} hours, {} minutes, {} seconds\n".format(*utils.readable_time(eval_runtime))) + + global val_times + val_times["total_time"] += eval_runtime + val_times["count"] += 1 + avg_val_time = val_times["total_time"] / val_times["count"] + avg_val_batch_time = avg_val_time / len(val_evaluator.dataloader) + avg_val_sample_time = avg_val_time / len(val_evaluator.dataloader.dataset) + logger.info("Avg val. time: {} hours, {} minutes, {} seconds".format(*utils.readable_time(avg_val_time))) + logger.info("Avg batch val. 
time: {} seconds".format(avg_val_batch_time)) + logger.info("Avg sample val. time: {} seconds".format(avg_val_sample_time)) + + print() + print_str = 'Epoch {} Validation Summary: '.format(epoch) + for k, v in aggr_metrics.items(): + tensorboard_writer.add_scalar('{}/val'.format(k), v, epoch) + print_str += '{}: {:8f} | '.format(k, v) + logger.info(print_str) + + if config['key_metric'] in NEG_METRICS: + condition = (aggr_metrics[config['key_metric']] < best_value) + else: + condition = (aggr_metrics[config['key_metric']] > best_value) + if condition: + best_value = aggr_metrics[config['key_metric']] + utils.save_model(os.path.join(config['save_dir'], 'model_best.pth'), epoch, val_evaluator.model) + best_metrics = aggr_metrics.copy() + + pred_filepath = os.path.join(config['pred_dir'], 'best_predictions') + np.savez(pred_filepath, **per_batch) + + return aggr_metrics, best_metrics, best_value + + + +def check_progress(epoch): + + if epoch in [100, 140, 160, 220, 280, 340]: + return True + else: + return False + + +class BaseRunner(object): + + def __init__(self, model, dataloader, device, loss_module, optimizer=None, l2_reg=None, print_interval=10, console=True): + + self.model = model + self.dataloader = dataloader + self.device = device + self.optimizer = optimizer + self.loss_module = loss_module + self.l2_reg = l2_reg + self.print_interval = print_interval + self.printer = utils.Printer(console=console) + + self.epoch_metrics = OrderedDict() + + def train_epoch(self, epoch_num=None): + raise NotImplementedError('Please override in child class') + + def evaluate(self, epoch_num=None, keep_all=True): + raise NotImplementedError('Please override in child class') + + def print_callback(self, i_batch, metrics, prefix=''): + + total_batches = len(self.dataloader) + + template = "{:5.1f}% | batch: {:9d} of {:9d}" + content = [100 * (i_batch / total_batches), i_batch, total_batches] + for met_name, met_value in metrics.items(): + template += "\t|\t{}".format(met_name) + ": {:g}" + content.append(met_value) + + dyn_string = template.format(*content) + dyn_string = prefix + dyn_string + self.printer.print(dyn_string) + + +class UnsupervisedRunner(BaseRunner): + + def train_epoch(self, epoch_num=None): + + self.model = self.model.train() + + epoch_loss = 0 # total loss of epoch + total_active_elements = 0 # total unmasked elements in epoch + for i, batch in enumerate(self.dataloader): + + X, targets, target_masks, padding_masks, IDs = batch + targets = targets.to(self.device) + target_masks = target_masks.to(self.device) # 1s: mask and predict, 0s: unaffected input (ignore) + padding_masks = padding_masks.to(self.device) # 0s: ignore + + predictions = self.model(X.to(self.device), padding_masks) # (batch_size, padded_length, feat_dim) + + # Cascade noise masks (batch_size, padded_length, feat_dim) and padding masks (batch_size, padded_length) + target_masks = target_masks * padding_masks.unsqueeze(-1) + loss = self.loss_module(predictions, targets, target_masks) # (num_active,) individual loss (square error per element) for each active value in batch + batch_loss = torch.sum(loss) + mean_loss = batch_loss / len(loss) # mean loss (over active elements) used for optimization + + if self.l2_reg: + total_loss = mean_loss + self.l2_reg * l2_reg_loss(self.model) + else: + total_loss = mean_loss + + # Zero gradients, perform a backward pass, and update the weights. 
+ self.optimizer.zero_grad() + total_loss.backward() + + # torch.nn.utils.clip_grad_value_(self.model.parameters(), clip_value=1.0) + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=4.0) + self.optimizer.step() + + metrics = {"loss": mean_loss.item()} + if i % self.print_interval == 0: + ending = "" if epoch_num is None else 'Epoch {} '.format(epoch_num) + self.print_callback(i, metrics, prefix='Training ' + ending) + + with torch.no_grad(): + total_active_elements += len(loss) + epoch_loss += batch_loss.item() # add total loss of batch + + epoch_loss = epoch_loss / total_active_elements # average loss per element for whole epoch + self.epoch_metrics['epoch'] = epoch_num + self.epoch_metrics['loss'] = epoch_loss + return self.epoch_metrics + + def evaluate(self, epoch_num=None, keep_all=True): + + self.model = self.model.eval() + + epoch_loss = 0 # total loss of epoch + total_active_elements = 0 # total unmasked elements in epoch + + if keep_all: + per_batch = {'target_masks': [], 'targets': [], 'predictions': [], 'metrics': [], 'IDs': []} + for i, batch in enumerate(self.dataloader): + + X, targets, target_masks, padding_masks, IDs = batch + targets = targets.to(self.device) + target_masks = target_masks.to(self.device) # 1s: mask and predict, 0s: unaffected input (ignore) + padding_masks = padding_masks.to(self.device) # 0s: ignore + + # TODO: for debugging + # input_ok = utils.check_tensor(X, verbose=False, zero_thresh=1e-8, inf_thresh=1e4) + # if not input_ok: + # print("Input problem!") + # ipdb.set_trace() + # + # utils.check_model(self.model, verbose=False, stop_on_error=True) + + predictions = self.model(X.to(self.device), padding_masks) # (batch_size, padded_length, feat_dim) + + # Cascade noise masks (batch_size, padded_length, feat_dim) and padding masks (batch_size, padded_length) + target_masks = target_masks * padding_masks.unsqueeze(-1) + loss = self.loss_module(predictions, targets, target_masks) # (num_active,) individual loss (square error per element) for each active value in batch + batch_loss = torch.sum(loss).cpu().item() + try: + mean_loss = batch_loss / len(loss) # mean loss (over active elements) used for optimization the batch + except ZeroDivisionError as err: + print('loss length is zero: target_masks {} \n targets {} \n predictions {}'.format(target_masks, targets, predictions)) + mean_loss = np.nan + + if keep_all: + per_batch['target_masks'].append(target_masks.cpu().numpy()) + per_batch['targets'].append(targets.cpu().numpy()) + per_batch['predictions'].append(predictions.cpu().numpy()) + per_batch['metrics'].append([loss.cpu().numpy()]) + per_batch['IDs'].append(IDs) + + metrics = {"loss": mean_loss} + if i % self.print_interval == 0: + ending = "" if epoch_num is None else 'Epoch {} '.format(epoch_num) + self.print_callback(i, metrics, prefix='Evaluating ' + ending) + + total_active_elements += len(loss) + epoch_loss += batch_loss # add total loss of batch + + epoch_loss = epoch_loss / total_active_elements # average loss per element for whole epoch + self.epoch_metrics['epoch'] = epoch_num + self.epoch_metrics['loss'] = epoch_loss + + if keep_all: + return self.epoch_metrics, per_batch + else: + return self.epoch_metrics + + +class SupervisedRunner(BaseRunner): + + def __init__(self, *args, **kwargs): + + super(SupervisedRunner, self).__init__(*args, **kwargs) + + if isinstance(args[3], torch.nn.CrossEntropyLoss): + self.classification = True # True if classification, False if regression + self.analyzer = 
analysis.Analyzer(print_conf_mat=True) + else: + self.classification = False + + def train_epoch(self, epoch_num=None): + + self.model = self.model.train() + + epoch_loss = 0 # total loss of epoch + total_samples = 0 # total samples in epoch + + for i, batch in enumerate(self.dataloader): + + X, targets, padding_masks, IDs = batch + targets = targets.to(self.device) + padding_masks = padding_masks.to(self.device) # 0s: ignore + # regression: (batch_size, num_labels); classification: (batch_size, num_classes) of logits + predictions = self.model(X.to(self.device), padding_masks) + + loss = self.loss_module(predictions, targets) # (batch_size,) loss for each sample in the batch + batch_loss = torch.sum(loss) + mean_loss = batch_loss / len(loss) # mean loss (over samples) used for optimization + + if self.l2_reg: + total_loss = mean_loss + self.l2_reg * l2_reg_loss(self.model) + else: + total_loss = mean_loss + + # Zero gradients, perform a backward pass, and update the weights. + self.optimizer.zero_grad() + total_loss.backward() + + # torch.nn.utils.clip_grad_value_(self.model.parameters(), clip_value=1.0) + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=4.0) + self.optimizer.step() + + metrics = {"loss": mean_loss.item()} + if i % self.print_interval == 0: + ending = "" if epoch_num is None else 'Epoch {} '.format(epoch_num) + self.print_callback(i, metrics, prefix='Training ' + ending) + + with torch.no_grad(): + total_samples += len(loss) + epoch_loss += batch_loss.item() # add total loss of batch + + epoch_loss = epoch_loss / total_samples # average loss per sample for whole epoch + self.epoch_metrics['epoch'] = epoch_num + self.epoch_metrics['loss'] = epoch_loss + return self.epoch_metrics + + def evaluate(self, epoch_num=None, keep_all=True): + + self.model = self.model.eval() + + epoch_loss = 0 # total loss of epoch + total_samples = 0 # total samples in epoch + + per_batch = {'target_masks': [], 'targets': [], 'predictions': [], 'metrics': [], 'IDs': []} + for i, batch in enumerate(self.dataloader): + + X, targets, padding_masks, IDs = batch + targets = targets.to(self.device) + padding_masks = padding_masks.to(self.device) # 0s: ignore + # regression: (batch_size, num_labels); classification: (batch_size, num_classes) of logits + predictions = self.model(X.to(self.device), padding_masks) + + loss = self.loss_module(predictions, targets) # (batch_size,) loss for each sample in the batch + batch_loss = torch.sum(loss).cpu().item() + mean_loss = batch_loss / len(loss) # mean loss (over samples) + + per_batch['targets'].append(targets.cpu().numpy()) + per_batch['predictions'].append(predictions.cpu().numpy()) + per_batch['metrics'].append([loss.cpu().numpy()]) + per_batch['IDs'].append(IDs) + + metrics = {"loss": mean_loss} + if i % self.print_interval == 0: + ending = "" if epoch_num is None else 'Epoch {} '.format(epoch_num) + self.print_callback(i, metrics, prefix='Evaluating ' + ending) + + total_samples += len(loss) + epoch_loss += batch_loss # add total loss of batch + + epoch_loss = epoch_loss / total_samples # average loss per element for whole epoch + self.epoch_metrics['epoch'] = epoch_num + self.epoch_metrics['loss'] = epoch_loss + + if self.classification: + predictions = torch.from_numpy(np.concatenate(per_batch['predictions'], axis=0)) + probs = torch.nn.functional.softmax(predictions) # (total_samples, num_classes) est. prob. 
for each class and sample + predictions = torch.argmax(probs, dim=1).cpu().numpy() # (total_samples,) int class index for each sample + probs = probs.cpu().numpy() + targets = np.concatenate(per_batch['targets'], axis=0).flatten() + class_names = np.arange(probs.shape[1]) # TODO: temporary until I decide how to pass class names + metrics_dict = self.analyzer.analyze_classification(predictions, targets, class_names) + + self.epoch_metrics['accuracy'] = metrics_dict['total_accuracy'] # same as average recall over all classes + self.epoch_metrics['precision'] = metrics_dict['prec_avg'] # average precision over all classes + + if self.model.num_classes == 2: + false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(targets, probs[:, 1]) # 1D scores needed + self.epoch_metrics['AUROC'] = sklearn.metrics.auc(false_pos_rate, true_pos_rate) + + prec, rec, _ = sklearn.metrics.precision_recall_curve(targets, probs[:, 1]) + self.epoch_metrics['AUPRC'] = sklearn.metrics.auc(rec, prec) + + if keep_all: + return self.epoch_metrics, per_batch + else: + return self.epoch_metrics diff --git a/utils/analysis.py b/utils/analysis.py new file mode 100644 index 0000000..8699ee5 --- /dev/null +++ b/utils/analysis.py @@ -0,0 +1,478 @@ +""" +Collection of functions which enable the evaluation of a classifier's performance, +by showing confusion matrix, accuracy, recall, precision etc. +""" + +import numpy as np +import sys + +import matplotlib.pyplot as plt + +from sklearn import metrics +from tabulate import tabulate +import math +import logging +from datetime import datetime + + +def acc_top_k(predictions, y_true): + """Accuracy when allowing for correct class being in the top k predictions. + + Arguments: + predictions: (N_samples, k) array of top class indices (pre-sorted class indices based on score) per sample + y_true: N_samples 1D-array of ground truth labels (integer indices) + Returns: + length k 1D-array of accuracy when allowing for correct class being in top 1, 2, ... k predictions""" + + y_true = y_true[:, np.newaxis] + + # Create upper triangular matrix of ones, to be used in construction of V + building_blocks = np.zeros((predictions.shape[1], predictions.shape[1])) + building_blocks[np.triu_indices(predictions.shape[1])] = 1 + + # A matrix of the same shape as predictions. For each sample, the index corresponding + # to a correct prediction is 1, as well as all following indices. + # Example: y_true = [1,0], predictions = [[1 5 4],[2 0 3]]. 
Then: V = [[1 1 1],[0 1 1]] + V = np.zeros_like(predictions, dtype=int) # validity matrix + sample_ind, rank_ind = np.where(predictions == y_true) + + V[sample_ind, :] = building_blocks[rank_ind, :] + + return np.mean(V, axis=0) + + +def accuracy(y_pred, y_true, excluded_labels=None): + """A simple accuracy calculator, which can ignore labels specified in a list""" + + if excluded_labels is None: + return np.mean(y_pred == y_true) + else: + included = (y_pred != excluded_labels[0]) & (y_true != excluded_labels[0]) + # The following extra check (rather than initializing with an array of ones) + # is done because a single excluded label is the most common case + if len(excluded_labels) > 1: + for label in excluded_labels[1:]: + included &= (y_pred != label) & (y_true != label) + + return np.mean(y_pred[included] == y_true[included]) + + +def precision(y_true, y_pred, label): + """Returns precision for the specified class index""" + + predicted_in_C = (y_pred == label) + num_pred_in_C = np.sum(predicted_in_C) + if num_pred_in_C == 0: + return 0 + return np.sum(y_true[predicted_in_C] == label) / num_pred_in_C + + +def recall(y_true, y_pred, label): + """Returns recall for the specified class index""" + + truly_in_C = (y_true == label) + num_truly_in_C = np.sum(truly_in_C) + if num_truly_in_C == 0: + return 0 # or NaN? + return np.sum(y_pred[truly_in_C] == label) / num_truly_in_C + + +def limiter(metric_functions, y_true, y_pred, y_scores, score_thr, label): + """Wraps a list of metric functions, i.e precison or recall, by ingoring predictions under the + specified threshold for a specific class. + """ + + ltd_pred = np.copy(y_pred) + ltd_pred[(ltd_pred == label) & (y_scores < score_thr)] = -1 + + output = [func(y_true, ltd_pred, label) for func in metric_functions] + + return output + + +def prec_rec_parametrized_by_thr(y_true, y_pred, y_scores, label, Npoints, min_score=None, max_score=None): + """Returns an array showing for a specified class of interest, how precision and recall change as a function of + the score threshold (parameter). + + Input: + y_true: 1D array of true labels (class indices) + y_pred: 1D array of predicted labels (class indices) + y_scores: 1D array of scores corresponding to predictions in y_pred + label: class label of interest + Npoints: number of score threshold points. Defines "resolution" of the parameter (score threshold) + min_score, max_score: if specified, they impose lower and upper bound limits for the parameter (score thr.) + Output: + prec_rec: ndarray of shape (Npoints, 2), containing a precision (column 0) and recall (column 1) value for each + score threshold value + """ + + if (min_score is None) or (max_score is None): + predicted_in_C = (y_pred == label) + min_score = 0.99 * np.amin(y_scores[predicted_in_C]) # guarantees that all predictions are kept + max_score = 1.01 * np.amax(y_scores[predicted_in_C]) # guarantees that no prediction is kept + + grid = np.linspace(min_score, max_score, Npoints) + + measure = lambda x: limiter([precision, recall], y_true, y_pred, y_scores, x, label) + + return np.array(map(measure, grid)), grid + + +def plot_prec_vs_rec(score_grid, rec, prec, prec_requirement=None, thr_opt=None, title=None, show=True, save_as=None): + """Plots a figure depicting precision and recall as a function of the score threshold. 
+ Optionally also depicts an imposed precision requirement and a chosen score threshold value.""" + + if not (thr_opt is None): + thr_opt = thr_opt if not (math.isinf(thr_opt)) else None + + plt.figure() + if title: + plt.suptitle(title) + + # Recall and Precision vs. Score Threshold + plt.subplot(211) + l_rec, = plt.plot(score_grid, rec, '.-') + + plt.hold(True) + l_prec, = plt.plot(score_grid, prec, 'g.-') + plt.ylim((0, 1.01)) + plt.xlabel('score threshold') + + legend_lines = [l_rec, l_prec] + legend_labels = ['recall', 'precision'] + + if prec_requirement: + l_prec_req = plt.axhline(prec_requirement, color='r', linestyle='--') + legend_lines.append(l_prec_req) + legend_labels.append('prec. req.') + + if not (thr_opt is None): + l_score_thr = plt.axvline(thr_opt, color='r') + legend_lines.append(l_score_thr) + legend_labels.append('opt. thr.') + + plt.legend(legend_lines, legend_labels, loc='lower right', fontsize=10) + + # Recall vs. Precision + plt.subplot(212) + plt.plot(prec, rec, '.-') + + plt.ylim((0, 1.01)) + plt.xlim((0, 1.01)) + plt.ylabel('recall') + plt.xlabel('precision') + + if prec_requirement: + l_prec_req = plt.axvline(prec_requirement, color='r', linestyle='--') + plt.legend([l_prec_req], ['precision req.'], loc='lower left', fontsize=10) + + if save_as: + plt.savefig(save_as, bbox_inches='tight', format='pdf') + + if show: + plt.tight_layout() + plt.show(block=False) + + +def plot_confusion_matrix(ConfMat, label_strings=None, title='Confusion matrix', cmap=plt.cm.get_cmap('Blues')): + """Plot confusion matrix in a separate window""" + plt.imshow(ConfMat, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + if label_strings: + tick_marks = np.arange(len(label_strings)) + plt.xticks(tick_marks, label_strings, rotation=90) + plt.yticks(tick_marks, label_strings) + plt.tight_layout() + plt.ylabel('True label') + plt.xlabel('Predicted label') + + +def print_confusion_matrix(ConfMat, label_strings=None, title='Confusion matrix'): + """Print confusion matrix as text to terminal""" + + if label_strings is None: + label_strings = ConfMat.shape[0] * [''] + + print(title) + print(len(title) * '-') + # Make printable matrix: + print_mat = [] + for i, row in enumerate(ConfMat): + print_mat.append([label_strings[i]] + list(row)) + print(tabulate(print_mat, headers=['True\Pred'] + label_strings, tablefmt='orgtbl')) + + +class Analyzer(object): + + def __init__(self, maxcharlength=35, plot=False, print_conf_mat=False, output_filepath=None): + + self.maxcharlength = maxcharlength + self.plot = plot + self.print_conf_mat = print_conf_mat + + # create logger + self.logID = str( + datetime.now()) # this is to enable individual logging configuration between different instances + self.logger = logging.getLogger(self.logID) + self.logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(message)s') + + # create console handler + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.INFO) + ch.setFormatter(formatter) + self.logger.addHandler(ch) + + if output_filepath: + # create file handler + fh = logging.FileHandler(output_filepath) + fh.setLevel(logging.INFO) + fh.setFormatter(formatter) + self.logger.addHandler(fh) + + def show_acc_top_k_improvement(self, y_pred, y_true, k=5, inp='scores'): + """ + Show how accuracy improves when considering the event of the correct label being among the top k predictions as a successful prediction + Arguments: + k: integer k mentioned above + inp: string, one of 'scores' or 'indices', defining assumptions for 
`y_pred`, see below + y_pred: If inp is 'indices', then this is a (N_samples, k) array of top class indices (pre-sorted class indices based on score) per sample + If inp is 'scores', then this is assummed to be a (N_samples, C) array of class scores per sample, where C is the number of classes + y_true: (N_samples,) 1D numpy array of ground truth labels (integer indices) + """ + + print('How accuracy improves when allowing correct result being in the top 1, 2, ..., k predictions:\n') + + if inp == 'scores': + predictions = np.argsort(y_pred, axis=1)[:, ::-1] # sort in descending order + else: + predictions = y_pred + + predictions = predictions[:, :min(k, predictions.shape[1])] # take top k + + accuracy_per_rank = acc_top_k(predictions, y_true) + + row1 = ['k'] + range(1, len(accuracy_per_rank) + 1) + row2 = ['Accuracy'] + list(accuracy_per_rank) + print(tabulate([row1, row2], tablefmt='orgtbl')) + + if self.plot: + from matplotlib.ticker import MaxNLocator + + ax = plt.figure().gca() + plt.plot(np.arange(1, k + 1, dtype=int), accuracy_per_rank, '.-') + ax.xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.xlabel('Number of allowed predictions (k)') + plt.ylabel('Cumulative accuracy\n(prob. of correct result being in top k pred.)') + plt.title('Cumulative Accuracy vs Number of allowed predictions') + + plt.show(block=False) + + return accuracy_per_rank + + def generate_classification_report(self, digits=3, number_of_thieves=2, maxcharlength=35): + """ + Returns a string of a report for given metric arrays (array length equals the number of classes). + Called internally by `analyze_classification`. + digits: number of digits after . for displaying results + number_of_thieves: number of biggest thieves to report + maxcharlength: max. number of characters to use when displaying thief names + """ + + relative_freq = self.support / np.sum(self.support) # relative frequencies of each class in the true lables + sorted_class_indices = np.argsort(relative_freq)[ + ::-1] # sort by "importance" of classes (i.e. occurance frequency) + + last_line_heading = 'avg / total' + + width = max(len(cn) for cn in self.existing_class_names) + width = max(width, len(last_line_heading), digits) + + headers = ["precision", "recall", "f1-score", "rel. freq.", "abs. freq.", "biggest thieves"] + fmt = '%% %ds' % width # first column: class name + fmt += ' ' + fmt += ' '.join(['% 10s' for _ in headers[:-1]]) + fmt += '|\t % 5s' + fmt += '\n' + + headers = [""] + headers + report = fmt % tuple(headers) + report += '\n' + + for i in sorted_class_indices: + values = [self.existing_class_names[i]] + for v in (self.precision[i], self.recall[i], self.f1[i], + relative_freq[i]): # v is NOT a tuple, just goes through this list 1 el. at a time + values += ["{0:0.{1}f}".format(v, digits)] + values += ["{}".format(self.support[i])] + thieves = np.argsort(self.ConfMatrix_normalized_row[i, :])[::-1][ + :number_of_thieves + 1] # other class indices "stealing" from class. 
May still contain self + thieves = thieves[thieves != i] # exclude self at this point + steal_ratio = self.ConfMatrix_normalized_row[i, thieves] + thieves_names = [ + self.existing_class_names[thief][:min(maxcharlength, len(self.existing_class_names[thief]))] for thief + in thieves] # a little inefficient but inconsequential + string_about_stealing = "" + for j in range(len(thieves)): + string_about_stealing += "{0}: {1:.3f},\t".format(thieves_names[j], steal_ratio[j]) + values += [string_about_stealing] + + report += fmt % tuple(values) + + report += '\n' + 100 * '-' + '\n' + + # compute averages/sums + values = [last_line_heading] + for v in (np.average(self.precision, weights=relative_freq), + np.average(self.recall, weights=relative_freq), + np.average(self.f1, weights=relative_freq)): + values += ["{0:0.{1}f}".format(v, digits)] + values += ['{0}'.format(np.sum(relative_freq))] + values += ['{0}'.format(np.sum(self.support))] + values += [''] + + # make last ("Total") line for report + report += fmt % tuple(values) + + return report + + def get_avg_prec_recall(self, ConfMatrix, existing_class_names, excluded_classes=None): + """Get average recall and precision, using class frequencies as weights, optionally excluding + specified classes""" + + class2ind = dict(zip(existing_class_names, range(len(existing_class_names)))) + included_c = np.full(len(existing_class_names), 1, dtype=bool) + + if not (excluded_classes is None): + excl_ind = [class2ind[excl_class] for excl_class in excluded_classes] + included_c[excl_ind] = False + + pred_per_class = np.sum(ConfMatrix, axis=0) + nonzero_pred = (pred_per_class > 0) + + included = included_c & nonzero_pred + support = np.sum(ConfMatrix, axis=1) + weights = support[included] / np.sum(support[included]) + + prec = np.diag(ConfMatrix[included, :][:, included]) / pred_per_class[included] + prec_avg = np.dot(weights, prec) + + # rec = np.diag(ConfMatrix[included_c,:][:,included_c])/support[included_c] + rec_avg = np.trace(ConfMatrix[included_c, :][:, included_c]) / np.sum(support[included_c]) + + return prec_avg, rec_avg + + def prec_rec_histogram(self, precision, recall, binedges=None): + """Make a histogram with the distribution of classes with respect to precision and recall + """ + + if binedges is None: + binedges = np.concatenate((np.arange(0, 0.6, 0.2), np.arange(0.6, 1.01, 0.1)), axis=0) + binedges = np.append(binedges, binedges[-1] + 0.1) # add 1 extra bin at the end for >= 1 + + hist_precision, binedges = np.histogram(precision, binedges) + hist_recall, binedges = np.histogram(recall, binedges) + + print("\n\nDistribution of classes with respect to PRECISION: ") + for b in range(len(binedges) - 1): + print("[{:.1f}, {:.1f}): {}".format(binedges[b], binedges[b + 1], hist_precision[b])) + + print("\n\nDistribution of classes with respect to RECALL: ") + for b in range(len(binedges) - 1): + print("[{:.1f}, {:.1f}): {}".format(binedges[b], binedges[b + 1], hist_recall[b])) + + if self.plot: + plt.figure() + plt.subplot(121) + widths = np.diff(binedges) + plt.bar(binedges[:-1], hist_precision, width=widths, align='edge') + plt.xlim(0, 1) + ax = plt.gca() + ax.set_xticks(binedges) + plt.xlabel('Precision') + plt.ylabel('Number of classes') + plt.title("Distribution of classes with respect to precision") + + plt.subplot(122) + widths = np.diff(binedges) + plt.bar(binedges[:-1], hist_recall, width=widths, align='edge') + plt.xlim(0, 1) + ax = plt.gca() + ax.set_xticks(binedges) + plt.xlabel('Recall') + plt.ylabel('Number of classes') + 
plt.title("Distribution of classes with respect to recall") + + plt.show(block=False) + + def analyze_classification(self, y_pred, y_true, class_names, excluded_classes=None): + """ + For an array of label predictions and the respective true labels, shows confusion matrix, accuracy, recall, precision etc: + Input: + y_pred: 1D array of predicted labels (class indices) + y_true: 1D array of true labels (class indices) + class_names: 1D array or list of class names in the order of class indices. + Could also be integers [0, 1, ..., num_classes-1]. + excluded_classes: list of classes to be excluded from average precision, recall calculation (e.g. OTHER) + """ + + # Trim class_names to include only classes existing in y_pred OR y_true + in_pred_labels = set(list(y_pred)) + in_true_labels = set(list(y_true)) + + self.existing_class_ind = sorted(list(in_pred_labels | in_true_labels)) + class_strings = [str(name) for name in class_names] # needed in case `class_names` elements are not strings + self.existing_class_names = [class_strings[ind][:min(self.maxcharlength, len(class_strings[ind]))] for ind in + self.existing_class_ind] # a little inefficient but inconsequential + + # Confusion matrix + ConfMatrix = metrics.confusion_matrix(y_true, y_pred) + + if self.print_conf_mat: + print_confusion_matrix(ConfMatrix, label_strings=self.existing_class_names, title='Confusion matrix') + print('\n') + if self.plot: + plt.figure() + plot_confusion_matrix(ConfMatrix, self.existing_class_names) + + # Normalize the confusion matrix by row (i.e by the number of samples in each class) + self.ConfMatrix_normalized_row = ConfMatrix.astype('float') / ConfMatrix.sum(axis=1)[:, np.newaxis] + + if self.print_conf_mat: + print_confusion_matrix(self.ConfMatrix_normalized_row, label_strings=self.existing_class_names, + title='Confusion matrix normalized by row') + print('\n') + if self.plot: + plt.figure() + plot_confusion_matrix(self.ConfMatrix_normalized_row, label_strings=self.existing_class_names, + title='Confusion matrix normalized by row') + + plt.show(block=False) + + # Analyze results + self.total_accuracy = np.trace(ConfMatrix) / len(y_true) + print('Overall accuracy: {:.3f}\n'.format(self.total_accuracy)) + + # returns metrics for each class, in the same order as existing_class_names + self.precision, self.recall, self.f1, self.support = metrics.precision_recall_fscore_support(y_true, y_pred, + labels=self.existing_class_ind) + + # Print report + print(self.generate_classification_report()) + + # Calculate average precision and recall + self.prec_avg, self.rec_avg = self.get_avg_prec_recall(ConfMatrix, self.existing_class_names, excluded_classes) + if excluded_classes: + print( + "\nAverage PRECISION: {:.2f}\n(using class frequencies as weights, excluding classes with no predictions and predictions in '{}')".format( + self.prec_avg, ', '.join(excluded_classes))) + print( + "\nAverage RECALL (= ACCURACY): {:.2f}\n(using class frequencies as weights, excluding classes in '{}')".format( + self.rec_avg, ', '.join(excluded_classes))) + + # Make a histogram with the distribution of classes with respect to precision and recall + self.prec_rec_histogram(self.precision, self.recall) + + return {"total_accuracy": self.total_accuracy, "precision": self.precision, "recall": self.recall, + "f1": self.f1, "support": self.support, "prec_avg": self.prec_avg, "rec_avg": self.rec_avg} \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..1d4f5f4 --- /dev/null +++ 
b/utils/utils.py @@ -0,0 +1,343 @@ +import json +import os +import sys +import builtins +import functools +import time +import ipdb +from copy import deepcopy + +import numpy as np +import torch +import xlrd +import xlwt +from xlutils.copy import copy + +import logging +logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) + + +def timer(func): + """Print the runtime of the decorated function""" + @functools.wraps(func) + def wrapper_timer(*args, **kwargs): + start_time = time.perf_counter() # 1 + value = func(*args, **kwargs) + end_time = time.perf_counter() # 2 + run_time = end_time - start_time # 3 + print(f"Finished {func.__name__!r} in {run_time} secs") + return value + return wrapper_timer + + +def save_model(path, epoch, model, optimizer=None): + if isinstance(model, torch.nn.DataParallel): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + data = {'epoch': epoch, + 'state_dict': state_dict} + if not (optimizer is None): + data['optimizer'] = optimizer.state_dict() + torch.save(data, path) + + +def load_model(model, model_path, optimizer=None, resume=False, change_output=False, + lr=None, lr_step=None, lr_factor=None): + start_epoch = 0 + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + state_dict = deepcopy(checkpoint['state_dict']) + if change_output: + for key, val in checkpoint['state_dict'].items(): + if key.startswith('output_layer'): + state_dict.pop(key) + model.load_state_dict(state_dict, strict=False) + print('Loaded model from {}. Epoch: {}'.format(model_path, checkpoint['epoch'])) + + # resume optimizer parameters + if optimizer is not None and resume: + if 'optimizer' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + start_lr = lr + for i in range(len(lr_step)): + if start_epoch >= lr_step[i]: + start_lr *= lr_factor[i] + for param_group in optimizer.param_groups: + param_group['lr'] = start_lr + print('Resumed optimizer with start lr', start_lr) + else: + print('No optimizer parameters in checkpoint.') + if optimizer is not None: + return model, optimizer, start_epoch + else: + return model + + +def load_config(config_filepath): + """ + Using a json file with the master configuration (config file for each part of the pipeline), + return a dictionary containing the entire configuration settings in a hierarchical fashion. 
+ """ + + with open(config_filepath) as cnfg: + config = json.load(cnfg) + + return config + + +def create_dirs(dirs): + """ + Input: + dirs: a list of directories to create, in case these directories are not found + Returns: + exit_code: 0 if success, -1 if failure + """ + try: + for dir_ in dirs: + if not os.path.exists(dir_): + os.makedirs(dir_) + return 0 + except Exception as err: + print("Creating directories error: {0}".format(err)) + exit(-1) + + +def export_performance_metrics(filepath, metrics_table, header, book=None, sheet_name="metrics"): + """Exports performance metrics on the validation set for all epochs to an excel file""" + + if book is None: + book = xlwt.Workbook() # new excel work book + + book = write_table_to_sheet([header] + metrics_table, book, sheet_name=sheet_name) + + book.save(filepath) + logger.info("Exported per epoch performance metrics in '{}'".format(filepath)) + + return book + + +def write_row(sheet, row_ind, data_list): + """Write a list to row_ind row of an excel sheet""" + + row = sheet.row(row_ind) + for col_ind, col_value in enumerate(data_list): + row.write(col_ind, col_value) + return + + +def write_table_to_sheet(table, work_book, sheet_name=None): + """Writes a table implemented as a list of lists to an excel sheet in the given work book object""" + + sheet = work_book.add_sheet(sheet_name) + + for row_ind, row_list in enumerate(table): + write_row(sheet, row_ind, row_list) + + return work_book + + +def export_record(filepath, values): + """Adds a list of values as a bottom row of a table in a given excel file""" + + read_book = xlrd.open_workbook(filepath, formatting_info=True) + read_sheet = read_book.sheet_by_index(0) + last_row = read_sheet.nrows + + work_book = copy(read_book) + sheet = work_book.get_sheet(0) + write_row(sheet, last_row, values) + work_book.save(filepath) + + +def register_record(filepath, timestamp, experiment_name, best_metrics, final_metrics=None, comment=''): + """ + Adds the best and final metrics of a given experiment as a record in an excel sheet with other experiment records. + Creates excel sheet if it doesn't exist. + Args: + filepath: path of excel file keeping records + timestamp: string + experiment_name: string + best_metrics: dict of metrics at best epoch {metric_name: metric_value}. Includes "epoch" as first key + final_metrics: dict of metrics at final epoch {metric_name: metric_value}. Includes "epoch" as first key + comment: optional description + """ + metrics_names, metrics_values = zip(*best_metrics.items()) + row_values = [timestamp, experiment_name, comment] + list(metrics_values) + if final_metrics is not None: + final_metrics_names, final_metrics_values = zip(*final_metrics.items()) + row_values += list(final_metrics_values) + + if not os.path.exists(filepath): # Create a records file for the first time + logger.warning("Records file '{}' does not exist! 
Creating new file ...".format(filepath)) + directory = os.path.dirname(filepath) + if len(directory) and not os.path.exists(directory): + os.makedirs(directory) + header = ["Timestamp", "Name", "Comment"] + ["Best " + m for m in metrics_names] + if final_metrics is not None: + header += ["Final " + m for m in final_metrics_names] + book = xlwt.Workbook() # excel work book + book = write_table_to_sheet([header, row_values], book, sheet_name="records") + book.save(filepath) + else: + try: + export_record(filepath, row_values) + except Exception as x: + alt_path = os.path.join(os.path.dirname(filepath), "record_" + experiment_name) + logger.error("Failed saving in: '{}'! Will save here instead: {}".format(filepath, alt_path)) + export_record(alt_path, row_values) + filepath = alt_path + + logger.info("Exported performance record to '{}'".format(filepath)) + + +class Printer(object): + """Class for printing output by refreshing the same line in the console, e.g. for indicating progress of a process""" + + def __init__(self, console=True): + + if console: + self.print = self.dyn_print + else: + self.print = builtins.print + + @staticmethod + def dyn_print(data): + """Print things to stdout on one line, refreshing it dynamically""" + sys.stdout.write("\r\x1b[K" + data.__str__()) + sys.stdout.flush() + + +def readable_time(time_difference): + """Convert a float measuring time difference in seconds into a tuple of (hours, minutes, seconds)""" + + hours = time_difference // 3600 + minutes = (time_difference // 60) % 60 + seconds = time_difference % 60 + + return hours, minutes, seconds + + +# def check_model1(model, verbose=False, stop_on_error=False): +# status_ok = True +# for name, param in model.named_parameters(): +# nan_grads = torch.isnan(param.grad) +# nan_params = torch.isnan(param) +# if nan_grads.any() or nan_params.any(): +# status_ok = False +# print("Param {}: {}/{} nan".format(name, torch.sum(nan_params), param.numel())) +# if verbose: +# print(param) +# print("Grad {}: {}/{} nan".format(name, torch.sum(nan_grads), param.grad.numel())) +# if verbose: +# print(param.grad) +# if stop_on_error: +# ipdb.set_trace() +# if status_ok: +# print("Model Check: OK") +# else: +# print("Model Check: PROBLEM") + + +def check_model(model, verbose=False, zero_thresh=1e-8, inf_thresh=1e6, stop_on_error=False): + status_ok = True + for name, param in model.named_parameters(): + param_ok = check_tensor(param, verbose=verbose, zero_thresh=zero_thresh, inf_thresh=inf_thresh) + if not param_ok: + status_ok = False + print("Parameter '{}' PROBLEM".format(name)) + grad_ok = True + if param.grad is not None: + grad_ok = check_tensor(param.grad, verbose=verbose, zero_thresh=zero_thresh, inf_thresh=inf_thresh) + if not grad_ok: + status_ok = False + print("Gradient of parameter '{}' PROBLEM".format(name)) + if stop_on_error and not (param_ok and grad_ok): + ipdb.set_trace() + + if status_ok: + print("Model Check: OK") + else: + print("Model Check: PROBLEM") + + +def check_tensor(X, verbose=True, zero_thresh=1e-8, inf_thresh=1e6): + + is_nan = torch.isnan(X) + if is_nan.any(): + print("{}/{} nan".format(torch.sum(is_nan), X.numel())) + return False + + num_small = torch.sum(torch.abs(X) < zero_thresh) + num_large = torch.sum(torch.abs(X) > inf_thresh) + + if verbose: + print("Shape: {}, {} elements".format(X.shape, X.numel())) + print("No 'nan' values") + print("Min: {}".format(torch.min(X))) + print("Median: {}".format(torch.median(X))) + print("Max: {}".format(torch.max(X))) + + print("Histogram of values:") 
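+        # NOTE: the .numpy() call below assumes X lives on the CPU; torch raises an error when
+        # converting a CUDA tensor directly, so callers may need to pass X.cpu().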
+ values = X.view(-1).detach().numpy() + hist, binedges = np.histogram(values, bins=20) + for b in range(len(binedges) - 1): + print("[{}, {}): {}".format(binedges[b], binedges[b + 1], hist[b])) + + print("{}/{} abs. values < {}".format(num_small, X.numel(), zero_thresh)) + print("{}/{} abs. values > {}".format(num_large, X.numel(), inf_thresh)) + + if num_large: + print("{}/{} abs. values > {}".format(num_large, X.numel(), inf_thresh)) + return False + + return True + + +def count_parameters(model, trainable=False): + if trainable: + return sum(p.numel() for p in model.parameters() if p.requires_grad) + else: + return sum(p.numel() for p in model.parameters()) + + +def recursively_hook(model, hook_fn): + for name, module in model.named_children(): #model._modules.items(): + if len(list(module.children())) > 0: # if not leaf node + for submodule in module.children(): + recursively_hook(submodule, hook_fn) + else: + module.register_forward_hook(hook_fn) + + +def compute_loss(net: torch.nn.Module, + dataloader: torch.utils.data.DataLoader, + loss_function: torch.nn.Module, + device: torch.device = 'cpu') -> torch.Tensor: + """Compute the loss of a network on a given dataset. + + Does not compute gradient. + + Parameters + ---------- + net: + Network to evaluate. + dataloader: + Iterator on the dataset. + loss_function: + Loss function to compute. + device: + Torch device, or :py:class:`str`. + + Returns + ------- + Loss as a tensor with no grad. + """ + running_loss = 0 + with torch.no_grad(): + for x, y in dataloader: + netout = net(x.to(device)).cpu() + running_loss += loss_function(y, netout) + + return running_loss / len(dataloader)
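+
+
+if __name__ == '__main__':
+    # Minimal usage sketch for `compute_loss` (illustrative only): the toy network,
+    # dataset and loss below are hypothetical placeholders, not objects defined
+    # elsewhere in this repository.
+    from torch.utils.data import DataLoader, TensorDataset
+
+    toy_x = torch.randn(32, 10)
+    toy_y = torch.randn(32, 1)
+    toy_loader = DataLoader(TensorDataset(toy_x, toy_y), batch_size=8)
+    toy_net = torch.nn.Linear(10, 1)
+    # compute_loss passes (target, prediction) to the loss function, in that order
+    print(compute_loss(toy_net, toy_loader, torch.nn.MSELoss(), device='cpu'))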