diff --git a/datasets/__init__.py b/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datasets/data.py b/datasets/data.py new file mode 100644 index 0000000..f73410d --- /dev/null +++ b/datasets/data.py @@ -0,0 +1,818 @@ +from typing import Optional +import os +from multiprocessing import Pool, cpu_count +import glob +import re +import logging +from itertools import repeat, chain + +import numpy as np +import pandas as pd +from tqdm import tqdm +from sktime.utils import load_data +from sklearn import preprocessing +from datetime import datetime + +import matplotlib.pyplot as plt +from datasets import utils + +logger = logging.getLogger('__main__') + +class Normalizer(object): + """ + Normalizes dataframe across ALL contained rows (time steps). Different from per-sample normalization. + """ + + def __init__(self, norm_type, mean=None, std=None, min_val=None, max_val=None): + """ + Args: + norm_type: choose from: + "standardization", "minmax": normalizes dataframe across ALL contained rows (time steps) + "per_sample_std", "per_sample_minmax": normalizes each sample separately (i.e. across only its own rows) + mean, std, min_val, max_val: optional (num_feat,) Series of pre-computed values + """ + + self.norm_type = norm_type + self.mean = mean + self.std = std + self.min_val = min_val + self.max_val = max_val + self.types = None + self.labelEncoder = {} + + def normalize(self, df): + """ + Args: + df: input dataframe + Returns: + df: normalized dataframe + """ + # logger.info(" Normalize : shape {} \n columns {}", .format(df.shape, df.head())) + self.types = df.dtypes + # if any feature in Numeric then done label-encoding; + charcolumns = df.columns[self.types == "object"] + for cols in charcolumns: + self.labelEncoder[cols] = preprocessing.LabelEncoder().fit(df[cols]) + df[cols] = self.labelEncoder[cols].transform(df[cols]) + + if self.norm_type == "standardization": + if self.mean is None: + self.mean = df.mean() + self.std = df.std() + return (df - self.mean) / (self.std + np.finfo(float).eps) + + elif self.norm_type == "minmax": + if self.max_val is None: + self.max_val = df.max() + self.min_val = df.min() + logger.info("normalize: \nmax {} \n min{} ".format(self.max_val, self.min_val)) + return (df - self.min_val) / (self.max_val - self.min_val + np.finfo(float).eps) + + elif self.norm_type == "per_sample_std": + grouped = df.groupby(by=df.index) + self.groupedidx = grouped.indices + self.groupedmean = grouped.transform('mean') + self.groupedstddev = grouped.transform('std') + return (df - self.groupedmean) / self.groupedstddev + + elif self.norm_type == "per_sample_minmax": + grouped = df.groupby(by=df.index) + self.groupedidx = grouped.indices + self.groupedmin_vals = grouped.transform('min') + self.groupedmax_vals = grouped.transform('max') + return (df - self.groupedmin_vals) / (self.groupedmax_vals - self.groupedmin_vals + np.finfo(float).eps) + + else: + raise (NameError(f'Normalize method "{self.norm_type}" not implemented')) + + def inverse_normalize(self, df:pd.DataFrame=None): + """reverses the transformation done with the method that was used normalize function above. 
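# Illustrative usage sketch for the Normalizer class above (editor's example, not code from this
# patch). The toy dataframe and its column names are hypothetical; only Normalizer itself is real.
import pandas as pd
from datasets.data import Normalizer

toy = pd.DataFrame({"current": [1.0, 2.0, 3.0], "voltage": [10.0, 20.0, 30.0]})
norm = Normalizer("minmax")
scaled = norm.normalize(toy.copy())   # each column rescaled to roughly [0, 1]
# inverse_normalize(scaled) is intended to map the values back to the original scale.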
+
+        Args:
+            df: dataframe with (a subset of) the same column names that were passed to `normalize`
+        """
+        # invert the numeric scaling first, then reverse the label encoding of the character columns
+        if self.norm_type == "standardization":
+            assert set(self.mean.index).issuperset(set(df.columns))
+            means = self.mean[df.columns]
+            stds = self.std[df.columns]
+            dfout = pd.DataFrame(df * (stds + np.finfo(float).eps) + means)
+
+        elif self.norm_type == "minmax":
+            assert set(self.max_val.index).issuperset(set(df.columns))
+            minvals = self.min_val[df.columns]
+            maxvals = self.max_val[df.columns]
+            dfout = pd.DataFrame((maxvals - minvals + np.finfo(float).eps) * df + minvals)
+
+        elif self.norm_type == "per_sample_std":
+            idx = df.index
+            means = self.groupedmean.loc[idx, df.columns]
+            stds = self.groupedstddev.loc[idx, df.columns]
+            dfout = pd.DataFrame(df * stds + means)
+
+        elif self.norm_type == "per_sample_minmax":
+            idx = df.index
+            min_vals = self.groupedmin_vals.loc[idx, df.columns]
+            max_vals = self.groupedmax_vals.loc[idx, df.columns]
+            dfout = pd.DataFrame(df * (max_vals - min_vals + np.finfo(float).eps) + min_vals)
+
+        else:
+            raise (NameError(f'Normalize method "{self.norm_type}" not implemented'))
+
+        dfout.columns = df.columns
+        charcolumns = [col for col in dfout.columns if self.types[col] == "object"]
+        for cols in charcolumns:
+            # label codes come back as floats after inverse scaling; round to valid integer codes first
+            dfout[cols] = self.labelEncoder[cols].inverse_transform(dfout[cols].round().astype(int))
+
+        return dfout
+
+
+def interpolate_missing(y):
+    """
+    Replaces NaN values in pd.Series `y` using linear interpolation
+    """
+    if y.isna().any():
+        y = y.interpolate(method='linear', limit_direction='both')
+    return y
+
+
+def subsample(y, limit=256, factor=2):
+    """
+    If a given Series is longer than `limit`, returns a sequence subsampled by the specified integer factor
+    """
+    if len(y) > limit:
+        return y[::factor].reset_index(drop=True)
+    return y
+
+
+class BaseData(object):
+
+    def set_num_processes(self, n_proc):
+
+        if (n_proc is None) or (n_proc <= 0):
+            self.n_proc = cpu_count()  # max(1, cpu_count() - 1)
+        else:
+            self.n_proc = min(n_proc, cpu_count())
+
+
+class WeldData(BaseData):
+    """
+    Dataset class for welding dataset.
+    Attributes:
+        all_df: dataframe indexed by ID, with multiple rows corresponding to the same index (sample).
+            Each row is a time step; each column contains either metadata (e.g. timestamp) or a feature.
+        feature_df: contains the subset of columns of `all_df` which correspond to selected features
+        feature_names: names of columns contained in `feature_df` (same as feature_df.columns)
+        all_IDs: IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() )
+        max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used.
+ (Moreover, script argument overrides this attribute) + """ + + def __init__(self, root_dir, file_list=None, pattern=None, n_proc=1, limit_size=None, config=None): + + self.set_num_processes(n_proc=n_proc) + + self.all_df = self.load_all(root_dir, file_list=file_list, pattern=pattern) + self.all_df = self.all_df.sort_values(by=['weld_record_index']) # datasets is presorted + # TODO: There is a single ID that causes the model output to become nan - not clear why + self.all_df = self.all_df[self.all_df['weld_record_index'] != 920397] # exclude particular ID + self.all_df = self.all_df.set_index('weld_record_index') + self.all_IDs = self.all_df.index.unique() # all sample (session) IDs + self.max_seq_len = 66 + if limit_size is not None: + if limit_size > 1: + limit_size = int(limit_size) + else: # interpret as proportion if in (0, 1] + limit_size = int(limit_size * len(self.all_IDs)) + self.all_IDs = self.all_IDs[:limit_size] + self.all_df = self.all_df.loc[self.all_IDs] + + self.feature_names = ['wire_feed_speed', 'current', 'voltage', 'motor_current', 'power'] + self.feature_df = self.all_df[self.feature_names] + + def load_all(self, root_dir, file_list=None, pattern=None): + """ + Loads datasets from csv files contained in `root_dir` into a dataframe, optionally choosing from `pattern` + Args: + root_dir: directory containing all individual .csv files + file_list: optionally, provide a list of file paths within `root_dir` to consider. + Otherwise, entire `root_dir` contents will be used. + pattern: optionally, apply regex string to select subset of files + Returns: + all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files + """ + # each file name corresponds to another date. Also tools (A, B) and others. + + # Select paths for training and evaluation + if file_list is None: + data_paths = glob.glob(os.path.join(root_dir, '*')) # list of all paths + else: + data_paths = [os.path.join(root_dir, p) for p in file_list] + if len(data_paths) == 0: + raise Exception('No files found using: {}'.format(os.path.join(root_dir, '*'))) + + if pattern is None: + # by default evaluate on + selected_paths = data_paths + else: + selected_paths = list(filter(lambda x: re.search(pattern, x), data_paths)) + + input_paths = [p for p in selected_paths if os.path.isfile(p) and p.endswith('.csv')] + if len(input_paths) == 0: + raise Exception("No .csv files found using pattern: '{}'".format(pattern)) + + if self.n_proc > 1: + # Load in parallel + _n_proc = min(self.n_proc, len(input_paths)) # no more than file_names needed here + logger.info("Loading {} datasets files using {} parallel processes ...".format(len(input_paths), _n_proc)) + with Pool(processes=_n_proc) as pool: + all_df = pd.concat(pool.map(WeldData.load_single, input_paths)) + else: # read 1 file at a time + all_df = pd.concat(WeldData.load_single(path) for path in input_paths) + + return all_df + + @staticmethod + def load_single(filepath): + df = WeldData.read_data(filepath) + df = WeldData.select_columns(df) + num_nan = df.isna().sum().sum() + if num_nan > 0: + logger.warning("{} nan values in {} will be replaced by 0".format(num_nan, filepath)) + df = df.fillna(0) + + return df + + @staticmethod + def read_data(filepath): + """Reads a single .csv, which typically contains a day of datasets of various weld sessions. 
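# Illustrative construction sketch for WeldData (editor's example, not code from this patch).
# The directory path and the 'train' filename pattern are assumptions.
# from datasets.data import WeldData
# data = WeldData(root_dir="/path/to/weld_csvs", pattern="train", n_proc=4)
# data.feature_df                # long dataframe of the 5 selected features, indexed by weld_record_index
# data.feature_df.loc[some_id]   # one (seq_len, 5) sample, where some_id is any value in data.all_IDs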
+ """ + df = pd.read_csv(filepath) + return df + + @staticmethod + def select_columns(df): + """""" + df = df.rename(columns={"per_energy": "power"}) + # Sometimes 'diff_time' is not measured correctly (is 0), and power ('per_energy') becomes infinite + is_error = df['power'] > 1e16 + df.loc[is_error, 'power'] = df.loc[is_error, 'true_energy'] / df['diff_time'].median() + + df['weld_record_index'] = df['weld_record_index'].astype(int) + keep_cols = ['weld_record_index', 'wire_feed_speed', 'current', 'voltage', 'motor_current', 'power'] + df = df[keep_cols] + + return df + + +class TSRegressionArchive(BaseData): + """ + Dataset class for datasets included in: + 1) the Time Series Regression Archive (www.timeseriesregression.org), or + 2) the Time Series Classification Archive (www.timeseriesclassification.com) + Attributes: + all_df: (num_samples * seq_len, num_columns) dataframe indexed by integer indices, with multiple rows corresponding to the same index (sample). + Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature. + feature_df: (num_samples * seq_len, feat_dim) dataframe; contains the subset of columns of `all_df` which correspond to selected features + feature_names: names of columns contained in `feature_df` (same as feature_df.columns) + all_IDs: (num_samples,) series of IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() ) + labels_df: (num_samples, num_labels) pd.DataFrame of label(s) for each sample + max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used. + (Moreover, script argument overrides this attribute) + """ + + def __init__(self, root_dir, file_list=None, pattern=None, n_proc=1, limit_size=None, config=None): + + #self.set_num_processes(n_proc=n_proc) + + self.config = config + + self.all_df, self.labels_df = self.load_all(root_dir, file_list=file_list, pattern=pattern) + self.all_IDs = self.all_df.index.unique() # all sample IDs (integer indices 0 ... num_samples-1) + + if limit_size is not None: + if limit_size > 1: + limit_size = int(limit_size) + else: # interpret as proportion if in (0, 1] + limit_size = int(limit_size * len(self.all_IDs)) + self.all_IDs = self.all_IDs[:limit_size] + self.all_df = self.all_df.loc[self.all_IDs] + + # use all features + self.feature_names = self.all_df.columns + self.feature_df = self.all_df + + def load_all(self, root_dir, file_list=None, pattern=None): + """ + Loads datasets from csv files contained in `root_dir` into a dataframe, optionally choosing from `pattern` + Args: + root_dir: directory containing all individual .csv files + file_list: optionally, provide a list of file paths within `root_dir` to consider. + Otherwise, entire `root_dir` contents will be used. 
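# Illustrative construction sketch for TSRegressionArchive (editor's example, not code from this
# patch). The archive path and file name are hypothetical; the config keys 'task' and
# 'subsample_factor' are the ones read by load_single below.
# config = {"task": "classification", "subsample_factor": None}
# data = TSRegressionArchive("/path/to/archive", file_list=["SomeDataset_TRAIN.ts"], config=config)
# data.feature_df   # (num_samples * seq_len, feat_dim) dataframe indexed by integer sample IDs
# data.labels_df    # (num_samples, 1) integer class codes (or floats for regression)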
+ pattern: optionally, apply regex string to select subset of files + Returns: + all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files + labels_df: dataframe containing label(s) for each sample + """ + + # Select paths for training and evaluation + if file_list is None: + data_paths = glob.glob(os.path.join(root_dir, '*')) # list of all paths + else: + data_paths = [os.path.join(root_dir, p) for p in file_list] + if len(data_paths) == 0: + raise Exception('No files found using: {}'.format(os.path.join(root_dir, '*'))) + + if pattern is None: + # by default evaluate on + selected_paths = data_paths + else: + selected_paths = list(filter(lambda x: re.search(pattern, x), data_paths)) + + input_paths = [p for p in selected_paths if os.path.isfile(p) and p.endswith('.ts')] + if len(input_paths) == 0: + raise Exception("No .ts files found using pattern: '{}'".format(pattern)) + + all_df, labels_df = self.load_single(input_paths[0]) # a single file contains dataset + + return all_df, labels_df + + def load_single(self, filepath): + + # Every row of the returned df corresponds to a sample; + # every column is a pd.Series indexed by timestamp and corresponds to a different dimension (feature) + if self.config['task'] == 'regression': + df, labels = utils.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True, replace_missing_vals_with='NaN') + labels_df = pd.DataFrame(labels, dtype=np.float32) + elif self.config['task'] == 'classification': + df, labels = load_data.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True, replace_missing_vals_with='NaN') + labels = pd.Series(labels, dtype="category") + self.class_names = labels.cat.categories + labels_df = pd.DataFrame(labels.cat.codes, dtype=np.int8) # int8-32 gives an error when using nn.CrossEntropyLoss + else: # e.g. 
imputation + try: + data = load_data.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True, + replace_missing_vals_with='NaN') + if isinstance(data, tuple): + df, labels = data + else: + df = data + except: + df, _ = utils.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True, + replace_missing_vals_with='NaN') + labels_df = None + + lengths = df.applymap(lambda x: len(x)).values # (num_samples, num_dimensions) array containing the length of each series + horiz_diffs = np.abs(lengths - np.expand_dims(lengths[:, 0], -1)) + + # most general check: len(np.unique(lengths.values)) > 1: # returns array of unique lengths of sequences + if np.sum(horiz_diffs) > 0: # if any row (sample) has varying length across dimensions + logger.warning("Not all time series dimensions have same length - will attempt to fix by subsampling first dimension...") + df = df.applymap(subsample) # TODO: this addresses a very specific case (PPGDalia) + + if self.config['subsample_factor']: + df = df.applymap(lambda x: subsample(x, limit=0, factor=self.config['subsample_factor'])) + + lengths = df.applymap(lambda x: len(x)).values + vert_diffs = np.abs(lengths - np.expand_dims(lengths[0, :], 0)) + if np.sum(vert_diffs) > 0: # if any column (dimension) has varying length across samples + self.max_seq_len = int(np.max(lengths[:, 0])) + logger.warning("Not all samples have same length: maximum length set to {}".format(self.max_seq_len)) + else: + self.max_seq_len = lengths[0, 0] + + # First create a (seq_len, feat_dim) dataframe for each sample, indexed by a single integer ("ID" of the sample) + # Then concatenate into a (num_samples * seq_len, feat_dim) dataframe, with multiple rows corresponding to the + # sample index (i.e. the same scheme as all datasets in this project) + df = pd.concat((pd.DataFrame({col: df.loc[row, col] for col in df.columns}).reset_index(drop=True).set_index( + pd.Series(lengths[row, 0]*[row])) for row in range(df.shape[0])), axis=0) + + # Replace NaN values + grp = df.groupby(by=df.index) + df = grp.transform(interpolate_missing) + + return df, labels_df + + +class PMUData(BaseData): + """ + Dataset class for Phasor Measurement Unit dataset. + Attributes: + all_df: dataframe indexed by ID, with multiple rows corresponding to the same index (sample). + Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature. + feature_df: contains the subset of columns of `all_df` which correspond to selected features + feature_names: names of columns contained in `feature_df` (same as feature_df.columns) + all_IDs: IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() ) + max_seq_len: maximum sequence (time series) length (optional). Used only if script argument `max_seq_len` is not + defined. 
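# Editor's illustration (not code from this patch) of the storage scheme produced above and used by
# every dataset class in this module: each sample is a (seq_len, feat_dim) block, and all blocks are
# stacked into one long dataframe whose repeated index is the sample ID. Values here are made up.
import pandas as pd

sample_0 = pd.DataFrame({"f1": [0.1, 0.2, 0.3], "f2": [1.0, 1.1, 1.2]}, index=[0, 0, 0])
sample_1 = pd.DataFrame({"f1": [0.5, 0.6], "f2": [2.0, 2.1]}, index=[1, 1])
all_df = pd.concat([sample_0, sample_1])   # (num_samples * seq_len, feat_dim)
one_sample = all_df.loc[1].values          # (2, 2) array holding sample ID 1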
+ """ + + def __init__(self, root_dir, file_list=None, pattern=None, n_proc=1, limit_size=None, config=None): + + self.set_num_processes(n_proc=n_proc) + + self.all_df = self.load_all(root_dir, file_list=file_list, pattern=pattern) + + if config['data_window_len'] is not None: + self.max_seq_len = config['data_window_len'] + # construct sample IDs: 0, 0, ..., 0, 1, 1, ..., 1, 2, ..., (num_whole_samples - 1) + # num_whole_samples = len(self.all_df) // self.max_seq_len # commented code is for more general IDs + # IDs = list(chain.from_iterable(map(lambda x: repeat(x, self.max_seq_len), range(num_whole_samples + 1)))) + # IDs = IDs[:len(self.all_df)] # either last sample is completely superfluous, or it has to be shortened + IDs = [i // self.max_seq_len for i in range(self.all_df.shape[0])] + self.all_df.insert(loc=0, column='ExID', value=IDs) + else: + # self.all_df = self.all_df.sort_values(by=['ExID']) # dataset is presorted + self.max_seq_len = 30 + + self.all_df = self.all_df.set_index('ExID') + # rename columns + self.all_df.columns = [re.sub(r'\d+', str(i//3), col_name) for i, col_name in enumerate(self.all_df.columns[:])] + #self.all_df.columns = ["_".join(col_name.split(" ")[:-1]) for col_name in self.all_df.columns[:]] + self.all_IDs = self.all_df.index.unique() # all sample (session) IDs + + if limit_size is not None: + if limit_size > 1: + limit_size = int(limit_size) + else: # interpret as proportion if in (0, 1] + limit_size = int(limit_size * len(self.all_IDs)) + self.all_IDs = self.all_IDs[:limit_size] + self.all_df = self.all_df.loc[self.all_IDs] + + self.feature_names = self.all_df.columns # all columns are used as features + self.feature_df = self.all_df[self.feature_names] + + def load_all(self, root_dir, file_list=None, pattern=None): + """ + Loads datasets from csv files contained in `root_dir` into a dataframe, optionally choosing from `pattern` + Args: + root_dir: directory containing all individual .csv files + file_list: optionally, provide a list of file paths within `root_dir` to consider. + Otherwise, entire `root_dir` contents will be used. 
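# Editor's illustration (not code from this patch) of the window-ID construction in PMUData.__init__:
# consecutive rows are assigned to fixed-length windows by integer division of the row position.
max_seq_len = 4
ids = [i // max_seq_len for i in range(10)]
# ids == [0, 0, 0, 0, 1, 1, 1, 1, 2, 2]; the last window may be shorter than max_seq_len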
+ pattern: optionally, apply regex string to select subset of files + Returns: + all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files + """ + + # Select paths for training and evaluation + if file_list is None: + data_paths = glob.glob(os.path.join(root_dir, '*')) # list of all paths + else: + data_paths = [os.path.join(root_dir, p) for p in file_list] + if len(data_paths) == 0: + raise Exception('No files found using: {}'.format(os.path.join(root_dir, '*'))) + + if pattern is None: + # by default evaluate on + selected_paths = data_paths + else: + selected_paths = list(filter(lambda x: re.search(pattern, x), data_paths)) + + input_paths = [p for p in selected_paths if os.path.isfile(p) and p.endswith('.csv')] + if len(input_paths) == 0: + raise Exception("No .csv files found using pattern: '{}'".format(pattern)) + + if self.n_proc > 1: + # Load in parallel + _n_proc = min(self.n_proc, len(input_paths)) # no more than file_names needed here + logger.info("Loading {} datasets files using {} parallel processes ...".format(len(input_paths), _n_proc)) + with Pool(processes=_n_proc) as pool: + all_df = pd.concat(pool.map(PMUData.load_single, input_paths)) + else: # read 1 file at a time + all_df = pd.concat(PMUData.load_single(path) for path in input_paths) + + return all_df + + @staticmethod + def load_single(filepath): + df = PMUData.read_data(filepath) + #df = PMUData.select_columns(df) + num_nan = df.isna().sum().sum() + if num_nan > 0: + logger.warning("{} nan values in {} will be replaced by 0".format(num_nan, filepath)) + df = df.fillna(0) + + return df + + @staticmethod + def read_data(filepath): + """Reads a single .csv, which typically contains a day of datasets of various weld sessions. + """ + df = pd.read_csv(filepath) + return df + + +## create a class of surveillance data +class WasteWaterData(BaseData): + """ + Dataset class for Machine dataset. + Attributes: + all_df: dataframe indexed by ID, with multiple rows corresponding to the same index (sample). + Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature. + + """ + def __init__(self, root_dir, file_list=None, pattern=None, n_proc=1, limit_size=None, + navalue = None, config=None): + + self.config = config + self.wideData =None + self.wideDataFinal = None + self.feature_df = None + self.set_num_processes(n_proc=n_proc) + self.Normalizer = None + all_df = None + + try: + # print(" ---- init 0: ", root_dir ) + all_df = self.load_all(root_dir, file_list=file_list, pattern=pattern) + + wwdata, self.demodata = self.processData(all_df) + + except Exception as e: + print(" __init__ 1:") + print(all_df.head()) + + raise + + try: + self.all_df = self.cleanData(wwdata) + self.all_df = self.all_df.set_index('days') + self.all_IDs = self.all_df.index.unique() # all sample (session) IDs + self.navalue = navalue + self.max_seq_len = 66 + if limit_size is not None: + if limit_size > 1: + limit_size = int(limit_size) + else: # interpret as proportion if in (0, 1] + limit_size = int(limit_size * len(self.all_IDs)) + self.all_IDs = self.all_IDs[:limit_size] + all_df = all_df.loc[self.all_IDs] + + except Exception as e: + print(" __init__ 2:") + print(wwdata.head()) + raise + + """ + sets variable that are features, uses regex feature match to create the list + of features that are needed to generate the sequence. 
+ + feature_df: contains the subset of columns of `all_df` which correspond to selected features + feature_names: names of columns contained in `feature_df` (same as feature_df.columns) + all_IDs: IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() ) + max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used. + (Moreover, script argument overrides this attribute) + """ + feature_names = ['geoLat','geoLong','phureg', 'sewershedPop', + 'mQ', 'mN1', 'mN2', 'mBiomarker'] + + self.feature_df = (self.all_df.loc[self.all_df.phureg.isin(['Central East','Toronto','Central West']).values,feature_names] + .reset_index().sort_values(['days','phureg','geoLat','geoLong']).set_index('days')) + self.feature_names = feature_names + + # replacing the missing with -1 in mQ, mN1, mN2, and mBiomarker + self.feature_df.loc[:,['mQ', 'mN1', 'mN2', 'mBiomarker']] = self.feature_df.loc[:,['mQ', 'mN1', 'mN2', 'mBiomarker']].fillna(np.nan, inplace=True) + logger.info("data shape is {}\n names: {}\n".format(self.feature_df.shape,self.feature_names)) + + def load_all(self, root_dir, file_list=None, pattern=None): + + # read in MECP data + if file_list is None: + data_paths = glob.glob(os.path.join(root_dir,'*')) # list of all paths + else: + data_paths = [os.path.join(root_dir, p) for p in file_list] + + if pattern is None: + # by default evaluate on + selected_paths = data_paths + else: + selected_paths = list(filter(lambda x: re.search(pattern, x), data_paths)) + + input_paths = [p for p in selected_paths if os.path.isfile(p) and p.endswith('.csv')] + print(" load_all .... ", data_paths,input_paths) + + if len(input_paths) == 0: + raise Exception(f"No .csv files found using pattern: {pattern}") + + if self.n_proc >1: + #do this....if you really have big data + logger.info(" ----------------------------YTD ---------------------\n") + all_df = pd.concat(WasteWaterData.load_single(path) for path in input_paths) + else: + logger.info(" ----------------- concatenating-----------------------------") + all_df = pd.concat(WasteWaterData.load_single(path) for path in input_paths) + + return all_df + + def set_data(self, phureg=None, vars= None, scale=True): + """ creates the wide data from the dataset for slice based on the 'phureg' and 'vars', + output is data frame which is stored in parameter 'X' with column names having the latitude + and longitude information prepended with 'its vars' e.g. var1__ + + Args: + phureg (_type_, optional): one of the regions that you want to consider e.g. GTA. Defaults to None. + vars (_type_, optional): These are the variable obtained from the self.wwdata . Defaults to None. 
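# The comment above says missing mQ/mN1/mN2/mBiomarker values are replaced with a sentinel; a minimal
# hedged sketch of that fill (editor's example; -1 as the sentinel is an assumption, and note that
# fillna(..., inplace=True) returns None, so its result should not be assigned back):
# cols = ["mQ", "mN1", "mN2", "mBiomarker"]
# self.feature_df[cols] = self.feature_df[cols].fillna(-1)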
+ normalize (_str_) : if the data has to be normalized before being transformed into wider format + + """ + if scale: + self.Normalizer = Normalizer('minmax') + wideDatalist = WasteWaterData.Transform(self.all_df, phureg=phureg, varlist=vars, + normalizer=self.Normalizer) + else : + wideDatalist = WasteWaterData.Transform(self.all_df, phureg=phureg, varlist=vars) + self.wideData = pd.concat(wideDatalist, axis=1, ignore_index=False) + self.X = self.wideData + + def append_data(self, data:pd.DataFrame=None,axis=1): + """ + This function lets one add another dataset to the already created wide dataset in wide format + using the index of days, note that indexes can be retrived on wideData + Note that the object doesn't remember any new data that was added already if you want to + add more data then you have to merge the data outside of the function and then call this function + to append it to the root "wideData". + + Args: + data (pd.DataFrame, optional): Data frame with row index same as the row index + of the wideData. Defaults to None. + + """ + self.X = pd.concat( [self.wideData, data], axis=axis,join='inner') + + def processData(self,data:pd.DataFrame=None): + """_summary_ + Args: + data (_type_, optional): _description_. Defaults to None. + Returns: + _type_: _description_ + """ + + data["date"] = data.sampleDate.astype('datetime64') + data.rename(columns={"publicHealthRegion":"phureg", "publicHealthUnit":"phu"}, inplace=True) + logger.info("data shape is {}".format(data.shape)) + + self.idxcols = ['phureg','phu','geoLat','geoLong'] + self.origcols = ['mQ', 'mN1', 'mN2', 'mN1N2', 'mBiomarker'] + self.mavars = ['mN1_7dma', 'mN2_7dma', 'mN1N2_7dma','mN1E_7dma', 'nmN1_7dma', 'nmN2_7dma','nmN1N2_7dma', + 'nmE_7dma', 'nmN1E_7dma', 'qmN1N2', 'nqmN1N2','qmE', 'qmN1E', 'nqmE', 'nqmN1E','nmE', 'nmN1E', + 'mE', 'mN1E'] + + self.othervars = ['date','sewershedPop', 'sewershedPop2020', 'siteID', 'nmN1', 'nmN2', + 'nmN1N2', 'qmN1', 'qmN2', 'qmN1N2', 'nqmN1', 'nqmN2','nqmN1N2'] + + self.demovars = ['moh_cbed','moh_cbed_7dma', 'moh_abed', 'moh_cbrd', 'moh_cbrd_7dma', + 'moh_covidHospitalization', 'moh_covidHosp_7dma', + 'moh_vaccine_atLeastTwoDoses'] + + # keep only required columns; + dat3 = data[ self.idxcols + self.origcols + self.mavars + self.othervars + self.demovars ].copy() + + dat3 = dat3[dat3.date >= datetime.strptime("01/04/2021","%d/%m/%Y")] + dat3_1 = dat3[self.idxcols + self.demovars].copy() + dat3 = dat3[ self.idxcols + self.othervars + self.origcols ].copy() + + return dat3,dat3_1 + + @staticmethod + def load_single(filepath): + print(f'Load_single: {filepath}\n') + df = pd.read_csv(filepath, low_memory=False) + # df = WasteWaterDataClass.select_columns(df) + num_nan = df.isna().sum().sum() + if num_nan > 0: + logger.warning("{} nan values in {} will be replaced with 0".format(num_nan,filepath)) + + return df + + @staticmethod + def Transform(df, phureg = 'gta_region', varlist=["nmN1"], index=None, + normalizer:Normalizer = None ): + """Normalization routine for the data we have and it can be done by phu-region and variable list + + Args: + df (_type_): _description_ + phureg (str, optional): _description_. Defaults to 'gta_region'. + varlist (list, optional): List of variables to used for normalization. Defaults to ["nmN1"]. + index (_type_, optional): _description_. Defaults to None. + normalizer (Normalizer, optional): _description_. Defaults to None. 
+ + Returns: + _type_: _description_ + """ + if index is not None : + df = df.set_index(index) + + if not isinstance(phureg,list): + phureg = [phureg] + + df = df[df["phureg"].isin(phureg)].copy() + + #Extract columns to build model + df = df[["geoLat", "geoLong"]+ varlist] + + if normalizer is not None: + # print(f'\t Normalizing using {normalizer.norm_type}...\n') + df[varlist] = normalizer.normalize(df[varlist].copy()) + + #Group the data into a multivariate time series grouped by geoLat and geoLong + df_var = [] + for i,vars in enumerate(varlist): + df_var.append(df.groupby(["geoLat", "geoLong"])[vars].apply(list).reset_index(drop=False)) + + #Extract geoLat and geoLong to make labels for time series, then drop the columns + names = [ str(a[0]) + '_' + str(a[1]) for a in list(zip(df_var[0]["geoLat"], df_var[0]["geoLong"])) ] + + # replace the colum names prepended with the latitude and longitude. + for i in range(len(df_var)): + df_var[i].drop(columns=["geoLat", "geoLong"], inplace=True) + + #List -> columns format + for i, vars in enumerate(varlist): + df_var[i] = pd.DataFrame(df_var[i][vars].tolist(), index=df_var[i].index).transpose() + df_var[i].columns = [ vars + '_' + name for name in names] + + return df_var + + def cleanData(self, df:pd.DataFrame): + + t1 = df[self.origcols].isna().apply(sum,axis=1) + df3 = df[t1 <=30].sort_values(self.idxcols +['date']) + + df3['days'] = (df3.date - pd.to_datetime('2021-04-01')).dt.days + + global_max = max(df3.days) + global_min = min(df3.days) + # function to check contiguousness of the days if not add missing line in between + def checknadd_missingdays(XX :pd.DataFrame = None ): + assert XX.days.unique().shape[0] == XX.shape[0], "====Days are not unique here====" + daysdf = pd.DataFrame({'days': range(global_min,global_max)}).reset_index(drop=True) + XX = daysdf.merge(XX, how = 'left', on='days') + return XX + + logger.info('adding missing dates in the data') + df4 = (df3.groupby(self.idxcols) + .apply(lambda x: checknadd_missingdays(x.drop(columns=self.idxcols))) + .reset_index().drop(columns=['date','level_4']) + ) + + # impute the sewershedpop if missing from 2020 after backfill and forward fill grouping by + # region/PHU latitude and longitude. + logger.info("imputing the seweshed population") + df4['sewershedPop'] = (df4.groupby(self.idxcols)['sewershedPop'] + .apply(lambda x: x.ffill().bfill()) + .reset_index(drop=True) + ).values + df4['sewershedPop2020'] = (df4.groupby(self.idxcols)['sewershedPop2020'] + .apply(lambda x: x.ffill().bfill()) + .reset_index(drop=True) + ).values + df4.loc[df4.sewershedPop.isna(),'sewershedPop'] = df4.loc[df4.sewershedPop.isna(),'sewershedPop2020'] + + # dropping lat long where there less than 10% non-null. 
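# Editor's illustration (not code from this patch) of what checknadd_missingdays above does for one
# monitoring site: merge onto a contiguous range of day numbers so gaps become explicit NaN rows.
import pandas as pd

site = pd.DataFrame({"days": [0, 1, 4], "mN1": [0.2, 0.3, 0.1]})
full_days = pd.DataFrame({"days": range(0, 6)})
site_full = full_days.merge(site, how="left", on="days")   # days 2, 3 and 5 appear with NaN mN1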
+ print("Dropping Latitude and Longitude with less than 10% non-missing in all columns \n\ + of the original values....") + _X = (df4[self.idxcols + self.origcols ].groupby(['geoLat','geoLong']) + .apply(lambda x : x.notnull().mean()).drop(columns=self.idxcols) ) + drop_rows = (_X<.1).all(axis=1) + drop_lat_long = _X[drop_rows].reset_index().iloc[:,:2] + + + logger.info('dropping following columns :\n\t {} ...'.format(drop_lat_long)) + df4 = df4[~df4.geoLat.isin(drop_lat_long.geoLat) & ~df4.geoLong.isin(drop_lat_long.geoLong)] + + # print( '--------------restricting length of latitude/Longitude ----------------' ) + df4.geoLat = df4.geoLat.round(4) + df4.geoLong = df4.geoLong.round(4) + logger.info( '------------ done ...-------------') + return df4 + + @staticmethod + def mytsplot(df:pd.DataFrame = None, labels = None, xlabel=None, color='rgmbckyte', ylog=False, + title="", ylim=None): + if labels is None : + labels = df.columns.values + n = df.shape[1] + maxvalue = df.max().values.max() + ymin = max(df.min().values.min(), 8e-6) + fig = plt.figure(figsize=(20,12),dpi=120) + ax = fig.add_subplot() + ax.plot(df, linewidth=1.5 ) + hdl = ax.get_legend_handles_labels() + if ylim is not None: + ax.set_ylim(ymin=ylim[0],ymax=ylim[1]) + else: + ax.set_ylim(ymin=ymin, ymax=maxvalue) + if ylog: + ax.set_yscale('log') + fig.legend(hdl,labels=labels,bbox_to_anchor=[1.2,0.6], loc='center right') + ax.set_title(title) + + +data_factory = {'weld': WeldData, + 'tsra': TSRegressionArchive, + 'pmu': PMUData, + 'wastewater': WasteWaterData} diff --git a/datasets/dataset.py b/datasets/dataset.py new file mode 100644 index 0000000..d84a736 --- /dev/null +++ b/datasets/dataset.py @@ -0,0 +1,324 @@ +import numpy as np +from torch.utils.data import Dataset +import torch +import logging + +logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) + + +class ImputationDataset(Dataset): + """Dynamically computes missingness (noise) mask for each sample""" + + def __init__(self, data, indices, mean_mask_length=3, masking_ratio=0.15, + mode='separate', distribution='geometric', exclude_feats=None): + super(ImputationDataset, self).__init__() + + self.data = data # this is a subclass of the BaseData class in data.py + self.IDs = indices # list of data IDs, but also mapping between integer index and ID + self.feature_df = self.data.feature_df.loc[self.IDs] + + logger.info(" Creating ImputatDataset w/data \n{} ".format(self.feature_df.head())) + + self.masking_ratio = masking_ratio + self.mean_mask_length = mean_mask_length + self.mode = mode + self.distribution = distribution + self.exclude_feats = exclude_feats + + def __getitem__(self, ind): + """ + For a given integer index, returns the corresponding (seq_length, feat_dim) array and a noise mask of same shape + Args: + ind: integer index of sample in dataset + Returns: + X: (seq_length, feat_dim) tensor of the multivariate time series corresponding to a sample + mask: (seq_length, feat_dim) boolean tensor: 0s mask and predict, 1s: unaffected input + ID: ID of sample + """ + + X = self.feature_df.loc[self.IDs[ind]].values # (seq_length, feat_dim) array + mask = noise_mask(X, self.masking_ratio, self.mean_mask_length, self.mode, + self.distribution, self.exclude_feats) # (seq_length, feat_dim) boolean array + + + + data_tr = torch.from_numpy(X.astype(float)) + mask_tr = torch.from_numpy(mask) + # logger.info("__getitem__ \n X type {}\n Mask type {}".format( + # np.array(data_tr)[:5,:] 
,mask_tr.dtype)) + + return data_tr, mask_tr, self.IDs[ind] + + def update(self): + self.mean_mask_length = min(20, self.mean_mask_length + 1) + self.masking_ratio = min(1, self.masking_ratio + 0.05) + + def __len__(self): + return len(self.IDs) + + +class TransductionDataset(Dataset): + + def __init__(self, data, indices, mask_feats, start_hint=0.0, end_hint=0.0): + super(TransductionDataset, self).__init__() + + self.data = data # this is a subclass of the BaseData class in data.py + self.IDs = indices # list of data IDs, but also mapping between integer index and ID + self.feature_df = self.data.feature_df.loc[self.IDs] + + self.mask_feats = mask_feats # list/array of indices corresponding to features to be masked + self.start_hint = start_hint # proportion at beginning of time series which will not be masked + self.end_hint = end_hint # end_hint: proportion at the end of time series which will not be masked + + def __getitem__(self, ind): + """ + For a given integer index, returns the corresponding (seq_length, feat_dim) array and a noise mask of same shape + Args: + ind: integer index of sample in dataset + Returns: + X: (seq_length, feat_dim) tensor of the multivariate time series corresponding to a sample + mask: (seq_length, feat_dim) boolean tensor: 0s mask and predict, 1s: unaffected input + ID: ID of sample + """ + + X = self.feature_df.loc[self.IDs[ind]].values # (seq_length, feat_dim) array + mask = transduct_mask(X, self.mask_feats, self.start_hint, + self.end_hint) # (seq_length, feat_dim) boolean array + + return torch.from_numpy(X), torch.from_numpy(mask), self.IDs[ind] + + def update(self): + self.start_hint = max(0, self.start_hint - 0.1) + self.end_hint = max(0, self.end_hint - 0.1) + + def __len__(self): + return len(self.IDs) + + +def collate_superv(data, max_len=None): + """Build mini-batch tensors from a list of (X, mask) tuples. Mask input. Create + Args: + data: len(batch_size) list of tuples (X, y). + - X: torch tensor of shape (seq_length, feat_dim); variable seq_length. + - y: torch tensor of shape (num_labels,) : class indices or numerical targets + (for classification or regression, respectively). num_labels > 1 for multi-task models + max_len: global fixed sequence length. Used for architectures requiring fixed length input, + where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are padded with 0s + Returns: + X: (batch_size, padded_length, feat_dim) torch tensor of masked features (input) + targets: (batch_size, padded_length, feat_dim) torch tensor of unmasked features (output) + target_masks: (batch_size, padded_length, feat_dim) boolean torch tensor + 0 indicates masked values to be predicted, 1 indicates unaffected/"active" feature values + padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position, 0 means padding + """ + + batch_size = len(data) + features, labels, IDs = zip(*data) + + # Stack and pad features and masks (convert 2D to 3D tensors, i.e. 
add batch dimension) + lengths = [X.shape[0] for X in features] # original sequence length for each time series + if max_len is None: + max_len = max(lengths) + X = torch.zeros(batch_size, max_len, features[0].shape[-1]) # (batch_size, padded_length, feat_dim) + for i in range(batch_size): + end = min(lengths[i], max_len) + X[i, :end, :] = features[i][:end, :] + + targets = torch.stack(labels, dim=0) # (batch_size, num_labels) + + padding_masks = padding_mask(torch.tensor(lengths, dtype=torch.int16), + max_len=max_len) # (batch_size, padded_length) boolean tensor, "1" means keep + + return X, targets, padding_masks, IDs + + +class ClassiregressionDataset(Dataset): + + def __init__(self, data, indices): + super(ClassiregressionDataset, self).__init__() + + self.data = data # this is a subclass of the BaseData class in data.py + self.IDs = indices # list of data IDs, but also mapping between integer index and ID + self.feature_df = self.data.feature_df.loc[self.IDs] + + self.labels_df = self.data.labels_df.loc[self.IDs] + + def __getitem__(self, ind): + """ + For a given integer index, returns the corresponding (seq_length, feat_dim) array and a noise mask of same shape + Args: + ind: integer index of sample in dataset + Returns: + X: (seq_length, feat_dim) tensor of the multivariate time series corresponding to a sample + y: (num_labels,) tensor of labels (num_labels > 1 for multi-task models) for each sample + ID: ID of sample + """ + + X = self.feature_df.loc[self.IDs[ind]].values # (seq_length, feat_dim) array + y = self.labels_df.loc[self.IDs[ind]].values # (num_labels,) array + + return torch.from_numpy(X), torch.from_numpy(y), self.IDs[ind] + + def __len__(self): + return len(self.IDs) + + +def transduct_mask(X, mask_feats, start_hint=0.0, end_hint=0.0): + """ + Creates a boolean mask of the same shape as X, with 0s at places where a feature should be masked. + Args: + X: (seq_length, feat_dim) numpy array of features corresponding to a single sample + mask_feats: list/array of indices corresponding to features to be masked + start_hint: + end_hint: proportion at the end of time series which will not be masked + + Returns: + boolean numpy array with the same shape as X, with 0s at places where a feature should be masked + """ + + mask = np.ones(X.shape, dtype=bool) + start_ind = int(start_hint * X.shape[0]) + end_ind = max(start_ind, int((1 - end_hint) * X.shape[0])) + mask[start_ind:end_ind, mask_feats] = 0 + + return mask + + +def compensate_masking(X, mask): + """ + Compensate feature vectors after masking values, in a way that the matrix product W @ X would not be affected on average. + If p is the proportion of unmasked (active) elements, X' = X / p = X * feat_dim/num_active + Args: + X: (batch_size, seq_length, feat_dim) torch tensor + mask: (batch_size, seq_length, feat_dim) torch tensor: 0s means mask and predict, 1s: unaffected (active) input + Returns: + (batch_size, seq_length, feat_dim) compensated features + """ + + # number of unmasked elements of feature vector for each time step + num_active = torch.sum(mask, dim=-1).unsqueeze(-1) # (batch_size, seq_length, 1) + # to avoid division by 0, set the minimum to 1 + num_active = torch.max(num_active, torch.ones(num_active.shape, dtype=torch.int16)) # (batch_size, seq_length, 1) + return X.shape[-1] * X / num_active + + +def collate_unsuperv(data, max_len=None, mask_compensation=False): + """Build mini-batch tensors from a list of (X, mask) tuples. Mask input. Create + Args: + data: len(batch_size) list of tuples (X, mask). 
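# Editor's wiring sketch (not code from this patch): how these datasets and collate functions are
# typically combined with a torch DataLoader. `my_data` stands for any BaseData subclass instance
# and `train_indices` for a list of its sample IDs; both are assumptions.
# from functools import partial
# from torch.utils.data import DataLoader
#
# train_set = ImputationDataset(my_data, train_indices, masking_ratio=0.15, mean_mask_length=3)
# loader = DataLoader(train_set, batch_size=32, shuffle=True,
#                     collate_fn=partial(collate_unsuperv, max_len=my_data.max_seq_len))
# X, targets, target_masks, padding_masks, IDs = next(iter(loader))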
+ - X: torch tensor of shape (seq_length, feat_dim); variable seq_length. + - mask: boolean torch tensor of shape (seq_length, feat_dim); variable seq_length. + max_len: global fixed sequence length. Used for architectures requiring fixed length input, + where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are padded with 0s + Returns: + X: (batch_size, padded_length, feat_dim) torch tensor of masked features (input) + targets: (batch_size, padded_length, feat_dim) torch tensor of unmasked features (output) + target_masks: (batch_size, padded_length, feat_dim) boolean torch tensor + 0 indicates masked values to be predicted, 1 indicates unaffected/"active" feature values + padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position, 0 ignore (padding) + """ + + batch_size = len(data) + features, masks, IDs = zip(*data) + + # Stack and pad features and masks (convert 2D to 3D tensors, i.e. add batch dimension) + lengths = [X.shape[0] for X in features] # original sequence length for each time series + if max_len is None: + max_len = max(lengths) + X = torch.zeros(batch_size, max_len, features[0].shape[-1]) # (batch_size, padded_length, feat_dim) + target_masks = torch.zeros_like(X, + dtype=torch.bool) # (batch_size, padded_length, feat_dim) masks related to objective + for i in range(batch_size): + end = min(lengths[i], max_len) + X[i, :end, :] = features[i][:end, :] + target_masks[i, :end, :] = masks[i][:end, :] + + targets = X.clone() + X = X * target_masks # mask input + if mask_compensation: + X = compensate_masking(X, target_masks) + + padding_masks = padding_mask(torch.tensor(lengths, dtype=torch.int16), max_len=max_len) # (batch_size, padded_length) boolean tensor, "1" means keep + target_masks = ~target_masks # inverse logic: 0 now means ignore, 1 means predict + return X, targets, target_masks, padding_masks, IDs + + +def noise_mask(X, masking_ratio, lm=3, mode='separate', distribution='geometric', exclude_feats=None): + """ + Creates a random boolean mask of the same shape as X, with 0s at places where a feature should be masked. + Args: + X: (seq_length, feat_dim) numpy array of features corresponding to a single sample + masking_ratio: proportion of seq_length to be masked. At each time step, will also be the proportion of + feat_dim that will be masked on average + lm: average length of masking subsequences (streaks of 0s). Used only when `distribution` is 'geometric'. + mode: whether each variable should be masked separately ('separate'), or all variables at a certain positions + should be masked concurrently ('concurrent') + distribution: whether each mask sequence element is sampled independently at random, or whether + sampling follows a markov chain (and thus is stateful), resulting in geometric distributions of + masked squences of a desired mean length `lm` + exclude_feats: iterable of indices corresponding to features to be excluded from masking (i.e. 
to remain all 1s) + + Returns: + boolean numpy array with the same shape as X, with 0s at places where a feature should be masked + """ + if exclude_feats is not None: + exclude_feats = set(exclude_feats) + + if distribution == 'geometric': # stateful (Markov chain) + if mode == 'separate': # each variable (feature) is independent + mask = np.ones(X.shape, dtype=bool) + for m in range(X.shape[1]): # feature dimension + if exclude_feats is None or m not in exclude_feats: + mask[:, m] = geom_noise_mask_single(X.shape[0], lm, masking_ratio) # time dimension + else: # replicate across feature dimension (mask all variables at the same positions concurrently) + mask = np.tile(np.expand_dims(geom_noise_mask_single(X.shape[0], lm, masking_ratio), 1), X.shape[1]) + else: # each position is independent Bernoulli with p = 1 - masking_ratio + if mode == 'separate': + mask = np.random.choice(np.array([True, False]), size=X.shape, replace=True, + p=(1 - masking_ratio, masking_ratio)) + else: + mask = np.tile(np.random.choice(np.array([True, False]), size=(X.shape[0], 1), replace=True, + p=(1 - masking_ratio, masking_ratio)), X.shape[1]) + + return mask + + +def geom_noise_mask_single(L, lm, masking_ratio): + """ + Randomly create a boolean mask of length `L`, consisting of subsequences of average length lm, masking with 0s a `masking_ratio` + proportion of the sequence L. The length of masking subsequences and intervals follow a geometric distribution. + Args: + L: length of mask and sequence to be masked + lm: average length of masking subsequences (streaks of 0s) + masking_ratio: proportion of L to be masked + + Returns: + (L,) boolean numpy array intended to mask ('drop') with 0s a sequence of length L + """ + keep_mask = np.ones(L, dtype=bool) + p_m = 1 / lm # probability of each masking sequence stopping. parameter of geometric distribution. + p_u = p_m * masking_ratio / (1 - masking_ratio) # probability of each unmasked sequence stopping. parameter of geometric distribution. + p = [p_m, p_u] + + # Start in state 0 with masking_ratio probability + state = int(np.random.rand() > masking_ratio) # state 0 means masking, 1 means not masking + for i in range(L): + keep_mask[i] = state # here it happens that state and masking value corresponding to state are identical + if np.random.rand() < p[state]: + state = 1 - state + + return keep_mask + + +def padding_mask(lengths, max_len=None): + """ + Used to mask padded positions: creates a (batch_size, max_len) boolean mask from a tensor of sequence lengths, + where 1 means keep element at this position (time step) + """ + batch_size = lengths.numel() + max_len = max_len or lengths.max_val() # trick works because of overloading of 'or' operator for non-boolean types + return (torch.arange(0, max_len, device=lengths.device) + .type_as(lengths) + .repeat(batch_size, 1) + .lt(lengths.unsqueeze(1))) diff --git a/datasets/datasplit.py b/datasets/datasplit.py new file mode 100644 index 0000000..5041fbd --- /dev/null +++ b/datasets/datasplit.py @@ -0,0 +1,189 @@ +import numpy as np +from sklearn import model_selection + + +def split_dataset(data_indices, validation_method, n_splits, validation_ratio, test_set_ratio=0, + test_indices=None, + random_seed=1337, labels=None): + """ + Splits dataset (i.e. the global datasets indices) into a test set and a training/validation set. + The training/validation set is used to produce `n_splits` different configurations/splits of indices. 
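# Editor's demo (not code from this patch) for the masking utilities defined above in
# datasets/dataset.py: the geometric mask keeps roughly (1 - masking_ratio) of the positions,
# in contiguous runs of average length lm.
import numpy as np
from datasets.dataset import geom_noise_mask_single

np.random.seed(0)
mask = geom_noise_mask_single(L=1000, lm=3, masking_ratio=0.15)
print(mask.mean())   # fraction of kept positions, close to 0.85 on average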
+ + Returns: + test_indices: numpy array containing the global datasets indices corresponding to the test set + (empty if test_set_ratio is 0 or None) + train_indices: iterable of `n_splits` (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's training set + val_indices: iterable of `n_splits` (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's validation set + """ + + # Set aside test set, if explicitly defined + if test_indices is not None: + data_indices = np.array([ind for ind in data_indices if ind not in set(test_indices)]) # to keep initial order + + datasplitter = DataSplitter.factory(validation_method, data_indices, labels) # DataSplitter object + + # Set aside a random partition of all data as a test set + if test_indices is None: + if test_set_ratio: # only if test set not explicitly defined + datasplitter.split_testset(test_ratio=test_set_ratio, random_state=random_seed) + test_indices = datasplitter.test_indices + else: + test_indices = [] + # Split train / validation sets + datasplitter.split_validation(n_splits, validation_ratio, random_state=random_seed) + + return datasplitter.train_indices, datasplitter.val_indices, test_indices + + +class DataSplitter(object): + """Factory class, constructing subclasses based on feature type""" + + def __init__(self, data_indices, data_labels=None): + """data_indices = train_val_indices | test_indices""" + + self.data_indices = data_indices # global datasets indices + self.data_labels = data_labels # global raw datasets labels + self.train_val_indices = np.copy(self.data_indices) # global non-test indices (training and validation) + self.test_indices = [] # global test indices + + if data_labels is not None: + self.train_val_labels = np.copy( + self.data_labels) # global non-test labels (includes training and validation) + self.test_labels = [] # global test labels # TODO: maybe not needed + + @staticmethod + def factory(split_type, *args, **kwargs): + if split_type == "StratifiedShuffleSplit": + return StratifiedShuffleSplitter(*args, **kwargs) + if split_type == "ShuffleSplit": + return ShuffleSplitter(*args, **kwargs) + else: + raise ValueError("DataSplitter for '{}' does not exist".format(split_type)) + + def split_testset(self, test_ratio, random_state=1337): + """ + Input: + test_ratio: ratio of test set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + test_indices: numpy array containing the global datasets indices corresponding to the test set + test_labels: numpy array containing the labels corresponding to the test set + """ + + raise NotImplementedError("Please override function in child class") + + def split_validation(self): + """ + Returns: + train_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's training set + val_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's validation set + """ + + raise NotImplementedError("Please override function in child class") + + +class StratifiedShuffleSplitter(DataSplitter): + """ + Returns randomized shuffled folds, which preserve the class proportions of samples in each fold. 
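# Editor's usage sketch (not code from this patch): splitting 100 sample IDs with the helper above.
# 'ShuffleSplit' needs no labels; for 'StratifiedShuffleSplit' a `labels` array must also be passed.
import numpy as np
from datasets.datasplit import split_dataset

ids = np.arange(100)
train_folds, val_folds, test_ids = split_dataset(ids, validation_method="ShuffleSplit", n_splits=3,
                                                 validation_ratio=0.2, test_set_ratio=0.1)
# train_folds[0] and val_folds[0] hold the global IDs of the first fold; test_ids is held out once.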
Differs from k-fold + in that not all samples are evaluated, and samples may be shared across validation sets, + which becomes more probable proportionally to validation_ratio/n_splits. + """ + + def split_testset(self, test_ratio, random_state=1337): + """ + Input: + test_ratio: ratio of test set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + test_indices: numpy array containing the global datasets indices corresponding to the test set + test_labels: numpy array containing the labels corresponding to the test set + """ + + splitter = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=random_state) + # get local indices, i.e. indices in [0, len(data_labels)) + train_val_indices, test_indices = next(splitter.split(X=np.zeros(len(self.data_indices)), y=self.data_labels)) + # return global datasets indices and labels + self.train_val_indices, self.train_val_labels = self.data_indices[train_val_indices], self.data_labels[train_val_indices] + self.test_indices, self.test_labels = self.data_indices[test_indices], self.data_labels[test_indices] + + return + + def split_validation(self, n_splits, validation_ratio, random_state=1337): + """ + Input: + n_splits: number of different, randomized and independent from one-another folds + validation_ratio: ratio of validation set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + train_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's training set + val_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's validation set + """ + + splitter = model_selection.StratifiedShuffleSplit(n_splits=n_splits, test_size=validation_ratio, + random_state=random_state) + # get local indices, i.e. indices in [0, len(train_val_labels)), per fold + train_indices, val_indices = zip(*splitter.split(X=np.zeros(len(self.train_val_labels)), y=self.train_val_labels)) + # return global datasets indices per fold + self.train_indices = [self.train_val_indices[fold_indices] for fold_indices in train_indices] + self.val_indices = [self.train_val_indices[fold_indices] for fold_indices in val_indices] + + return + + +class ShuffleSplitter(DataSplitter): + """ + Returns randomized shuffled folds without requiring or taking into account the sample labels. Differs from k-fold + in that not all samples are evaluated, and samples may be shared across validation sets, + which becomes more probable proportionally to validation_ratio/n_splits. + """ + + def split_testset(self, test_ratio, random_state=1337): + """ + Input: + test_ratio: ratio of test set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + test_indices: numpy array containing the global datasets indices corresponding to the test set + test_labels: numpy array containing the labels corresponding to the test set + """ + + splitter = model_selection.ShuffleSplit(n_splits=1, test_size=test_ratio, random_state=random_state) + # get local indices, i.e. 
indices in [0, len(data_indices)) + train_val_indices, test_indices = next(splitter.split(X=np.zeros(len(self.data_indices)))) + # return global datasets indices and labels + self.train_val_indices = self.data_indices[train_val_indices] + self.test_indices = self.data_indices[test_indices] + if self.data_labels is not None: + self.train_val_labels = self.data_labels[train_val_indices] + self.test_labels = self.data_labels[test_indices] + + return + + def split_validation(self, n_splits, validation_ratio, random_state=1337): + """ + Input: + n_splits: number of different, randomized and independent from one-another folds + validation_ratio: ratio of validation set with respect to the entire dataset. Should result in an absolute number of + samples which is greater or equal to the number of classes + Returns: + train_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's training set + val_indices: iterable of n_splits (num. of folds) numpy arrays, + each array containing the global datasets indices corresponding to a fold's validation set + """ + + splitter = model_selection.ShuffleSplit(n_splits=n_splits, test_size=validation_ratio, + random_state=random_state) + # get local indices, i.e. indices in [0, len(train_val_labels)), per fold + train_indices, val_indices = zip(*splitter.split(X=np.zeros(len(self.train_val_indices)))) + # return global datasets indices per fold + self.train_indices = [self.train_val_indices[fold_indices] for fold_indices in train_indices] + self.val_indices = [self.train_val_indices[fold_indices] for fold_indices in val_indices] + + return diff --git a/datasets/utils.py b/datasets/utils.py new file mode 100644 index 0000000..0db0d35 --- /dev/null +++ b/datasets/utils.py @@ -0,0 +1,605 @@ +""" +Code to load Time Series Regression datasets. From: +https://github.com/ChangWeiTan/TSRegression/blob/master/utils +""" + +import numpy as np +import pandas as pd +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from tqdm import tqdm + +regression_datasets = ["AustraliaRainfall", + "HouseholdPowerConsumption1", + "HouseholdPowerConsumption2", + "BeijingPM25Quality", + "BeijingPM10Quality", + "Covid3Month", + "LiveFuelMoistureContent", + "FloodModeling1", + "FloodModeling2", + "FloodModeling3", + "AppliancesEnergy", + "BenzeneConcentration", + "NewsHeadlineSentiment", + "NewsTitleSentiment", + "BIDMC32RR", + "BIDMC32HR", + "BIDMC32SpO2", + "IEEEPPG", + "PPGDalia"] + + +def uniform_scaling(data, max_len): + """ + This is a function to scale the time series uniformly + :param data: + :param max_len: + :return: + """ + seq_len = len(data) + scaled_data = [data[int(j * seq_len / max_len)] for j in range(max_len)] + + return scaled_data + + +# The following code is adapted from the python package sktime to read .ts file. +class TsFileParseException(Exception): + """ + Should be raised when parsing a .ts file and the format is incorrect. + """ + pass + + +def load_from_tsfile_to_dataframe(full_file_path_and_name, return_separate_X_and_y=True, + replace_missing_vals_with='NaN'): + """Loads data from a .ts file into a Pandas DataFrame. + + Parameters + ---------- + full_file_path_and_name: str + The full pathname of the .ts file to read. + return_separate_X_and_y: bool + true if X and Y values should be returned as separate Data Frames (X) and a numpy array (y), false otherwise. 
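# Editor's usage sketch (not code from this patch) for the loader defined here; the file path is
# hypothetical.
# from datasets.utils import load_from_tsfile_to_dataframe
# X, y = load_from_tsfile_to_dataframe("path/to/AppliancesEnergy_TRAIN.ts",
#                                      return_separate_X_and_y=True,
#                                      replace_missing_vals_with="NaN")
# X.iloc[0, 0]   # a pd.Series holding dimension 0 of sample 0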
+ This is only relevant for data that + replace_missing_vals_with: str + The value that missing values in the text file should be replaced with prior to parsing. + + Returns + ------- + DataFrame, ndarray + If return_separate_X_and_y then a tuple containing a DataFrame and a numpy array containing the relevant time-series and corresponding class values. + DataFrame + If not return_separate_X_and_y then a single DataFrame containing all time-series and (if relevant) a column "class_vals" the associated class values. + """ + + # Initialize flags and variables used when parsing the file + metadata_started = False + data_started = False + + has_problem_name_tag = False + has_timestamps_tag = False + has_univariate_tag = False + has_class_labels_tag = False + has_target_labels_tag = False + has_data_tag = False + + previous_timestamp_was_float = None + previous_timestamp_was_int = None + previous_timestamp_was_timestamp = None + num_dimensions = None + is_first_case = True + instance_list = [] + class_val_list = [] + line_num = 0 + target_labels = False + + # Parse the file + # print(full_file_path_and_name) + with open(full_file_path_and_name, 'r', encoding='utf-8') as file: + for line in tqdm(file): + # print(".", end='') + # Strip white space from start/end of line and change to lowercase for use below + line = line.strip().lower() + # Empty lines are valid at any point in a file + if line: + # Check if this line contains metadata + # Please note that even though metadata is stored in this function it is not currently published externally + if line.startswith("@problemname"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + + if token_len == 1: + raise TsFileParseException("problemname tag requires an associated value") + + problem_name = line[len("@problemname") + 1:] + has_problem_name_tag = True + metadata_started = True + elif line.startswith("@timestamps"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + + if token_len != 2: + raise TsFileParseException("timestamps tag requires an associated Boolean value") + elif tokens[1] == "true": + timestamps = True + elif tokens[1] == "false": + timestamps = False + else: + raise TsFileParseException("invalid timestamps value") + has_timestamps_tag = True + metadata_started = True + elif line.startswith("@univariate"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + if token_len != 2: + raise TsFileParseException("univariate tag requires an associated Boolean value") + elif tokens[1] == "true": + univariate = True + elif tokens[1] == "false": + univariate = False + else: + raise TsFileParseException("invalid univariate value") + + has_univariate_tag = True + metadata_started = True + elif line.startswith("@classlabel"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + + if token_len == 1: + raise TsFileParseException("classlabel tag requires an associated 
Boolean value") + + if tokens[1] == "true": + class_labels = True + elif tokens[1] == "false": + class_labels = False + else: + raise TsFileParseException("invalid classLabel value") + + # Check if we have any associated class values + if token_len == 2 and class_labels: + raise TsFileParseException("if the classlabel tag is true then class values must be supplied") + + has_class_labels_tag = True + class_label_list = [token.strip() for token in tokens[2:]] + metadata_started = True + elif line.startswith("@targetlabel"): + # Check that the data has not started + if data_started: + raise TsFileParseException("metadata must come before data") + + # Check that the associated value is valid + tokens = line.split(' ') + token_len = len(tokens) + + if token_len == 1: + raise TsFileParseException("targetlabel tag requires an associated Boolean value") + + if tokens[1] == "true": + target_labels = True + elif tokens[1] == "false": + target_labels = False + else: + raise TsFileParseException("invalid targetLabel value") + + has_target_labels_tag = True + class_val_list = [] + metadata_started = True + # Check if this line contains the start of data + elif line.startswith("@data"): + if line != "@data": + raise TsFileParseException("data tag should not have an associated value") + + if data_started and not metadata_started: + raise TsFileParseException("metadata must come before data") + else: + has_data_tag = True + data_started = True + # If the 'data tag has been found then metadata has been parsed and data can be loaded + elif data_started: + # Check that a full set of metadata has been provided + incomplete_regression_meta_data = not has_problem_name_tag or not has_timestamps_tag or not has_univariate_tag or not has_target_labels_tag or not has_data_tag + incomplete_classification_meta_data = not has_problem_name_tag or not has_timestamps_tag or not has_univariate_tag or not has_class_labels_tag or not has_data_tag + if incomplete_regression_meta_data and incomplete_classification_meta_data: + raise TsFileParseException("a full set of metadata has not been provided before the data") + + # Replace any missing values with the value specified + line = line.replace("?", replace_missing_vals_with) + + # Check if we dealing with data that has timestamps + if timestamps: + # We're dealing with timestamps so cannot just split line on ':' as timestamps may contain one + has_another_value = False + has_another_dimension = False + + timestamps_for_dimension = [] + values_for_dimension = [] + + this_line_num_dimensions = 0 + line_len = len(line) + char_num = 0 + + while char_num < line_len: + # Move through any spaces + while char_num < line_len and str.isspace(line[char_num]): + char_num += 1 + + # See if there is any more data to read in or if we should validate that read thus far + + if char_num < line_len: + + # See if we have an empty dimension (i.e. 
no values) + if line[char_num] == ":": + if len(instance_list) < (this_line_num_dimensions + 1): + instance_list.append([]) + + instance_list[this_line_num_dimensions].append(pd.Series()) + this_line_num_dimensions += 1 + + has_another_value = False + has_another_dimension = True + + timestamps_for_dimension = [] + values_for_dimension = [] + + char_num += 1 + else: + # Check if we have reached a class label + if line[char_num] != "(" and target_labels: + class_val = line[char_num:].strip() + + # if class_val not in class_val_list: + # raise TsFileParseException( + # "the class value '" + class_val + "' on line " + str( + # line_num + 1) + " is not valid") + + class_val_list.append(float(class_val)) + char_num = line_len + + has_another_value = False + has_another_dimension = False + + timestamps_for_dimension = [] + values_for_dimension = [] + + else: + + # Read in the data contained within the next tuple + + if line[char_num] != "(" and not target_labels: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " does not start with a '('") + + char_num += 1 + tuple_data = "" + + while char_num < line_len and line[char_num] != ")": + tuple_data += line[char_num] + char_num += 1 + + if char_num >= line_len or line[char_num] != ")": + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " does not end with a ')'") + + # Read in any spaces immediately after the current tuple + + char_num += 1 + + while char_num < line_len and str.isspace(line[char_num]): + char_num += 1 + + # Check if there is another value or dimension to process after this tuple + + if char_num >= line_len: + has_another_value = False + has_another_dimension = False + + elif line[char_num] == ",": + has_another_value = True + has_another_dimension = False + + elif line[char_num] == ":": + has_another_value = False + has_another_dimension = True + + char_num += 1 + + # Get the numeric value for the tuple by reading from the end of the tuple data backwards to the last comma + + last_comma_index = tuple_data.rfind(',') + + if last_comma_index == -1: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains a tuple that has no comma inside of it") + + try: + value = tuple_data[last_comma_index + 1:] + value = float(value) + + except ValueError: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains a tuple that does not have a valid numeric value") + + # Check the type of timestamp that we have + + timestamp = tuple_data[0: last_comma_index] + + try: + timestamp = int(timestamp) + timestamp_is_int = True + timestamp_is_timestamp = False + except ValueError: + timestamp_is_int = False + + if not timestamp_is_int: + try: + timestamp = float(timestamp) + timestamp_is_float = True + timestamp_is_timestamp = False + except ValueError: + timestamp_is_float = False + + if not timestamp_is_int and not timestamp_is_float: + try: + timestamp = timestamp.strip() + timestamp_is_timestamp = True + except ValueError: + timestamp_is_timestamp = False + + # Make sure that the timestamps in the file (not just this dimension or case) are consistent + + if not timestamp_is_timestamp and not timestamp_is_int and not timestamp_is_float: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains a tuple 
that has an invalid timestamp '" + timestamp + "'") + + if previous_timestamp_was_float is not None and previous_timestamp_was_float and not timestamp_is_float: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains tuples where the timestamp format is inconsistent") + + if previous_timestamp_was_int is not None and previous_timestamp_was_int and not timestamp_is_int: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains tuples where the timestamp format is inconsistent") + + if previous_timestamp_was_timestamp is not None and previous_timestamp_was_timestamp and not timestamp_is_timestamp: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " contains tuples where the timestamp format is inconsistent") + + # Store the values + + timestamps_for_dimension += [timestamp] + values_for_dimension += [value] + + # If this was our first tuple then we store the type of timestamp we had + + if previous_timestamp_was_timestamp is None and timestamp_is_timestamp: + previous_timestamp_was_timestamp = True + previous_timestamp_was_int = False + previous_timestamp_was_float = False + + if previous_timestamp_was_int is None and timestamp_is_int: + previous_timestamp_was_timestamp = False + previous_timestamp_was_int = True + previous_timestamp_was_float = False + + if previous_timestamp_was_float is None and timestamp_is_float: + previous_timestamp_was_timestamp = False + previous_timestamp_was_int = False + previous_timestamp_was_float = True + + # See if we should add the data for this dimension + + if not has_another_value: + if len(instance_list) < (this_line_num_dimensions + 1): + instance_list.append([]) + + if timestamp_is_timestamp: + timestamps_for_dimension = pd.DatetimeIndex(timestamps_for_dimension) + + instance_list[this_line_num_dimensions].append( + pd.Series(index=timestamps_for_dimension, data=values_for_dimension)) + this_line_num_dimensions += 1 + + timestamps_for_dimension = [] + values_for_dimension = [] + + elif has_another_value: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " ends with a ',' that is not followed by another tuple") + + elif has_another_dimension and target_labels: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " ends with a ':' while it should list a class value") + + elif has_another_dimension and not target_labels: + if len(instance_list) < (this_line_num_dimensions + 1): + instance_list.append([]) + + instance_list[this_line_num_dimensions].append(pd.Series(dtype=np.float32)) + this_line_num_dimensions += 1 + num_dimensions = this_line_num_dimensions + + # If this is the 1st line of data we have seen then note the dimensions + + if not has_another_value and not has_another_dimension: + if num_dimensions is None: + num_dimensions = this_line_num_dimensions + + if num_dimensions != this_line_num_dimensions: + raise TsFileParseException("line " + str( + line_num + 1) + " does not have the same number of dimensions as the previous line of data") + + # Check that we are not expecting some more data, and if not, store that processed above + + if has_another_value: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " ends with a ',' that is 
not followed by another tuple") + + elif has_another_dimension and target_labels: + raise TsFileParseException( + "dimension " + str(this_line_num_dimensions + 1) + " on line " + str( + line_num + 1) + " ends with a ':' while it should list a class value") + + elif has_another_dimension and not target_labels: + if len(instance_list) < (this_line_num_dimensions + 1): + instance_list.append([]) + + instance_list[this_line_num_dimensions].append(pd.Series()) + this_line_num_dimensions += 1 + num_dimensions = this_line_num_dimensions + + # If this is the 1st line of data we have seen then note the dimensions + + if not has_another_value and num_dimensions != this_line_num_dimensions: + raise TsFileParseException("line " + str( + line_num + 1) + " does not have the same number of dimensions as the previous line of data") + + # Check if we should have class values, and if so that they are contained in those listed in the metadata + + if target_labels and len(class_val_list) == 0: + raise TsFileParseException("the cases have no associated class values") + else: + dimensions = line.split(":") + # If first row then note the number of dimensions (that must be the same for all cases) + if is_first_case: + num_dimensions = len(dimensions) + + if target_labels: + num_dimensions -= 1 + + for dim in range(0, num_dimensions): + instance_list.append([]) + is_first_case = False + + # See how many dimensions that the case whose data in represented in this line has + this_line_num_dimensions = len(dimensions) + + if target_labels: + this_line_num_dimensions -= 1 + + # All dimensions should be included for all series, even if they are empty + if this_line_num_dimensions != num_dimensions: + raise TsFileParseException("inconsistent number of dimensions. Expecting " + str( + num_dimensions) + " but have read " + str(this_line_num_dimensions)) + + # Process the data for each dimension + for dim in range(0, num_dimensions): + dimension = dimensions[dim].strip() + + if dimension: + data_series = dimension.split(",") + data_series = [float(i) for i in data_series] + instance_list[dim].append(pd.Series(data_series)) + else: + instance_list[dim].append(pd.Series()) + + if target_labels: + class_val_list.append(float(dimensions[num_dimensions].strip())) + + line_num += 1 + + # Check that the file was not empty + if line_num: + # Check that the file contained both metadata and data + complete_regression_meta_data = has_problem_name_tag and has_timestamps_tag and has_univariate_tag and has_target_labels_tag and has_data_tag + complete_classification_meta_data = has_problem_name_tag and has_timestamps_tag and has_univariate_tag and has_class_labels_tag and has_data_tag + + if metadata_started and not complete_regression_meta_data and not complete_classification_meta_data: + raise TsFileParseException("metadata incomplete") + elif metadata_started and not data_started: + raise TsFileParseException("file contained metadata but no data") + elif metadata_started and data_started and len(instance_list) == 0: + raise TsFileParseException("file contained metadata but no data") + + # Create a DataFrame from the data parsed above + data = pd.DataFrame(dtype=np.float32) + + for dim in range(0, num_dimensions): + data['dim_' + str(dim)] = instance_list[dim] + + # Check if we should return any associated class labels separately + + if target_labels: + if return_separate_X_and_y: + return data, np.asarray(class_val_list) + else: + data['class_vals'] = pd.Series(class_val_list) + return data + else: + return data + else: + raise 
TsFileParseException("empty file") + + +def process_data(X, min_len, normalise=None): + """ + This is a function to process the data, i.e. convert dataframe to numpy array + :param X: + :param min_len: + :param normalise: + :return: + """ + tmp = [] + for i in tqdm(range(len(X))): + _x = X.iloc[i, :].copy(deep=True) + + # 1. find the maximum length of each dimension + all_len = [len(y) for y in _x] + max_len = max(all_len) + + # 2. adjust the length of each dimension + _y = [] + for y in _x: + # 2.1 fill missing values + if y.isnull().any(): + y = y.interpolate(method='linear', limit_direction='both') + + # 2.2. if length of each dimension is different, uniformly scale the shorter ones to the max length + if len(y) < max_len: + y = uniform_scaling(y, max_len) + _y.append(y) + _y = np.array(np.transpose(_y)) + + # 3. adjust the length of the series, chop of the longer series + _y = _y[:min_len, :] + + # 4. normalise the series + if normalise == "standard": + scaler = StandardScaler().fit(_y) + _y = scaler.transform(_y) + if normalise == "minmax": + scaler = MinMaxScaler().fit(_y) + _y = scaler.transform(_y) + + tmp.append(_y) + X = np.array(tmp) + return X \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..f9a5777 --- /dev/null +++ b/main.py @@ -0,0 +1,307 @@ +""" +Written by George Zerveas + +If you use any part of the code in this repository, please consider citing the following paper: +George Zerveas et al. A Transformer-based Framework for Multivariate Time Series Representation Learning, in +Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '21), August 14--18, 2021 +""" + +import logging + +logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) + +logger.info("Loading packages ...") +import os +import sys +import time +import pickle +import json + +# 3rd party packages +from tqdm import tqdm +import torch +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +# Project modules +from options import Options +from running import setup, pipeline_factory, validate, check_progress, NEG_METRICS +from utils import utils +from datasets.data import data_factory, Normalizer +from datasets.datasplit import split_dataset +from models.ts_transformer import model_factory +from models.loss import get_loss_module +from optimizers import get_optimizer + + +def main(config): + + total_epoch_time = 0 + total_eval_time = 0 + + total_start_time = time.time() + + # Add file logging besides stdout + file_handler = logging.FileHandler(os.path.join(config['output_dir'], 'output.log')) + logger.addHandler(file_handler) + + logger.info('Running:\n{}\n'.format(' '.join(sys.argv))) # command used to run + + if config['seed'] is not None: + torch.manual_seed(config['seed']) + + device = torch.device('cuda' if (torch.cuda.is_available() and config['gpu'] != '-1') else 'cpu') + logger.info("Using device: {}".format(device)) + if device == 'cuda': + logger.info("Device index: {}".format(torch.cuda.current_device())) + + # Build data + logger.info("Loading and preprocessing data ...") + data_class = data_factory[config['data_class']] + my_data = data_class(config['data_dir'], pattern=config['pattern'], n_proc=config['n_proc'], limit_size=config['limit_size'], config=config) + feat_dim = my_data.feature_df.shape[1] # dimensionality of data features + if config['task'] == 'classification': + validation_method = 
'StratifiedShuffleSplit' + labels = my_data.labels_df.values.flatten() + else: + validation_method = 'ShuffleSplit' + labels = None + + # Split dataset + test_data = my_data + test_indices = None # will be converted to empty list in `split_dataset`, if also test_set_ratio == 0 + val_data = my_data + val_indices = [] + if config['test_pattern']: # used if test data come from different files / file patterns + test_data = data_class(config['data_dir'], pattern=config['test_pattern'], n_proc=-1, config=config) + test_indices = test_data.all_IDs + if config['test_from']: # load test IDs directly from file, if available, otherwise use `test_set_ratio`. Can work together with `test_pattern` + test_indices = list(set([line.rstrip() for line in open(config['test_from']).readlines()])) + try: + test_indices = [int(ind) for ind in test_indices] # integer indices + except ValueError: + pass # in case indices are non-integers + logger.info("Loaded {} test IDs from file: '{}'".format(len(test_indices), config['test_from'])) + if config['val_pattern']: # used if val data come from different files / file patterns + val_data = data_class(config['data_dir'], pattern=config['val_pattern'], n_proc=-1, config=config) + val_indices = val_data.all_IDs + + # Note: currently a validation set must exist, either with `val_pattern` or `val_ratio` + # Using a `val_pattern` means that `val_ratio` == 0 and `test_ratio` == 0 + if config['val_ratio'] > 0: + train_indices, val_indices, test_indices = split_dataset(data_indices=my_data.all_IDs, + validation_method=validation_method, + n_splits=1, + validation_ratio=config['val_ratio'], + test_set_ratio=config['test_ratio'], # used only if test_indices not explicitly specified + test_indices=test_indices, + random_seed=1337, + labels=labels) + train_indices = train_indices[0] # `split_dataset` returns a list of indices *per fold/split* + val_indices = val_indices[0] # `split_dataset` returns a list of indices *per fold/split* + else: + train_indices = my_data.all_IDs + if test_indices is None: + test_indices = [] + + logger.info("{} samples may be used for training".format(len(train_indices))) + logger.info("{} samples will be used for validation".format(len(val_indices))) + logger.info("{} samples will be used for testing".format(len(test_indices))) + + with open(os.path.join(config['output_dir'], 'data_indices.json'), 'w') as f: + try: + json.dump({'train_indices': list(map(int, train_indices)), + 'val_indices': list(map(int, val_indices)), + 'test_indices': list(map(int, test_indices))}, f, indent=4) + except ValueError: # in case indices are non-integers + json.dump({'train_indices': list(train_indices), + 'val_indices': list(val_indices), + 'test_indices': list(test_indices)}, f, indent=4) + + # Pre-process features + normalizer = None + if config['norm_from']: + with open(config['norm_from'], 'rb') as f: + norm_dict = pickle.load(f) + normalizer = Normalizer(**norm_dict) + elif config['normalization'] is not None: + normalizer = Normalizer(config['normalization']) + my_data.feature_df.loc[train_indices] = normalizer.normalize(my_data.feature_df.loc[train_indices]) + if not config['normalization'].startswith('per_sample'): + # get normalizing values from training set and store for future use + norm_dict = normalizer.__dict__ + with open(os.path.join(config['output_dir'], 'normalization.pickle'), 'wb') as f: + pickle.dump(norm_dict, f, pickle.HIGHEST_PROTOCOL) + if normalizer is not None: + if len(val_indices): + val_data.feature_df.loc[val_indices] = 
normalizer.normalize(val_data.feature_df.loc[val_indices]) + if len(test_indices): + test_data.feature_df.loc[test_indices] = normalizer.normalize(test_data.feature_df.loc[test_indices]) + + # Create model + logger.info("Creating model ...") + model = model_factory(config, my_data) + + if config['freeze']: + for name, param in model.named_parameters(): + if name.startswith('output_layer'): + param.requires_grad = True + else: + param.requires_grad = False + + logger.info("Model:\n{}".format(model)) + logger.info("Total number of parameters: {}".format(utils.count_parameters(model))) + logger.info("Trainable parameters: {}".format(utils.count_parameters(model, trainable=True))) + + + # Initialize optimizer + + if config['global_reg']: + weight_decay = config['l2_reg'] + output_reg = None + else: + weight_decay = 0 + output_reg = config['l2_reg'] + + optim_class = get_optimizer(config['optimizer']) + optimizer = optim_class(model.parameters(), lr=config['lr'], weight_decay=weight_decay) + + start_epoch = 0 + lr_step = 0 # current step index of `lr_step` + lr = config['lr'] # current learning step + # Load model and optimizer state + if args.load_model: + model, optimizer, start_epoch = utils.load_model(model, config['load_model'], optimizer, config['resume'], + config['change_output'], + config['lr'], + config['lr_step'], + config['lr_factor']) + model.to(device) + + loss_module = get_loss_module(config) + + if config['test_only'] == 'testset': # Only evaluate and skip training + dataset_class, collate_fn, runner_class = pipeline_factory(config) + test_dataset = dataset_class(test_data, test_indices) + + test_loader = DataLoader(dataset=test_dataset, + batch_size=config['batch_size'], + shuffle=False, + num_workers=config['num_workers'], + pin_memory=True, + collate_fn=lambda x: collate_fn(x, max_len=model.max_len)) + test_evaluator = runner_class(model, test_loader, device, loss_module, + print_interval=config['print_interval'], console=config['console']) + aggr_metrics_test, per_batch_test = test_evaluator.evaluate(keep_all=True) + print_str = 'Test Summary: ' + for k, v in aggr_metrics_test.items(): + print_str += '{}: {:8f} | '.format(k, v) + logger.info(print_str) + return + + # Initialize data generators + dataset_class, collate_fn, runner_class = pipeline_factory(config) + val_dataset = dataset_class(val_data, val_indices) + + val_loader = DataLoader(dataset=val_dataset, + batch_size=config['batch_size'], + shuffle=False, + num_workers=config['num_workers'], + pin_memory=True, + collate_fn=lambda x: collate_fn(x, max_len=model.max_len)) + + train_dataset = dataset_class(my_data, train_indices) + + train_loader = DataLoader(dataset=train_dataset, + batch_size=config['batch_size'], + shuffle=True, + num_workers=config['num_workers'], + pin_memory=True, + collate_fn=lambda x: collate_fn(x, max_len=model.max_len)) + + trainer = runner_class(model, train_loader, device, loss_module, optimizer, l2_reg=output_reg, + print_interval=config['print_interval'], console=config['console']) + val_evaluator = runner_class(model, val_loader, device, loss_module, + print_interval=config['print_interval'], console=config['console']) + + tensorboard_writer = SummaryWriter(config['tensorboard_dir']) + + best_value = 1e16 if config['key_metric'] in NEG_METRICS else -1e16 # initialize with +inf or -inf depending on key metric + metrics = [] # (for validation) list of lists: for each epoch, stores metrics like loss, ... 
+ best_metrics = {} + + # Evaluate on validation before training + aggr_metrics_val, best_metrics, best_value = validate(val_evaluator, tensorboard_writer, config, best_metrics, + best_value, epoch=0) + metrics_names, metrics_values = zip(*aggr_metrics_val.items()) + metrics.append(list(metrics_values)) + + logger.info('Starting training...') + for epoch in tqdm(range(start_epoch + 1, config["epochs"] + 1), desc='Training Epoch', leave=False): + mark = epoch if config['save_all'] else 'last' + epoch_start_time = time.time() + aggr_metrics_train = trainer.train_epoch(epoch) # dictionary of aggregate epoch metrics + epoch_runtime = time.time() - epoch_start_time + print() + print_str = 'Epoch {} Training Summary: '.format(epoch) + for k, v in aggr_metrics_train.items(): + tensorboard_writer.add_scalar('{}/train'.format(k), v, epoch) + print_str += '{}: {:8f} | '.format(k, v) + logger.info(print_str) + logger.info("Epoch runtime: {} hours, {} minutes, {} seconds\n".format(*utils.readable_time(epoch_runtime))) + total_epoch_time += epoch_runtime + avg_epoch_time = total_epoch_time / (epoch - start_epoch) + avg_batch_time = avg_epoch_time / len(train_loader) + avg_sample_time = avg_epoch_time / len(train_dataset) + logger.info("Avg epoch train. time: {} hours, {} minutes, {} seconds".format(*utils.readable_time(avg_epoch_time))) + logger.info("Avg batch train. time: {} seconds".format(avg_batch_time)) + logger.info("Avg sample train. time: {} seconds".format(avg_sample_time)) + + # evaluate if first or last epoch or at specified interval + if (epoch == config["epochs"]) or (epoch == start_epoch + 1) or (epoch % config['val_interval'] == 0): + aggr_metrics_val, best_metrics, best_value = validate(val_evaluator, tensorboard_writer, config, + best_metrics, best_value, epoch) + metrics_names, metrics_values = zip(*aggr_metrics_val.items()) + metrics.append(list(metrics_values)) + + utils.save_model(os.path.join(config['save_dir'], 'model_{}.pth'.format(mark)), epoch, model, optimizer) + + # Learning rate scheduling + if epoch == config['lr_step'][lr_step]: + utils.save_model(os.path.join(config['save_dir'], 'model_{}.pth'.format(epoch)), epoch, model, optimizer) + lr = lr * config['lr_factor'][lr_step] + if lr_step < len(config['lr_step']) - 1: # so that this index does not get out of bounds + lr_step += 1 + logger.info('Learning rate updated to: {}'.format(lr)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + # Difficulty scheduling + if config['harden'] and check_progress(epoch): + train_loader.dataset.update() + val_loader.dataset.update() + + # Export evolution of metrics over epochs + header = metrics_names + metrics_filepath = os.path.join(config["output_dir"], "metrics_" + config["experiment_name"] + ".xls") + book = utils.export_performance_metrics(metrics_filepath, metrics, header, sheet_name="metrics") + + # Export record metrics to a file accumulating records from all experiments + utils.register_record(config["records_file"], config["initial_timestamp"], config["experiment_name"], + best_metrics, aggr_metrics_val, comment=config['comment']) + + logger.info('Best {} was {}. 
Other metrics: {}'.format(config['key_metric'], best_value, best_metrics)) + logger.info('All Done!') + + total_runtime = time.time() - total_start_time + logger.info("Total runtime: {} hours, {} minutes, {} seconds\n".format(*utils.readable_time(total_runtime))) + + return best_value + + +if __name__ == '__main__': + + args = Options().parse() # `argsparse` object + config = setup(args) # configuration dictionary + main(config) diff --git a/models/loss.py b/models/loss.py new file mode 100644 index 0000000..f57c68e --- /dev/null +++ b/models/loss.py @@ -0,0 +1,74 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + +def get_loss_module(config): + + task = config['task'] + + if (task == "imputation") or (task == "transduction"): + return MaskedMSELoss(reduction='none') # outputs loss for each batch element + + if task == "classification": + return NoFussCrossEntropyLoss(reduction='none') # outputs loss for each batch sample + + if task == "regression": + return nn.MSELoss(reduction='none') # outputs loss for each batch sample + + else: + raise ValueError("Loss module for task '{}' does not exist".format(task)) + + +def l2_reg_loss(model): + """Returns the squared L2 norm of output layer of given model""" + + for name, param in model.named_parameters(): + if name == 'output_layer.weight': + return torch.sum(torch.square(param)) + + +class NoFussCrossEntropyLoss(nn.CrossEntropyLoss): + """ + pytorch's CrossEntropyLoss is fussy: 1) needs Long (int64) targets only, and 2) only 1D. + This function satisfies these requirements + """ + + def forward(self, inp, target): + return F.cross_entropy(inp, target.long().squeeze(), weight=self.weight, + ignore_index=self.ignore_index, reduction=self.reduction) + + +class MaskedMSELoss(nn.Module): + """ Masked MSE Loss + """ + + def __init__(self, reduction: str = 'mean'): + + super().__init__() + + self.reduction = reduction + self.mse_loss = nn.MSELoss(reduction=self.reduction) + + def forward(self, + y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: + """Compute the loss between a target value and a prediction. + + Args: + y_pred: Estimated values + y_true: Target values + mask: boolean tensor with 0s at places where values should be ignored and 1s where they should be considered + + Returns + ------- + if reduction == 'none': + (num_active,) Loss for each active batch element as a tensor with gradient attached. + if reduction == 'mean': + scalar mean loss over batch as a tensor with gradient attached. 
+ """ + + # for this particular loss, one may also elementwise multiply y_pred and y_true with the inverted mask + masked_pred = torch.masked_select(y_pred, mask) + masked_true = torch.masked_select(y_true, mask) + + return self.mse_loss(masked_pred, masked_true) diff --git a/models/ts_transformer.py b/models/ts_transformer.py new file mode 100644 index 0000000..025bf3d --- /dev/null +++ b/models/ts_transformer.py @@ -0,0 +1,314 @@ +from typing import Optional, Any +import math + +import torch +from torch import nn, Tensor +from torch.nn import functional as F +from torch.nn.modules import MultiheadAttention, Linear, Dropout, BatchNorm1d, TransformerEncoderLayer + + +def model_factory(config, data): + task = config['task'] + feat_dim = data.feature_df.shape[1] # dimensionality of data features + # data windowing is used when samples don't have a predefined length or the length is too long + max_seq_len = config['data_window_len'] if config['data_window_len'] is not None else config['max_seq_len'] + if max_seq_len is None: + try: + max_seq_len = data.max_seq_len + except AttributeError as x: + print("Data class does not define a maximum sequence length, so it must be defined with the script argument `max_seq_len`") + raise x + + if (task == "imputation") or (task == "transduction"): + if config['model'] == 'LINEAR': + return DummyTSTransformerEncoder(feat_dim, max_seq_len, config['d_model'], config['num_heads'], + config['num_layers'], config['dim_feedforward'], dropout=config['dropout'], + pos_encoding=config['pos_encoding'], activation=config['activation'], + norm=config['normalization_layer'], freeze=config['freeze']) + elif config['model'] == 'transformer': + return TSTransformerEncoder(feat_dim, max_seq_len, config['d_model'], config['num_heads'], + config['num_layers'], config['dim_feedforward'], dropout=config['dropout'], + pos_encoding=config['pos_encoding'], activation=config['activation'], + norm=config['normalization_layer'], freeze=config['freeze']) + + if (task == "classification") or (task == "regression"): + num_labels = len(data.class_names) if task == "classification" else data.labels_df.shape[1] # dimensionality of labels + if config['model'] == 'LINEAR': + return DummyTSTransformerEncoderClassiregressor(feat_dim, max_seq_len, config['d_model'], + config['num_heads'], + config['num_layers'], config['dim_feedforward'], + num_classes=num_labels, + dropout=config['dropout'], pos_encoding=config['pos_encoding'], + activation=config['activation'], + norm=config['normalization_layer'], freeze=config['freeze']) + elif config['model'] == 'transformer': + return TSTransformerEncoderClassiregressor(feat_dim, max_seq_len, config['d_model'], + config['num_heads'], + config['num_layers'], config['dim_feedforward'], + num_classes=num_labels, + dropout=config['dropout'], pos_encoding=config['pos_encoding'], + activation=config['activation'], + norm=config['normalization_layer'], freeze=config['freeze']) + else: + raise ValueError("Model class for task '{}' does not exist".format(task)) + + +def _get_activation_fn(activation): + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + raise ValueError("activation should be relu/gelu, not {}".format(activation)) + + +# From https://github.com/pytorch/examples/blob/master/word_language_model/model.py +class FixedPositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. 
The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=1024). + """ + + def __init__(self, d_model, dropout=0.1, max_len=1024, scale_factor=1.0): + super(FixedPositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) # positional encoding + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = scale_factor * pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) # this stores the variable in the state_dict (used for non-trainable variables) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + """ + + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + +class LearnablePositionalEncoding(nn.Module): + + def __init__(self, d_model, dropout=0.1, max_len=1024): + super(LearnablePositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + # Each position gets its own embedding + # Since indices are always 0 ... max_len, we don't have to do a look-up + self.pe = nn.Parameter(torch.empty(max_len, 1, d_model)) # requires_grad automatically set to True + nn.init.uniform_(self.pe, -0.02, 0.02) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + """ + + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + +def get_pos_encoder(pos_encoding): + if pos_encoding == "learnable": + return LearnablePositionalEncoding + elif pos_encoding == "fixed": + return FixedPositionalEncoding + + raise NotImplementedError("pos_encoding should be 'learnable'/'fixed', not '{}'".format(pos_encoding)) + + +class TransformerBatchNormEncoderLayer(nn.modules.Module): + r"""This transformer encoder layer block is made up of self-attn and feedforward network. + It differs from TransformerEncoderLayer in torch/nn/modules/transformer.py in that it replaces LayerNorm + with BatchNorm. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of intermediate layer, relu or gelu (default=relu). 
+ """ + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): + super(TransformerBatchNormEncoderLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model) + + self.norm1 = BatchNorm1d(d_model, eps=1e-5) # normalizes each feature across batch samples and time steps + self.norm2 = BatchNorm1d(d_model, eps=1e-5) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerBatchNormEncoderLayer, self).__setstate__(state) + + def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + + Shape: + see the docs in Transformer class. + """ + src2 = self.self_attn(src, src, src, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) # (seq_len, batch_size, d_model) + src = src.permute(1, 2, 0) # (batch_size, d_model, seq_len) + # src = src.reshape([src.shape[0], -1]) # (batch_size, seq_length * d_model) + src = self.norm1(src) + src = src.permute(2, 0, 1) # restore (seq_len, batch_size, d_model) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) # (seq_len, batch_size, d_model) + src = src.permute(1, 2, 0) # (batch_size, d_model, seq_len) + src = self.norm2(src) + src = src.permute(2, 0, 1) # restore (seq_len, batch_size, d_model) + return src + + +class TSTransformerEncoder(nn.Module): + + def __init__(self, feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward, dropout=0.1, + pos_encoding='fixed', activation='gelu', norm='BatchNorm', freeze=False): + super(TSTransformerEncoder, self).__init__() + + self.max_len = max_len + self.d_model = d_model + self.n_heads = n_heads + + self.project_inp = nn.Linear(feat_dim, d_model) + self.pos_enc = get_pos_encoder(pos_encoding)(d_model, dropout=dropout*(1.0 - freeze), max_len=max_len) + + if norm == 'LayerNorm': + encoder_layer = TransformerEncoderLayer(d_model, self.n_heads, dim_feedforward, dropout*(1.0 - freeze), activation=activation) + else: + encoder_layer = TransformerBatchNormEncoderLayer(d_model, self.n_heads, dim_feedforward, dropout*(1.0 - freeze), activation=activation) + + self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers) + + self.output_layer = nn.Linear(d_model, feat_dim) + + self.act = _get_activation_fn(activation) + + self.dropout1 = nn.Dropout(dropout) + + self.feat_dim = feat_dim + + def forward(self, X, padding_masks): + """ + Args: + X: (batch_size, seq_length, feat_dim) torch tensor of masked features (input) + padding_masks: (batch_size, seq_length) boolean tensor, 1 means keep vector at this position, 0 means padding + Returns: + output: (batch_size, seq_length, feat_dim) + """ + + # permute because pytorch convention for transformers is [seq_length, batch_size, feat_dim]. 
padding_masks [batch_size, seq_length] + inp = X.permute(1, 0, 2) + inp = self.project_inp(inp) * math.sqrt( + self.d_model) # [seq_length, batch_size, d_model] project input vectors to d_model dimensional space + inp = self.pos_enc(inp) # add positional encoding + # NOTE: logic for padding masks is reversed to comply with definition in MultiHeadAttention, TransformerEncoderLayer + output = self.transformer_encoder(inp, src_key_padding_mask=~padding_masks) # (seq_length, batch_size, d_model) + output = self.act(output) # the output transformer encoder/decoder embeddings don't include non-linearity + output = output.permute(1, 0, 2) # (batch_size, seq_length, d_model) + output = self.dropout1(output) + # Most probably defining a Linear(d_model,feat_dim) vectorizes the operation over (seq_length, batch_size). + output = self.output_layer(output) # (batch_size, seq_length, feat_dim) + + return output + + +class TSTransformerEncoderClassiregressor(nn.Module): + """ + Simplest classifier/regressor. Can be either regressor or classifier because the output does not include + softmax. Concatenates final layer embeddings and uses 0s to ignore padding embeddings in final output layer. + """ + + def __init__(self, feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward, num_classes, + dropout=0.1, pos_encoding='fixed', activation='gelu', norm='BatchNorm', freeze=False): + super(TSTransformerEncoderClassiregressor, self).__init__() + + self.max_len = max_len + self.d_model = d_model + self.n_heads = n_heads + + self.project_inp = nn.Linear(feat_dim, d_model) + self.pos_enc = get_pos_encoder(pos_encoding)(d_model, dropout=dropout*(1.0 - freeze), max_len=max_len) + + if norm == 'LayerNorm': + encoder_layer = TransformerEncoderLayer(d_model, self.n_heads, dim_feedforward, dropout*(1.0 - freeze), activation=activation) + else: + encoder_layer = TransformerBatchNormEncoderLayer(d_model, self.n_heads, dim_feedforward, dropout*(1.0 - freeze), activation=activation) + + self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers) + + self.act = _get_activation_fn(activation) + + self.dropout1 = nn.Dropout(dropout) + + self.feat_dim = feat_dim + self.num_classes = num_classes + self.output_layer = self.build_output_module(d_model, max_len, num_classes) + + def build_output_module(self, d_model, max_len, num_classes): + output_layer = nn.Linear(d_model * max_len, num_classes) + # no softmax (or log softmax), because CrossEntropyLoss does this internally. If probabilities are needed, + # add F.log_softmax and use NLLLoss + return output_layer + + def forward(self, X, padding_masks): + """ + Args: + X: (batch_size, seq_length, feat_dim) torch tensor of masked features (input) + padding_masks: (batch_size, seq_length) boolean tensor, 1 means keep vector at this position, 0 means padding + Returns: + output: (batch_size, num_classes) + """ + + # permute because pytorch convention for transformers is [seq_length, batch_size, feat_dim]. 
padding_masks [batch_size, feat_dim] + inp = X.permute(1, 0, 2) + inp = self.project_inp(inp) * math.sqrt( + self.d_model) # [seq_length, batch_size, d_model] project input vectors to d_model dimensional space + inp = self.pos_enc(inp) # add positional encoding + # NOTE: logic for padding masks is reversed to comply with definition in MultiHeadAttention, TransformerEncoderLayer + output = self.transformer_encoder(inp, src_key_padding_mask=~padding_masks) # (seq_length, batch_size, d_model) + output = self.act(output) # the output transformer encoder/decoder embeddings don't include non-linearity + output = output.permute(1, 0, 2) # (batch_size, seq_length, d_model) + output = self.dropout1(output) + + # Output + output = output * padding_masks.unsqueeze(-1) # zero-out padding embeddings + output = output.reshape(output.shape[0], -1) # (batch_size, seq_length * d_model) + output = self.output_layer(output) # (batch_size, num_classes) + + return output + diff --git a/optimizers.py b/optimizers.py new file mode 100644 index 0000000..ed667b6 --- /dev/null +++ b/optimizers.py @@ -0,0 +1,259 @@ +import math +import torch +from torch.optim.optimizer import Optimizer + + +def get_optimizer(name): + + if name == "Adam": + return torch.optim.Adam + elif name == "RAdam": + return RAdam + + +# from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam/radam.py +class RAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + for param in params: + if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): + param['buffer'] = [[None, None, None] for _ in range(10)] + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + buffer=[[None, None, None] for _ in range(10)]) + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + buffered = group['buffer'][int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - 
beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt( + (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( + N_sma_max - 2)) / (1 - beta1 ** state['step']) + elif self.degenerated_to_sgd: + step_size = 1.0 / (1 - beta1 ** state['step']) + else: + step_size = -1 + buffered[2] = step_size + + # more conservative since it's an approximated value + if N_sma >= 5: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) + p.data.copy_(p_data_fp32) + elif step_size > 0: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + p_data_fp32.add_(-step_size * group['lr'], exp_avg) + p.data.copy_(p_data_fp32) + + return loss + + +class PlainRAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + + super(PlainRAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(PlainRAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + + # more conservative since it's an approximated value + if N_sma >= 5: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + step_size = group['lr'] * math.sqrt( + (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( + N_sma_max - 2)) / (1 - beta1 ** state['step']) + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + p.data.copy_(p_data_fp32) + elif self.degenerated_to_sgd: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + step_size = group['lr'] / (1 - beta1 ** state['step']) + p_data_fp32.add_(-step_size, exp_avg) + p.data.copy_(p_data_fp32) + + return loss + + 
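A minimal usage sketch for the optimizer factory above (not part of the patch; the toy model, input tensor and hyperparameter values are illustrative assumptions): get_optimizer returns an optimizer class, which is then instantiated on the model's parameters.

import torch
from torch import nn
from optimizers import get_optimizer  # assumes this module is saved as optimizers.py, as in the diff above

# Toy model, used here only to provide some parameters to optimize.
toy_model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))

optim_class = get_optimizer("Adam")  # returns torch.optim.Adam; "RAdam" would return the RAdam class above
optimizer = optim_class(toy_model.parameters(), lr=1e-3, weight_decay=1e-4)

# One dummy update step.
loss = toy_model(torch.randn(4, 8)).sum()
optimizer.zero_grad()
loss.backward()
optimizer.step()

main.py builds its optimizer the same way: optim_class = get_optimizer(config['optimizer']) followed by optim_class(model.parameters(), lr=config['lr'], weight_decay=weight_decay).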
+class AdamW(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, warmup=warmup) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + denom = exp_avg_sq.sqrt().add_(group['eps']) + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + if group['warmup'] > state['step']: + scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] + else: + scheduled_lr = group['lr'] + + step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1 + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) + + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + + p.data.copy_(p_data_fp32) + + return loss diff --git a/options.py b/options.py new file mode 100644 index 0000000..9061b2b --- /dev/null +++ b/options.py @@ -0,0 +1,185 @@ +import argparse + + +class Options(object): + + def __init__(self): + + # Handle command line arguments + self.parser = argparse.ArgumentParser( + description='Run a complete training pipeline. Optionally, a JSON configuration file can be used, to overwrite command-line arguments.') + + ## Run from config file + self.parser.add_argument('--config', dest='config_filepath', + help='Configuration .json file (optional). Overwrites existing command-line args!') + + ## Run from command-line arguments + # I/O + self.parser.add_argument('--output_dir', default='./output', + help='Root output directory. Must exist. 
Time-stamped directories will be created inside.') + self.parser.add_argument('--data_dir', default='./data', + help='Data directory') + self.parser.add_argument('--load_model', + help='Path to pre-trained model.') + self.parser.add_argument('--resume', action='store_true', + help='If set, will load `starting_epoch` and state of optimizer, besides model weights.') + self.parser.add_argument('--change_output', action='store_true', + help='Whether the loaded model will be fine-tuned on a different task (necessitating a different output layer)') + self.parser.add_argument('--save_all', action='store_true', + help='If set, will save model weights (and optimizer state) for every epoch; otherwise just latest') + self.parser.add_argument('--name', dest='experiment_name', default='', + help='A string identifier/name for the experiment to be run - it will be appended to the output directory name, before the timestamp') + self.parser.add_argument('--comment', type=str, default='', help='A comment/description of the experiment') + self.parser.add_argument('--no_timestamp', action='store_true', + help='If set, a timestamp will not be appended to the output directory name') + self.parser.add_argument('--records_file', default='./records.xls', + help='Excel file keeping all records of experiments') + # System + self.parser.add_argument('--console', action='store_true', + help="Optimize printout for console output; otherwise for file") + self.parser.add_argument('--print_interval', type=int, default=1, + help='Print batch info every this many batches') + self.parser.add_argument('--gpu', type=str, default='0', + help='GPU index, -1 for CPU') + self.parser.add_argument('--n_proc', type=int, default=-1, + help='Number of processes for data loading/preprocessing. By default, equals num. of available cores.') + self.parser.add_argument('--num_workers', type=int, default=0, + help='dataloader threads. 0 for single-thread.') + self.parser.add_argument('--seed', + help='Seed used for splitting sets. None by default, set to an integer for reproducibility') + # Dataset + self.parser.add_argument('--limit_size', type=float, default=None, + help="Limit dataset to specified smaller random sample, e.g. for rapid debugging purposes. " + "If in [0,1], it will be interpreted as a proportion of the dataset, " + "otherwise as an integer absolute number of samples") + self.parser.add_argument('--test_only', choices={'testset', 'fold_transduction'}, + help='If set, no training will take place; instead, trained model will be loaded and evaluated on test set') + self.parser.add_argument('--data_class', type=str, default='weld', + help="Which type of data should be processed.") + self.parser.add_argument('--labels', type=str, + help="In case a dataset contains several labels (multi-task), " + "which type of labels should be used in regression or classification, i.e. name of column(s).") + self.parser.add_argument('--test_from', + help='If given, will read test IDs from specified text file containing sample IDs one in each row') + self.parser.add_argument('--test_ratio', type=float, default=0, + help="Set aside this proportion of the dataset as a test set") + self.parser.add_argument('--val_ratio', type=float, default=0.2, + help="Proportion of the dataset to be used as a validation set") + self.parser.add_argument('--pattern', type=str, + help='Regex pattern used to select files contained in `data_dir`. 
If None, all data will be used.') + self.parser.add_argument('--val_pattern', type=str, + help="""Regex pattern used to select files contained in `data_dir` exclusively for the validation set. + If None, a positive `val_ratio` will be used to reserve part of the common data set.""") + self.parser.add_argument('--test_pattern', type=str, + help="""Regex pattern used to select files contained in `data_dir` exclusively for the test set. + If None, `test_ratio`, if specified, will be used to reserve part of the common data set.""") + self.parser.add_argument('--normalization', + choices={'standardization', 'minmax', 'per_sample_std', 'per_sample_minmax'}, + default='standardization', + help='If specified, will apply normalization on the input features of a dataset.') + self.parser.add_argument('--norm_from', + help="""If given, will read normalization values (e.g. mean, std, min, max) from specified pickle file. + The columns correspond to features, rows correspond to mean, std or min, max.""") + self.parser.add_argument('--subsample_factor', type=int, + help='Sub-sampling factor used for long sequences: keep every kth sample') + # Training process + self.parser.add_argument('--task', choices={"imputation", "transduction", "classification", "regression"}, + default="imputation", + help=("Training objective/task: imputation of masked values,\n" + " transduction of features to other features,\n" + " classification of entire time series,\n" + " regression of scalar(s) for entire time series")) + self.parser.add_argument('--masking_ratio', type=float, default=0.15, + help='Imputation: mask this proportion of each variable') + self.parser.add_argument('--mean_mask_length', type=float, default=3, + help="Imputation: the desired mean length of masked segments. Used only when `mask_distribution` is 'geometric'.") + self.parser.add_argument('--mask_mode', choices={'separate', 'concurrent'}, default='separate', + help=("Imputation: whether each variable should be masked separately " + "or all variables at a certain positions should be masked concurrently")) + self.parser.add_argument('--mask_distribution', choices={'geometric', 'bernoulli'}, default='geometric', + help=("Imputation: whether each mask sequence element is sampled independently at random" + "or whether sampling follows a markov chain (stateful), resulting in " + "geometric distributions of masked squences of a desired mean_mask_length")) + self.parser.add_argument('--exclude_feats', type=str, default=None, + help='Imputation: Comma separated string of indices corresponding to features to be excluded from masking') + self.parser.add_argument('--mask_feats', type=str, default='0, 1', + help='Transduction: Comma separated string of indices corresponding to features to be masked') + self.parser.add_argument('--start_hint', type=float, default=0.0, + help='Transduction: proportion at the beginning of time series which will not be masked') + self.parser.add_argument('--end_hint', type=float, default=0.0, + help='Transduction: proportion at the end of time series which will not be masked') + self.parser.add_argument('--harden', action='store_true', + help='Makes training objective progressively harder, by masking more of the input') + + self.parser.add_argument('--epochs', type=int, default=400, + help='Number of training epochs') + self.parser.add_argument('--val_interval', type=int, default=2, + help='Evaluate on validation set every this many epochs. 
Must be >= 1.') + self.parser.add_argument('--optimizer', choices={"Adam", "RAdam"}, default="Adam", help="Optimizer") + self.parser.add_argument('--lr', type=float, default=1e-3, + help='learning rate (default holds for batch size 64)') + self.parser.add_argument('--lr_step', type=str, default='1000000', + help='Comma separated string of epochs when to reduce learning rate by a factor of 10.' + ' The default is a large value, meaning that the learning rate will not change.') + self.parser.add_argument('--lr_factor', type=str, default='0.1', + help=("Comma separated string of multiplicative factors to be applied to lr " + "at corresponding steps specified in `lr_step`. If a single value is provided, " + "it will be replicated to match the number of steps in `lr_step`.")) + self.parser.add_argument('--batch_size', type=int, default=64, + help='Training batch size') + self.parser.add_argument('--l2_reg', type=float, default=0, + help='L2 weight regularization parameter') + self.parser.add_argument('--global_reg', action='store_true', + help='If set, L2 regularization will be applied to all weights instead of only the output layer') + self.parser.add_argument('--key_metric', choices={'loss', 'accuracy', 'precision'}, default='loss', + help='Metric used for defining best epoch') + self.parser.add_argument('--freeze', action='store_true', + help='If set, freezes all layer parameters except for the output layer. Also removes dropout except before the output layer') + + # Model + self.parser.add_argument('--model', choices={"transformer", "LINEAR"}, default="transformer", + help="Model class") + self.parser.add_argument('--max_seq_len', type=int, + help="""Maximum input sequence length. Determines size of transformer layers. + If not provided, then the value defined inside the data class will be used.""") + self.parser.add_argument('--data_window_len', type=int, + help="""Used instead of the `max_seq_len`, when the data samples must be + segmented into windows. 
Determines maximum input sequence length + (size of transformer layers).""") + self.parser.add_argument('--d_model', type=int, default=64, + help='Internal dimension of transformer embeddings') + self.parser.add_argument('--dim_feedforward', type=int, default=256, + help='Dimension of dense feedforward part of transformer layer') + self.parser.add_argument('--num_heads', type=int, default=8, + help='Number of multi-headed attention heads') + self.parser.add_argument('--num_layers', type=int, default=3, + help='Number of transformer encoder layers (blocks)') + self.parser.add_argument('--dropout', type=float, default=0.1, + help='Dropout applied to most transformer encoder layers') + self.parser.add_argument('--pos_encoding', choices={'fixed', 'learnable'}, default='fixed', + help='Internal dimension of transformer embeddings') + self.parser.add_argument('--activation', choices={'relu', 'gelu'}, default='gelu', + help='Activation to be used in transformer encoder') + self.parser.add_argument('--normalization_layer', choices={'BatchNorm', 'LayerNorm'}, default='BatchNorm', + help='Normalization layer to be used internally in transformer encoder') + + def parse(self): + + args = self.parser.parse_args() + + args.lr_step = [int(i) for i in args.lr_step.split(',')] + args.lr_factor = [float(i) for i in args.lr_factor.split(',')] + if (len(args.lr_step) > 1) and (len(args.lr_factor) == 1): + args.lr_factor = len(args.lr_step) * args.lr_factor # replicate + assert len(args.lr_step) == len( + args.lr_factor), "You must specify as many values in `lr_step` as in `lr_factors`" + + if args.exclude_feats is not None: + args.exclude_feats = [int(i) for i in args.exclude_feats.split(',')] + args.mask_feats = [int(i) for i in args.mask_feats.split(',')] + + if args.val_pattern is not None: + args.val_ratio = 0 + args.test_ratio = 0 + + return args diff --git a/running.py b/running.py new file mode 100644 index 0000000..6231360 --- /dev/null +++ b/running.py @@ -0,0 +1,500 @@ +import logging +import sys +import os +import traceback +import json +from datetime import datetime +import string +import random +from collections import OrderedDict +import time +import pickle +from functools import partial + +import ipdb +import torch +from torch.utils.data import DataLoader +import numpy as np +import sklearn + +from utils import utils, analysis +from models.loss import l2_reg_loss +from datasets.dataset import ImputationDataset, TransductionDataset, ClassiregressionDataset, collate_unsuperv, collate_superv + + +logger = logging.getLogger('__main__') + +NEG_METRICS = {'loss'} # metrics for which "better" is less + +val_times = {"total_time": 0, "count": 0} + + +def pipeline_factory(config): + """For the task specified in the configuration returns the corresponding combination of + Dataset class, collate function and Runner class.""" + + task = config['task'] + + if task == "imputation": + return partial(ImputationDataset, mean_mask_length=config['mean_mask_length'], + masking_ratio=config['masking_ratio'], mode=config['mask_mode'], + distribution=config['mask_distribution'], exclude_feats=config['exclude_feats']),\ + collate_unsuperv, UnsupervisedRunner + if task == "transduction": + return partial(TransductionDataset, mask_feats=config['mask_feats'], + start_hint=config['start_hint'], end_hint=config['end_hint']), collate_unsuperv, UnsupervisedRunner + if (task == "classification") or (task == "regression"): + return ClassiregressionDataset, collate_superv, SupervisedRunner + else: + raise 
NotImplementedError("Task '{}' not implemented".format(task)) + + +def setup(args): + """Prepare training session: read configuration from file (takes precedence), create directories. + Input: + args: arguments object from argparse + Returns: + config: configuration dictionary + """ + + config = args.__dict__ # configuration dictionary + + if args.config_filepath is not None: + logger.info("Reading configuration ...") + try: # dictionary containing the entire configuration settings in a hierarchical fashion + config.update(utils.load_config(args.config_filepath)) + except: + logger.critical("Failed to load configuration file. Check JSON syntax and verify that files exist") + traceback.print_exc() + sys.exit(1) + + # Create output directory + initial_timestamp = datetime.now() + output_dir = config['output_dir'] + if not os.path.isdir(output_dir): + raise IOError( + "Root directory '{}', where the directory of the experiment will be created, must exist".format(output_dir)) + + output_dir = os.path.join(output_dir, config['experiment_name']) + + formatted_timestamp = initial_timestamp.strftime("%Y-%m-%d_%H-%M-%S") + config['initial_timestamp'] = formatted_timestamp + if (not config['no_timestamp']) or (len(config['experiment_name']) == 0): + rand_suffix = "".join(random.choices(string.ascii_letters + string.digits, k=3)) + output_dir += "_" + formatted_timestamp + "_" + rand_suffix + config['output_dir'] = output_dir + config['save_dir'] = os.path.join(output_dir, 'checkpoints') + config['pred_dir'] = os.path.join(output_dir, 'predictions') + config['tensorboard_dir'] = os.path.join(output_dir, 'tb_summaries') + utils.create_dirs([config['save_dir'], config['pred_dir'], config['tensorboard_dir']]) + + # Save configuration as a (pretty) json file + with open(os.path.join(output_dir, 'configuration.json'), 'w') as fp: + json.dump(config, fp, indent=4, sort_keys=True) + + logger.info("Stored configuration file in '{}'".format(output_dir)) + + return config + + +def fold_evaluate(dataset, model, device, loss_module, target_feats, config, dataset_name): + + allfolds = {'target_feats': target_feats, # list of len(num_folds), each element: list of target feature integer indices + 'predictions': [], # list of len(num_folds), each element: (num_samples, seq_len, feat_dim) prediction per sample + 'targets': [], # list of len(num_folds), each element: (num_samples, seq_len, feat_dim) target/original input per sample + 'target_masks': [], # list of len(num_folds), each element: (num_samples, seq_len, feat_dim) boolean mask per sample + 'metrics': [], # list of len(num_folds), each element: (num_samples, num_metrics) metric per sample + 'IDs': []} # list of len(num_folds), each element: (num_samples,) ID per sample + + for i, tgt_feats in enumerate(target_feats): + + dataset.mask_feats = tgt_feats # set the transduction target features + + loader = DataLoader(dataset=dataset, + batch_size=config['batch_size'], + shuffle=False, + num_workers=config['num_workers'], + pin_memory=True, + collate_fn=lambda x: collate_unsuperv(x, max_len=config['max_seq_len'])) + + evaluator = UnsupervisedRunner(model, loader, device, loss_module, + print_interval=config['print_interval'], console=config['console']) + + logger.info("Evaluating {} set, fold: {}, target features: {}".format(dataset_name, i, tgt_feats)) + aggr_metrics, per_batch = evaluate(evaluator) + + metrics_array = convert_metrics_per_batch_to_per_sample(per_batch['metrics'], per_batch['target_masks']) + metrics_array = np.concatenate(metrics_array, axis=0) 
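+        # After concatenation, metrics_array has shape (num_samples, num_metrics): each row is the
+        # metric for one sample, obtained by summing the per-element losses over its active (masked) positions.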
+ allfolds['metrics'].append(metrics_array) + allfolds['predictions'].append(np.concatenate(per_batch['predictions'], axis=0)) + allfolds['targets'].append(np.concatenate(per_batch['targets'], axis=0)) + allfolds['target_masks'].append(np.concatenate(per_batch['target_masks'], axis=0)) + allfolds['IDs'].append(np.concatenate(per_batch['IDs'], axis=0)) + + metrics_mean = np.mean(metrics_array, axis=0) + metrics_std = np.std(metrics_array, axis=0) + for m, metric_name in enumerate(list(aggr_metrics.items())[1:]): + logger.info("{}:: Mean: {:.3f}, std: {:.3f}".format(metric_name, metrics_mean[m], metrics_std[m])) + + pred_filepath = os.path.join(config['pred_dir'], dataset_name + '_fold_transduction_predictions.pickle') + logger.info("Serializing predictions into {} ... ".format(pred_filepath)) + with open(pred_filepath, 'wb') as f: + pickle.dump(allfolds, f, pickle.HIGHEST_PROTOCOL) + + +def convert_metrics_per_batch_to_per_sample(metrics, target_masks): + """ + Args: + metrics: list of len(num_batches), each element: list of len(num_metrics), each element: (num_active_in_batch,) metric per element + target_masks: list of len(num_batches), each element: (batch_size, seq_len, feat_dim) boolean mask: 1s active, 0s ignore + Returns: + metrics_array = list of len(num_batches), each element: (batch_size, num_metrics) metric per sample + """ + metrics_array = [] + for b, batch_target_masks in enumerate(target_masks): + num_active_per_sample = np.sum(batch_target_masks, axis=(1, 2)) + batch_metrics = np.stack(metrics[b], axis=1) # (num_active_in_batch, num_metrics) + ind = 0 + metrics_per_sample = np.zeros((len(num_active_per_sample), batch_metrics.shape[1])) # (batch_size, num_metrics) + for n, num_active in enumerate(num_active_per_sample): + new_ind = ind + num_active + metrics_per_sample[n, :] = np.sum(batch_metrics[ind:new_ind, :], axis=0) + ind = new_ind + metrics_array.append(metrics_per_sample) + return metrics_array + + +def evaluate(evaluator): + """Perform a single, one-off evaluation on an evaluator object (initialized with a dataset)""" + + eval_start_time = time.time() + with torch.no_grad(): + aggr_metrics, per_batch = evaluator.evaluate(epoch_num=None, keep_all=True) + eval_runtime = time.time() - eval_start_time + print() + print_str = 'Evaluation Summary: ' + for k, v in aggr_metrics.items(): + if v is not None: + print_str += '{}: {:8f} | '.format(k, v) + logger.info(print_str) + logger.info("Evaluation runtime: {} hours, {} minutes, {} seconds\n".format(*utils.readable_time(eval_runtime))) + + return aggr_metrics, per_batch + + +def validate(val_evaluator, tensorboard_writer, config, best_metrics, best_value, epoch): + """Run an evaluation on the validation set while logging metrics, and handle outcome""" + + logger.info("Evaluating on validation set ...") + eval_start_time = time.time() + with torch.no_grad(): + aggr_metrics, per_batch = val_evaluator.evaluate(epoch, keep_all=True) + eval_runtime = time.time() - eval_start_time + logger.info("Validation runtime: {} hours, {} minutes, {} seconds\n".format(*utils.readable_time(eval_runtime))) + + global val_times + val_times["total_time"] += eval_runtime + val_times["count"] += 1 + avg_val_time = val_times["total_time"] / val_times["count"] + avg_val_batch_time = avg_val_time / len(val_evaluator.dataloader) + avg_val_sample_time = avg_val_time / len(val_evaluator.dataloader.dataset) + logger.info("Avg val. time: {} hours, {} minutes, {} seconds".format(*utils.readable_time(avg_val_time))) + logger.info("Avg batch val. 
time: {} seconds".format(avg_val_batch_time)) + logger.info("Avg sample val. time: {} seconds".format(avg_val_sample_time)) + + print() + print_str = 'Epoch {} Validation Summary: '.format(epoch) + for k, v in aggr_metrics.items(): + tensorboard_writer.add_scalar('{}/val'.format(k), v, epoch) + print_str += '{}: {:8f} | '.format(k, v) + logger.info(print_str) + + if config['key_metric'] in NEG_METRICS: + condition = (aggr_metrics[config['key_metric']] < best_value) + else: + condition = (aggr_metrics[config['key_metric']] > best_value) + if condition: + best_value = aggr_metrics[config['key_metric']] + utils.save_model(os.path.join(config['save_dir'], 'model_best.pth'), epoch, val_evaluator.model) + best_metrics = aggr_metrics.copy() + + pred_filepath = os.path.join(config['pred_dir'], 'best_predictions') + np.savez(pred_filepath, **per_batch) + + return aggr_metrics, best_metrics, best_value + + + +def check_progress(epoch): + + if epoch in [100, 140, 160, 220, 280, 340]: + return True + else: + return False + + +class BaseRunner(object): + + def __init__(self, model, dataloader, device, loss_module, optimizer=None, l2_reg=None, print_interval=10, console=True): + + self.model = model + self.dataloader = dataloader + self.device = device + self.optimizer = optimizer + self.loss_module = loss_module + self.l2_reg = l2_reg + self.print_interval = print_interval + self.printer = utils.Printer(console=console) + + self.epoch_metrics = OrderedDict() + + def train_epoch(self, epoch_num=None): + raise NotImplementedError('Please override in child class') + + def evaluate(self, epoch_num=None, keep_all=True): + raise NotImplementedError('Please override in child class') + + def print_callback(self, i_batch, metrics, prefix=''): + + total_batches = len(self.dataloader) + + template = "{:5.1f}% | batch: {:9d} of {:9d}" + content = [100 * (i_batch / total_batches), i_batch, total_batches] + for met_name, met_value in metrics.items(): + template += "\t|\t{}".format(met_name) + ": {:g}" + content.append(met_value) + + dyn_string = template.format(*content) + dyn_string = prefix + dyn_string + self.printer.print(dyn_string) + + +class UnsupervisedRunner(BaseRunner): + + def train_epoch(self, epoch_num=None): + + self.model = self.model.train() + + epoch_loss = 0 # total loss of epoch + total_active_elements = 0 # total unmasked elements in epoch + for i, batch in enumerate(self.dataloader): + + X, targets, target_masks, padding_masks, IDs = batch + targets = targets.to(self.device) + target_masks = target_masks.to(self.device) # 1s: mask and predict, 0s: unaffected input (ignore) + padding_masks = padding_masks.to(self.device) # 0s: ignore + + predictions = self.model(X.to(self.device), padding_masks) # (batch_size, padded_length, feat_dim) + + # Cascade noise masks (batch_size, padded_length, feat_dim) and padding masks (batch_size, padded_length) + target_masks = target_masks * padding_masks.unsqueeze(-1) + loss = self.loss_module(predictions, targets, target_masks) # (num_active,) individual loss (square error per element) for each active value in batch + batch_loss = torch.sum(loss) + mean_loss = batch_loss / len(loss) # mean loss (over active elements) used for optimization + + if self.l2_reg: + total_loss = mean_loss + self.l2_reg * l2_reg_loss(self.model) + else: + total_loss = mean_loss + + # Zero gradients, perform a backward pass, and update the weights. 
+ self.optimizer.zero_grad() + total_loss.backward() + + # torch.nn.utils.clip_grad_value_(self.model.parameters(), clip_value=1.0) + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=4.0) + self.optimizer.step() + + metrics = {"loss": mean_loss.item()} + if i % self.print_interval == 0: + ending = "" if epoch_num is None else 'Epoch {} '.format(epoch_num) + self.print_callback(i, metrics, prefix='Training ' + ending) + + with torch.no_grad(): + total_active_elements += len(loss) + epoch_loss += batch_loss.item() # add total loss of batch + + epoch_loss = epoch_loss / total_active_elements # average loss per element for whole epoch + self.epoch_metrics['epoch'] = epoch_num + self.epoch_metrics['loss'] = epoch_loss + return self.epoch_metrics + + def evaluate(self, epoch_num=None, keep_all=True): + + self.model = self.model.eval() + + epoch_loss = 0 # total loss of epoch + total_active_elements = 0 # total unmasked elements in epoch + + if keep_all: + per_batch = {'target_masks': [], 'targets': [], 'predictions': [], 'metrics': [], 'IDs': []} + for i, batch in enumerate(self.dataloader): + + X, targets, target_masks, padding_masks, IDs = batch + targets = targets.to(self.device) + target_masks = target_masks.to(self.device) # 1s: mask and predict, 0s: unaffected input (ignore) + padding_masks = padding_masks.to(self.device) # 0s: ignore + + # TODO: for debugging + # input_ok = utils.check_tensor(X, verbose=False, zero_thresh=1e-8, inf_thresh=1e4) + # if not input_ok: + # print("Input problem!") + # ipdb.set_trace() + # + # utils.check_model(self.model, verbose=False, stop_on_error=True) + + predictions = self.model(X.to(self.device), padding_masks) # (batch_size, padded_length, feat_dim) + + # Cascade noise masks (batch_size, padded_length, feat_dim) and padding masks (batch_size, padded_length) + target_masks = target_masks * padding_masks.unsqueeze(-1) + loss = self.loss_module(predictions, targets, target_masks) # (num_active,) individual loss (square error per element) for each active value in batch + batch_loss = torch.sum(loss).cpu().item() + try: + mean_loss = batch_loss / len(loss) # mean loss (over active elements) used for optimization the batch + except ZeroDivisionError as err: + print('loss length is zero: target_masks {} \n targets {} \n predictions {}'.format(target_masks, targets, predictions)) + mean_loss = np.nan + + if keep_all: + per_batch['target_masks'].append(target_masks.cpu().numpy()) + per_batch['targets'].append(targets.cpu().numpy()) + per_batch['predictions'].append(predictions.cpu().numpy()) + per_batch['metrics'].append([loss.cpu().numpy()]) + per_batch['IDs'].append(IDs) + + metrics = {"loss": mean_loss} + if i % self.print_interval == 0: + ending = "" if epoch_num is None else 'Epoch {} '.format(epoch_num) + self.print_callback(i, metrics, prefix='Evaluating ' + ending) + + total_active_elements += len(loss) + epoch_loss += batch_loss # add total loss of batch + + epoch_loss = epoch_loss / total_active_elements # average loss per element for whole epoch + self.epoch_metrics['epoch'] = epoch_num + self.epoch_metrics['loss'] = epoch_loss + + if keep_all: + return self.epoch_metrics, per_batch + else: + return self.epoch_metrics + + +class SupervisedRunner(BaseRunner): + + def __init__(self, *args, **kwargs): + + super(SupervisedRunner, self).__init__(*args, **kwargs) + + if isinstance(args[3], torch.nn.CrossEntropyLoss): + self.classification = True # True if classification, False if regression + self.analyzer = 
analysis.Analyzer(print_conf_mat=True) + else: + self.classification = False + + def train_epoch(self, epoch_num=None): + + self.model = self.model.train() + + epoch_loss = 0 # total loss of epoch + total_samples = 0 # total samples in epoch + + for i, batch in enumerate(self.dataloader): + + X, targets, padding_masks, IDs = batch + targets = targets.to(self.device) + padding_masks = padding_masks.to(self.device) # 0s: ignore + # regression: (batch_size, num_labels); classification: (batch_size, num_classes) of logits + predictions = self.model(X.to(self.device), padding_masks) + + loss = self.loss_module(predictions, targets) # (batch_size,) loss for each sample in the batch + batch_loss = torch.sum(loss) + mean_loss = batch_loss / len(loss) # mean loss (over samples) used for optimization + + if self.l2_reg: + total_loss = mean_loss + self.l2_reg * l2_reg_loss(self.model) + else: + total_loss = mean_loss + + # Zero gradients, perform a backward pass, and update the weights. + self.optimizer.zero_grad() + total_loss.backward() + + # torch.nn.utils.clip_grad_value_(self.model.parameters(), clip_value=1.0) + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=4.0) + self.optimizer.step() + + metrics = {"loss": mean_loss.item()} + if i % self.print_interval == 0: + ending = "" if epoch_num is None else 'Epoch {} '.format(epoch_num) + self.print_callback(i, metrics, prefix='Training ' + ending) + + with torch.no_grad(): + total_samples += len(loss) + epoch_loss += batch_loss.item() # add total loss of batch + + epoch_loss = epoch_loss / total_samples # average loss per sample for whole epoch + self.epoch_metrics['epoch'] = epoch_num + self.epoch_metrics['loss'] = epoch_loss + return self.epoch_metrics + + def evaluate(self, epoch_num=None, keep_all=True): + + self.model = self.model.eval() + + epoch_loss = 0 # total loss of epoch + total_samples = 0 # total samples in epoch + + per_batch = {'target_masks': [], 'targets': [], 'predictions': [], 'metrics': [], 'IDs': []} + for i, batch in enumerate(self.dataloader): + + X, targets, padding_masks, IDs = batch + targets = targets.to(self.device) + padding_masks = padding_masks.to(self.device) # 0s: ignore + # regression: (batch_size, num_labels); classification: (batch_size, num_classes) of logits + predictions = self.model(X.to(self.device), padding_masks) + + loss = self.loss_module(predictions, targets) # (batch_size,) loss for each sample in the batch + batch_loss = torch.sum(loss).cpu().item() + mean_loss = batch_loss / len(loss) # mean loss (over samples) + + per_batch['targets'].append(targets.cpu().numpy()) + per_batch['predictions'].append(predictions.cpu().numpy()) + per_batch['metrics'].append([loss.cpu().numpy()]) + per_batch['IDs'].append(IDs) + + metrics = {"loss": mean_loss} + if i % self.print_interval == 0: + ending = "" if epoch_num is None else 'Epoch {} '.format(epoch_num) + self.print_callback(i, metrics, prefix='Evaluating ' + ending) + + total_samples += len(loss) + epoch_loss += batch_loss # add total loss of batch + + epoch_loss = epoch_loss / total_samples # average loss per element for whole epoch + self.epoch_metrics['epoch'] = epoch_num + self.epoch_metrics['loss'] = epoch_loss + + if self.classification: + predictions = torch.from_numpy(np.concatenate(per_batch['predictions'], axis=0)) + probs = torch.nn.functional.softmax(predictions) # (total_samples, num_classes) est. prob. 
for each class and sample + predictions = torch.argmax(probs, dim=1).cpu().numpy() # (total_samples,) int class index for each sample + probs = probs.cpu().numpy() + targets = np.concatenate(per_batch['targets'], axis=0).flatten() + class_names = np.arange(probs.shape[1]) # TODO: temporary until I decide how to pass class names + metrics_dict = self.analyzer.analyze_classification(predictions, targets, class_names) + + self.epoch_metrics['accuracy'] = metrics_dict['total_accuracy'] # same as average recall over all classes + self.epoch_metrics['precision'] = metrics_dict['prec_avg'] # average precision over all classes + + if self.model.num_classes == 2: + false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(targets, probs[:, 1]) # 1D scores needed + self.epoch_metrics['AUROC'] = sklearn.metrics.auc(false_pos_rate, true_pos_rate) + + prec, rec, _ = sklearn.metrics.precision_recall_curve(targets, probs[:, 1]) + self.epoch_metrics['AUPRC'] = sklearn.metrics.auc(rec, prec) + + if keep_all: + return self.epoch_metrics, per_batch + else: + return self.epoch_metrics diff --git a/utils/analysis.py b/utils/analysis.py new file mode 100644 index 0000000..8699ee5 --- /dev/null +++ b/utils/analysis.py @@ -0,0 +1,478 @@ +""" +Collection of functions which enable the evaluation of a classifier's performance, +by showing confusion matrix, accuracy, recall, precision etc. +""" + +import numpy as np +import sys + +import matplotlib.pyplot as plt + +from sklearn import metrics +from tabulate import tabulate +import math +import logging +from datetime import datetime + + +def acc_top_k(predictions, y_true): + """Accuracy when allowing for correct class being in the top k predictions. + + Arguments: + predictions: (N_samples, k) array of top class indices (pre-sorted class indices based on score) per sample + y_true: N_samples 1D-array of ground truth labels (integer indices) + Returns: + length k 1D-array of accuracy when allowing for correct class being in top 1, 2, ... k predictions""" + + y_true = y_true[:, np.newaxis] + + # Create upper triangular matrix of ones, to be used in construction of V + building_blocks = np.zeros((predictions.shape[1], predictions.shape[1])) + building_blocks[np.triu_indices(predictions.shape[1])] = 1 + + # A matrix of the same shape as predictions. For each sample, the index corresponding + # to a correct prediction is 1, as well as all following indices. + # Example: y_true = [1,0], predictions = [[1 5 4],[2 0 3]]. 
Then: V = [[1 1 1],[0 1 1]] + V = np.zeros_like(predictions, dtype=int) # validity matrix + sample_ind, rank_ind = np.where(predictions == y_true) + + V[sample_ind, :] = building_blocks[rank_ind, :] + + return np.mean(V, axis=0) + + +def accuracy(y_pred, y_true, excluded_labels=None): + """A simple accuracy calculator, which can ignore labels specified in a list""" + + if excluded_labels is None: + return np.mean(y_pred == y_true) + else: + included = (y_pred != excluded_labels[0]) & (y_true != excluded_labels[0]) + # The following extra check (rather than initializing with an array of ones) + # is done because a single excluded label is the most common case + if len(excluded_labels) > 1: + for label in excluded_labels[1:]: + included &= (y_pred != label) & (y_true != label) + + return np.mean(y_pred[included] == y_true[included]) + + +def precision(y_true, y_pred, label): + """Returns precision for the specified class index""" + + predicted_in_C = (y_pred == label) + num_pred_in_C = np.sum(predicted_in_C) + if num_pred_in_C == 0: + return 0 + return np.sum(y_true[predicted_in_C] == label) / num_pred_in_C + + +def recall(y_true, y_pred, label): + """Returns recall for the specified class index""" + + truly_in_C = (y_true == label) + num_truly_in_C = np.sum(truly_in_C) + if num_truly_in_C == 0: + return 0 # or NaN? + return np.sum(y_pred[truly_in_C] == label) / num_truly_in_C + + +def limiter(metric_functions, y_true, y_pred, y_scores, score_thr, label): + """Wraps a list of metric functions, i.e precison or recall, by ingoring predictions under the + specified threshold for a specific class. + """ + + ltd_pred = np.copy(y_pred) + ltd_pred[(ltd_pred == label) & (y_scores < score_thr)] = -1 + + output = [func(y_true, ltd_pred, label) for func in metric_functions] + + return output + + +def prec_rec_parametrized_by_thr(y_true, y_pred, y_scores, label, Npoints, min_score=None, max_score=None): + """Returns an array showing for a specified class of interest, how precision and recall change as a function of + the score threshold (parameter). + + Input: + y_true: 1D array of true labels (class indices) + y_pred: 1D array of predicted labels (class indices) + y_scores: 1D array of scores corresponding to predictions in y_pred + label: class label of interest + Npoints: number of score threshold points. Defines "resolution" of the parameter (score threshold) + min_score, max_score: if specified, they impose lower and upper bound limits for the parameter (score thr.) + Output: + prec_rec: ndarray of shape (Npoints, 2), containing a precision (column 0) and recall (column 1) value for each + score threshold value + """ + + if (min_score is None) or (max_score is None): + predicted_in_C = (y_pred == label) + min_score = 0.99 * np.amin(y_scores[predicted_in_C]) # guarantees that all predictions are kept + max_score = 1.01 * np.amax(y_scores[predicted_in_C]) # guarantees that no prediction is kept + + grid = np.linspace(min_score, max_score, Npoints) + + measure = lambda x: limiter([precision, recall], y_true, y_pred, y_scores, x, label) + + return np.array(map(measure, grid)), grid + + +def plot_prec_vs_rec(score_grid, rec, prec, prec_requirement=None, thr_opt=None, title=None, show=True, save_as=None): + """Plots a figure depicting precision and recall as a function of the score threshold. 
+ Optionally also depicts an imposed precision requirement and a chosen score threshold value.""" + + if not (thr_opt is None): + thr_opt = thr_opt if not (math.isinf(thr_opt)) else None + + plt.figure() + if title: + plt.suptitle(title) + + # Recall and Precision vs. Score Threshold + plt.subplot(211) + l_rec, = plt.plot(score_grid, rec, '.-') + + plt.hold(True) + l_prec, = plt.plot(score_grid, prec, 'g.-') + plt.ylim((0, 1.01)) + plt.xlabel('score threshold') + + legend_lines = [l_rec, l_prec] + legend_labels = ['recall', 'precision'] + + if prec_requirement: + l_prec_req = plt.axhline(prec_requirement, color='r', linestyle='--') + legend_lines.append(l_prec_req) + legend_labels.append('prec. req.') + + if not (thr_opt is None): + l_score_thr = plt.axvline(thr_opt, color='r') + legend_lines.append(l_score_thr) + legend_labels.append('opt. thr.') + + plt.legend(legend_lines, legend_labels, loc='lower right', fontsize=10) + + # Recall vs. Precision + plt.subplot(212) + plt.plot(prec, rec, '.-') + + plt.ylim((0, 1.01)) + plt.xlim((0, 1.01)) + plt.ylabel('recall') + plt.xlabel('precision') + + if prec_requirement: + l_prec_req = plt.axvline(prec_requirement, color='r', linestyle='--') + plt.legend([l_prec_req], ['precision req.'], loc='lower left', fontsize=10) + + if save_as: + plt.savefig(save_as, bbox_inches='tight', format='pdf') + + if show: + plt.tight_layout() + plt.show(block=False) + + +def plot_confusion_matrix(ConfMat, label_strings=None, title='Confusion matrix', cmap=plt.cm.get_cmap('Blues')): + """Plot confusion matrix in a separate window""" + plt.imshow(ConfMat, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + if label_strings: + tick_marks = np.arange(len(label_strings)) + plt.xticks(tick_marks, label_strings, rotation=90) + plt.yticks(tick_marks, label_strings) + plt.tight_layout() + plt.ylabel('True label') + plt.xlabel('Predicted label') + + +def print_confusion_matrix(ConfMat, label_strings=None, title='Confusion matrix'): + """Print confusion matrix as text to terminal""" + + if label_strings is None: + label_strings = ConfMat.shape[0] * [''] + + print(title) + print(len(title) * '-') + # Make printable matrix: + print_mat = [] + for i, row in enumerate(ConfMat): + print_mat.append([label_strings[i]] + list(row)) + print(tabulate(print_mat, headers=['True\Pred'] + label_strings, tablefmt='orgtbl')) + + +class Analyzer(object): + + def __init__(self, maxcharlength=35, plot=False, print_conf_mat=False, output_filepath=None): + + self.maxcharlength = maxcharlength + self.plot = plot + self.print_conf_mat = print_conf_mat + + # create logger + self.logID = str( + datetime.now()) # this is to enable individual logging configuration between different instances + self.logger = logging.getLogger(self.logID) + self.logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(message)s') + + # create console handler + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.INFO) + ch.setFormatter(formatter) + self.logger.addHandler(ch) + + if output_filepath: + # create file handler + fh = logging.FileHandler(output_filepath) + fh.setLevel(logging.INFO) + fh.setFormatter(formatter) + self.logger.addHandler(fh) + + def show_acc_top_k_improvement(self, y_pred, y_true, k=5, inp='scores'): + """ + Show how accuracy improves when considering the event of the correct label being among the top k predictions as a successful prediction + Arguments: + k: integer k mentioned above + inp: string, one of 'scores' or 'indices', defining assumptions for 
`y_pred`, see below + y_pred: If inp is 'indices', then this is a (N_samples, k) array of top class indices (pre-sorted class indices based on score) per sample + If inp is 'scores', then this is assummed to be a (N_samples, C) array of class scores per sample, where C is the number of classes + y_true: (N_samples,) 1D numpy array of ground truth labels (integer indices) + """ + + print('How accuracy improves when allowing correct result being in the top 1, 2, ..., k predictions:\n') + + if inp == 'scores': + predictions = np.argsort(y_pred, axis=1)[:, ::-1] # sort in descending order + else: + predictions = y_pred + + predictions = predictions[:, :min(k, predictions.shape[1])] # take top k + + accuracy_per_rank = acc_top_k(predictions, y_true) + + row1 = ['k'] + range(1, len(accuracy_per_rank) + 1) + row2 = ['Accuracy'] + list(accuracy_per_rank) + print(tabulate([row1, row2], tablefmt='orgtbl')) + + if self.plot: + from matplotlib.ticker import MaxNLocator + + ax = plt.figure().gca() + plt.plot(np.arange(1, k + 1, dtype=int), accuracy_per_rank, '.-') + ax.xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.xlabel('Number of allowed predictions (k)') + plt.ylabel('Cumulative accuracy\n(prob. of correct result being in top k pred.)') + plt.title('Cumulative Accuracy vs Number of allowed predictions') + + plt.show(block=False) + + return accuracy_per_rank + + def generate_classification_report(self, digits=3, number_of_thieves=2, maxcharlength=35): + """ + Returns a string of a report for given metric arrays (array length equals the number of classes). + Called internally by `analyze_classification`. + digits: number of digits after . for displaying results + number_of_thieves: number of biggest thieves to report + maxcharlength: max. number of characters to use when displaying thief names + """ + + relative_freq = self.support / np.sum(self.support) # relative frequencies of each class in the true lables + sorted_class_indices = np.argsort(relative_freq)[ + ::-1] # sort by "importance" of classes (i.e. occurance frequency) + + last_line_heading = 'avg / total' + + width = max(len(cn) for cn in self.existing_class_names) + width = max(width, len(last_line_heading), digits) + + headers = ["precision", "recall", "f1-score", "rel. freq.", "abs. freq.", "biggest thieves"] + fmt = '%% %ds' % width # first column: class name + fmt += ' ' + fmt += ' '.join(['% 10s' for _ in headers[:-1]]) + fmt += '|\t % 5s' + fmt += '\n' + + headers = [""] + headers + report = fmt % tuple(headers) + report += '\n' + + for i in sorted_class_indices: + values = [self.existing_class_names[i]] + for v in (self.precision[i], self.recall[i], self.f1[i], + relative_freq[i]): # v is NOT a tuple, just goes through this list 1 el. at a time + values += ["{0:0.{1}f}".format(v, digits)] + values += ["{}".format(self.support[i])] + thieves = np.argsort(self.ConfMatrix_normalized_row[i, :])[::-1][ + :number_of_thieves + 1] # other class indices "stealing" from class. 
May still contain self + thieves = thieves[thieves != i] # exclude self at this point + steal_ratio = self.ConfMatrix_normalized_row[i, thieves] + thieves_names = [ + self.existing_class_names[thief][:min(maxcharlength, len(self.existing_class_names[thief]))] for thief + in thieves] # a little inefficient but inconsequential + string_about_stealing = "" + for j in range(len(thieves)): + string_about_stealing += "{0}: {1:.3f},\t".format(thieves_names[j], steal_ratio[j]) + values += [string_about_stealing] + + report += fmt % tuple(values) + + report += '\n' + 100 * '-' + '\n' + + # compute averages/sums + values = [last_line_heading] + for v in (np.average(self.precision, weights=relative_freq), + np.average(self.recall, weights=relative_freq), + np.average(self.f1, weights=relative_freq)): + values += ["{0:0.{1}f}".format(v, digits)] + values += ['{0}'.format(np.sum(relative_freq))] + values += ['{0}'.format(np.sum(self.support))] + values += [''] + + # make last ("Total") line for report + report += fmt % tuple(values) + + return report + + def get_avg_prec_recall(self, ConfMatrix, existing_class_names, excluded_classes=None): + """Get average recall and precision, using class frequencies as weights, optionally excluding + specified classes""" + + class2ind = dict(zip(existing_class_names, range(len(existing_class_names)))) + included_c = np.full(len(existing_class_names), 1, dtype=bool) + + if not (excluded_classes is None): + excl_ind = [class2ind[excl_class] for excl_class in excluded_classes] + included_c[excl_ind] = False + + pred_per_class = np.sum(ConfMatrix, axis=0) + nonzero_pred = (pred_per_class > 0) + + included = included_c & nonzero_pred + support = np.sum(ConfMatrix, axis=1) + weights = support[included] / np.sum(support[included]) + + prec = np.diag(ConfMatrix[included, :][:, included]) / pred_per_class[included] + prec_avg = np.dot(weights, prec) + + # rec = np.diag(ConfMatrix[included_c,:][:,included_c])/support[included_c] + rec_avg = np.trace(ConfMatrix[included_c, :][:, included_c]) / np.sum(support[included_c]) + + return prec_avg, rec_avg + + def prec_rec_histogram(self, precision, recall, binedges=None): + """Make a histogram with the distribution of classes with respect to precision and recall + """ + + if binedges is None: + binedges = np.concatenate((np.arange(0, 0.6, 0.2), np.arange(0.6, 1.01, 0.1)), axis=0) + binedges = np.append(binedges, binedges[-1] + 0.1) # add 1 extra bin at the end for >= 1 + + hist_precision, binedges = np.histogram(precision, binedges) + hist_recall, binedges = np.histogram(recall, binedges) + + print("\n\nDistribution of classes with respect to PRECISION: ") + for b in range(len(binedges) - 1): + print("[{:.1f}, {:.1f}): {}".format(binedges[b], binedges[b + 1], hist_precision[b])) + + print("\n\nDistribution of classes with respect to RECALL: ") + for b in range(len(binedges) - 1): + print("[{:.1f}, {:.1f}): {}".format(binedges[b], binedges[b + 1], hist_recall[b])) + + if self.plot: + plt.figure() + plt.subplot(121) + widths = np.diff(binedges) + plt.bar(binedges[:-1], hist_precision, width=widths, align='edge') + plt.xlim(0, 1) + ax = plt.gca() + ax.set_xticks(binedges) + plt.xlabel('Precision') + plt.ylabel('Number of classes') + plt.title("Distribution of classes with respect to precision") + + plt.subplot(122) + widths = np.diff(binedges) + plt.bar(binedges[:-1], hist_recall, width=widths, align='edge') + plt.xlim(0, 1) + ax = plt.gca() + ax.set_xticks(binedges) + plt.xlabel('Recall') + plt.ylabel('Number of classes') + 
plt.title("Distribution of classes with respect to recall") + + plt.show(block=False) + + def analyze_classification(self, y_pred, y_true, class_names, excluded_classes=None): + """ + For an array of label predictions and the respective true labels, shows confusion matrix, accuracy, recall, precision etc: + Input: + y_pred: 1D array of predicted labels (class indices) + y_true: 1D array of true labels (class indices) + class_names: 1D array or list of class names in the order of class indices. + Could also be integers [0, 1, ..., num_classes-1]. + excluded_classes: list of classes to be excluded from average precision, recall calculation (e.g. OTHER) + """ + + # Trim class_names to include only classes existing in y_pred OR y_true + in_pred_labels = set(list(y_pred)) + in_true_labels = set(list(y_true)) + + self.existing_class_ind = sorted(list(in_pred_labels | in_true_labels)) + class_strings = [str(name) for name in class_names] # needed in case `class_names` elements are not strings + self.existing_class_names = [class_strings[ind][:min(self.maxcharlength, len(class_strings[ind]))] for ind in + self.existing_class_ind] # a little inefficient but inconsequential + + # Confusion matrix + ConfMatrix = metrics.confusion_matrix(y_true, y_pred) + + if self.print_conf_mat: + print_confusion_matrix(ConfMatrix, label_strings=self.existing_class_names, title='Confusion matrix') + print('\n') + if self.plot: + plt.figure() + plot_confusion_matrix(ConfMatrix, self.existing_class_names) + + # Normalize the confusion matrix by row (i.e by the number of samples in each class) + self.ConfMatrix_normalized_row = ConfMatrix.astype('float') / ConfMatrix.sum(axis=1)[:, np.newaxis] + + if self.print_conf_mat: + print_confusion_matrix(self.ConfMatrix_normalized_row, label_strings=self.existing_class_names, + title='Confusion matrix normalized by row') + print('\n') + if self.plot: + plt.figure() + plot_confusion_matrix(self.ConfMatrix_normalized_row, label_strings=self.existing_class_names, + title='Confusion matrix normalized by row') + + plt.show(block=False) + + # Analyze results + self.total_accuracy = np.trace(ConfMatrix) / len(y_true) + print('Overall accuracy: {:.3f}\n'.format(self.total_accuracy)) + + # returns metrics for each class, in the same order as existing_class_names + self.precision, self.recall, self.f1, self.support = metrics.precision_recall_fscore_support(y_true, y_pred, + labels=self.existing_class_ind) + + # Print report + print(self.generate_classification_report()) + + # Calculate average precision and recall + self.prec_avg, self.rec_avg = self.get_avg_prec_recall(ConfMatrix, self.existing_class_names, excluded_classes) + if excluded_classes: + print( + "\nAverage PRECISION: {:.2f}\n(using class frequencies as weights, excluding classes with no predictions and predictions in '{}')".format( + self.prec_avg, ', '.join(excluded_classes))) + print( + "\nAverage RECALL (= ACCURACY): {:.2f}\n(using class frequencies as weights, excluding classes in '{}')".format( + self.rec_avg, ', '.join(excluded_classes))) + + # Make a histogram with the distribution of classes with respect to precision and recall + self.prec_rec_histogram(self.precision, self.recall) + + return {"total_accuracy": self.total_accuracy, "precision": self.precision, "recall": self.recall, + "f1": self.f1, "support": self.support, "prec_avg": self.prec_avg, "rec_avg": self.rec_avg} \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..1d4f5f4 --- /dev/null +++ 
b/utils/utils.py @@ -0,0 +1,343 @@ +import json +import os +import sys +import builtins +import functools +import time +import ipdb +from copy import deepcopy + +import numpy as np +import torch +import xlrd +import xlwt +from xlutils.copy import copy + +import logging +logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) + + +def timer(func): + """Print the runtime of the decorated function""" + @functools.wraps(func) + def wrapper_timer(*args, **kwargs): + start_time = time.perf_counter() # 1 + value = func(*args, **kwargs) + end_time = time.perf_counter() # 2 + run_time = end_time - start_time # 3 + print(f"Finished {func.__name__!r} in {run_time} secs") + return value + return wrapper_timer + + +def save_model(path, epoch, model, optimizer=None): + if isinstance(model, torch.nn.DataParallel): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + data = {'epoch': epoch, + 'state_dict': state_dict} + if not (optimizer is None): + data['optimizer'] = optimizer.state_dict() + torch.save(data, path) + + +def load_model(model, model_path, optimizer=None, resume=False, change_output=False, + lr=None, lr_step=None, lr_factor=None): + start_epoch = 0 + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + state_dict = deepcopy(checkpoint['state_dict']) + if change_output: + for key, val in checkpoint['state_dict'].items(): + if key.startswith('output_layer'): + state_dict.pop(key) + model.load_state_dict(state_dict, strict=False) + print('Loaded model from {}. Epoch: {}'.format(model_path, checkpoint['epoch'])) + + # resume optimizer parameters + if optimizer is not None and resume: + if 'optimizer' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + start_lr = lr + for i in range(len(lr_step)): + if start_epoch >= lr_step[i]: + start_lr *= lr_factor[i] + for param_group in optimizer.param_groups: + param_group['lr'] = start_lr + print('Resumed optimizer with start lr', start_lr) + else: + print('No optimizer parameters in checkpoint.') + if optimizer is not None: + return model, optimizer, start_epoch + else: + return model + + +def load_config(config_filepath): + """ + Using a json file with the master configuration (config file for each part of the pipeline), + return a dictionary containing the entire configuration settings in a hierarchical fashion. 
+ """ + + with open(config_filepath) as cnfg: + config = json.load(cnfg) + + return config + + +def create_dirs(dirs): + """ + Input: + dirs: a list of directories to create, in case these directories are not found + Returns: + exit_code: 0 if success, -1 if failure + """ + try: + for dir_ in dirs: + if not os.path.exists(dir_): + os.makedirs(dir_) + return 0 + except Exception as err: + print("Creating directories error: {0}".format(err)) + exit(-1) + + +def export_performance_metrics(filepath, metrics_table, header, book=None, sheet_name="metrics"): + """Exports performance metrics on the validation set for all epochs to an excel file""" + + if book is None: + book = xlwt.Workbook() # new excel work book + + book = write_table_to_sheet([header] + metrics_table, book, sheet_name=sheet_name) + + book.save(filepath) + logger.info("Exported per epoch performance metrics in '{}'".format(filepath)) + + return book + + +def write_row(sheet, row_ind, data_list): + """Write a list to row_ind row of an excel sheet""" + + row = sheet.row(row_ind) + for col_ind, col_value in enumerate(data_list): + row.write(col_ind, col_value) + return + + +def write_table_to_sheet(table, work_book, sheet_name=None): + """Writes a table implemented as a list of lists to an excel sheet in the given work book object""" + + sheet = work_book.add_sheet(sheet_name) + + for row_ind, row_list in enumerate(table): + write_row(sheet, row_ind, row_list) + + return work_book + + +def export_record(filepath, values): + """Adds a list of values as a bottom row of a table in a given excel file""" + + read_book = xlrd.open_workbook(filepath, formatting_info=True) + read_sheet = read_book.sheet_by_index(0) + last_row = read_sheet.nrows + + work_book = copy(read_book) + sheet = work_book.get_sheet(0) + write_row(sheet, last_row, values) + work_book.save(filepath) + + +def register_record(filepath, timestamp, experiment_name, best_metrics, final_metrics=None, comment=''): + """ + Adds the best and final metrics of a given experiment as a record in an excel sheet with other experiment records. + Creates excel sheet if it doesn't exist. + Args: + filepath: path of excel file keeping records + timestamp: string + experiment_name: string + best_metrics: dict of metrics at best epoch {metric_name: metric_value}. Includes "epoch" as first key + final_metrics: dict of metrics at final epoch {metric_name: metric_value}. Includes "epoch" as first key + comment: optional description + """ + metrics_names, metrics_values = zip(*best_metrics.items()) + row_values = [timestamp, experiment_name, comment] + list(metrics_values) + if final_metrics is not None: + final_metrics_names, final_metrics_values = zip(*final_metrics.items()) + row_values += list(final_metrics_values) + + if not os.path.exists(filepath): # Create a records file for the first time + logger.warning("Records file '{}' does not exist! 
Creating new file ...".format(filepath)) + directory = os.path.dirname(filepath) + if len(directory) and not os.path.exists(directory): + os.makedirs(directory) + header = ["Timestamp", "Name", "Comment"] + ["Best " + m for m in metrics_names] + if final_metrics is not None: + header += ["Final " + m for m in final_metrics_names] + book = xlwt.Workbook() # excel work book + book = write_table_to_sheet([header, row_values], book, sheet_name="records") + book.save(filepath) + else: + try: + export_record(filepath, row_values) + except Exception as x: + alt_path = os.path.join(os.path.dirname(filepath), "record_" + experiment_name) + logger.error("Failed saving in: '{}'! Will save here instead: {}".format(filepath, alt_path)) + export_record(alt_path, row_values) + filepath = alt_path + + logger.info("Exported performance record to '{}'".format(filepath)) + + +class Printer(object): + """Class for printing output by refreshing the same line in the console, e.g. for indicating progress of a process""" + + def __init__(self, console=True): + + if console: + self.print = self.dyn_print + else: + self.print = builtins.print + + @staticmethod + def dyn_print(data): + """Print things to stdout on one line, refreshing it dynamically""" + sys.stdout.write("\r\x1b[K" + data.__str__()) + sys.stdout.flush() + + +def readable_time(time_difference): + """Convert a float measuring time difference in seconds into a tuple of (hours, minutes, seconds)""" + + hours = time_difference // 3600 + minutes = (time_difference // 60) % 60 + seconds = time_difference % 60 + + return hours, minutes, seconds + + +# def check_model1(model, verbose=False, stop_on_error=False): +# status_ok = True +# for name, param in model.named_parameters(): +# nan_grads = torch.isnan(param.grad) +# nan_params = torch.isnan(param) +# if nan_grads.any() or nan_params.any(): +# status_ok = False +# print("Param {}: {}/{} nan".format(name, torch.sum(nan_params), param.numel())) +# if verbose: +# print(param) +# print("Grad {}: {}/{} nan".format(name, torch.sum(nan_grads), param.grad.numel())) +# if verbose: +# print(param.grad) +# if stop_on_error: +# ipdb.set_trace() +# if status_ok: +# print("Model Check: OK") +# else: +# print("Model Check: PROBLEM") + + +def check_model(model, verbose=False, zero_thresh=1e-8, inf_thresh=1e6, stop_on_error=False): + status_ok = True + for name, param in model.named_parameters(): + param_ok = check_tensor(param, verbose=verbose, zero_thresh=zero_thresh, inf_thresh=inf_thresh) + if not param_ok: + status_ok = False + print("Parameter '{}' PROBLEM".format(name)) + grad_ok = True + if param.grad is not None: + grad_ok = check_tensor(param.grad, verbose=verbose, zero_thresh=zero_thresh, inf_thresh=inf_thresh) + if not grad_ok: + status_ok = False + print("Gradient of parameter '{}' PROBLEM".format(name)) + if stop_on_error and not (param_ok and grad_ok): + ipdb.set_trace() + + if status_ok: + print("Model Check: OK") + else: + print("Model Check: PROBLEM") + + +def check_tensor(X, verbose=True, zero_thresh=1e-8, inf_thresh=1e6): + + is_nan = torch.isnan(X) + if is_nan.any(): + print("{}/{} nan".format(torch.sum(is_nan), X.numel())) + return False + + num_small = torch.sum(torch.abs(X) < zero_thresh) + num_large = torch.sum(torch.abs(X) > inf_thresh) + + if verbose: + print("Shape: {}, {} elements".format(X.shape, X.numel())) + print("No 'nan' values") + print("Min: {}".format(torch.min(X))) + print("Median: {}".format(torch.median(X))) + print("Max: {}".format(torch.max(X))) + + print("Histogram of values:") 
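+        # NOTE: the .numpy() call below assumes X lives on the CPU; torch raises an error when
+        # converting a CUDA tensor directly, so callers may need to pass X.cpu().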
+ values = X.view(-1).detach().numpy() + hist, binedges = np.histogram(values, bins=20) + for b in range(len(binedges) - 1): + print("[{}, {}): {}".format(binedges[b], binedges[b + 1], hist[b])) + + print("{}/{} abs. values < {}".format(num_small, X.numel(), zero_thresh)) + print("{}/{} abs. values > {}".format(num_large, X.numel(), inf_thresh)) + + if num_large: + print("{}/{} abs. values > {}".format(num_large, X.numel(), inf_thresh)) + return False + + return True + + +def count_parameters(model, trainable=False): + if trainable: + return sum(p.numel() for p in model.parameters() if p.requires_grad) + else: + return sum(p.numel() for p in model.parameters()) + + +def recursively_hook(model, hook_fn): + for name, module in model.named_children(): #model._modules.items(): + if len(list(module.children())) > 0: # if not leaf node + for submodule in module.children(): + recursively_hook(submodule, hook_fn) + else: + module.register_forward_hook(hook_fn) + + +def compute_loss(net: torch.nn.Module, + dataloader: torch.utils.data.DataLoader, + loss_function: torch.nn.Module, + device: torch.device = 'cpu') -> torch.Tensor: + """Compute the loss of a network on a given dataset. + + Does not compute gradient. + + Parameters + ---------- + net: + Network to evaluate. + dataloader: + Iterator on the dataset. + loss_function: + Loss function to compute. + device: + Torch device, or :py:class:`str`. + + Returns + ------- + Loss as a tensor with no grad. + """ + running_loss = 0 + with torch.no_grad(): + for x, y in dataloader: + netout = net(x.to(device)).cpu() + running_loss += loss_function(y, netout) + + return running_loss / len(dataloader)
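+
+
+if __name__ == '__main__':
+    # Minimal usage sketch for `compute_loss` (illustrative only): the toy network,
+    # dataset and loss below are hypothetical placeholders, not objects defined
+    # elsewhere in this repository.
+    from torch.utils.data import DataLoader, TensorDataset
+
+    toy_x = torch.randn(32, 10)
+    toy_y = torch.randn(32, 1)
+    toy_loader = DataLoader(TensorDataset(toy_x, toy_y), batch_size=8)
+    toy_net = torch.nn.Linear(10, 1)
+    # compute_loss passes (target, prediction) to the loss function, in that order
+    print(compute_loss(toy_net, toy_loader, torch.nn.MSELoss(), device='cpu'))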