From 88a55392be10c02d1741e2a975e96f9eba1db649 Mon Sep 17 00:00:00 2001 From: laramos Date: Tue, 16 Jun 2020 12:23:15 +0200 Subject: [PATCH 1/2] shap features --- .gitignore | 6 +- Classifiers/XGB.py | 421 +++++++++------- Classifiers/gradboost.py | 50 -- Classifiers/logreg.py | 298 ++++++----- covid19_ICU_admission.py | 200 ++++++-- covid19_ICU_util.py | 214 ++++++-- .../ALL_castor2excel_variables.py | 166 +++++-- .../ALL_completeness_category.py | 76 +++ .../ALL_convert_to_age_in_years.py | 2 +- data_query_scripts/survival_death.py | 199 ++++++-- data_query_scripts/survival_death_km.py | 350 +++++++++++++ error_replacements.py | 4 +- generate_figures.py | 469 ++++++++++++++++++ get_feature_set.py | 6 +- lr_coefs.npy | Bin 0 -> 1928 bytes requirements.txt | 4 +- results/loho_pm_class_dist_per_hosp.txt | 10 + results/loho_pm_n2283_y516.0_coefs.txt | 45 ++ unit_lookup.py | 8 +- update_slack/covid19_update_slack_all.py | 101 ++-- xgb_coefs.npy | Bin 0 -> 1028 bytes 21 files changed, 2047 insertions(+), 582 deletions(-) delete mode 100644 Classifiers/gradboost.py create mode 100644 data_query_scripts/ALL_completeness_category.py create mode 100644 data_query_scripts/survival_death_km.py create mode 100644 generate_figures.py create mode 100644 lr_coefs.npy create mode 100644 results/loho_pm_class_dist_per_hosp.txt create mode 100644 results/loho_pm_n2283_y516.0_coefs.txt create mode 100644 xgb_coefs.npy diff --git a/.gitignore b/.gitignore index eee52b3..d8a6989 100644 --- a/.gitignore +++ b/.gitignore @@ -146,4 +146,8 @@ covid19_variables.xlsx *.xlsx *.xls *.png -.vscode/ \ No newline at end of file +.vscode/ +*.zip +*.gif +*.tiff +*.pxm diff --git a/Classifiers/XGB.py b/Classifiers/XGB.py index 370badc..cca853d 100644 --- a/Classifiers/XGB.py +++ b/Classifiers/XGB.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ Created on Thu Apr 30 13:25:50 2020 @@ -40,7 +39,7 @@ from xgboost import XGBClassifier from sklearn.feature_selection import SelectKBest from 
sklearn.preprocessing import PolynomialFeatures -#import shap +import shap from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import StratifiedShuffleSplit from sklearn.feature_selection import SelectFromModel @@ -53,6 +52,7 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import RobustScaler from sklearn.linear_model import LogisticRegression from sklearn.model_selection import cross_val_predict from sklearn.metrics import auc as auc_calc @@ -83,22 +83,30 @@ def __init__(self): ''' self.goal = None + self.data_struct = None + self.model_args = { + 'imputer': 'iterative', # Simple, iterative, forest 'add_missing_indicator': False, + 'apply_polynomials': False, + 'apply_feature_selection': False, + 'n_features': 10, 'apply_pca': False, 'pca_n_components': 3, - - 'select_features': False, - 'n_best_features': 10, - 'plot_feature_graph': True, - 'grid_search': True } + self.grid = { + 'XGB__learning_rate': ([0.1, 0.01, 0.001]), + 'XGB__gamma': ([0.1, 0.01, 0.001]), + 'XGB__n_estimators': ([100, 200, 300, 500, 700]), + 'XGB__subsample': ([0.5, 0.7, 0.9]), + 'XGB__colsample_bytree': ([0.5, 0.6, 0.7, 0.8]), + 'XGB__max_depth': ([2, 4, 6,8])} + self.score_args = { - 'plot_n_rocaucs': 5, + 'plot_n_rocaucs': 0, 'n_thresholds': 50, - } self.evaluation_args = { @@ -111,29 +119,19 @@ def __init__(self): self.intercepts = [] self.n_best_features = [] + self.trained_classifiers = [] + self.learn_size = [] - self.grid = { - 'XGB__learning_rate': ([0.1, 0.01, 0.001]), - 'XGB__gamma': ([0.1, 0.01, 0.001]), - 'XGB__n_estimators': ([100, 200, 300, 500, 700]), - 'XGB__subsample': ([0.5, 0.7, 0.9]), - 'XGB__colsample_bytree': ([0.5, 0.6, 0.7, 0.8]), - 'XGB__max_depth': ([2, 4, 6,8])} - - #self.grid = { - #'XGB__n_estimators': ([200,400,500,600,800,1000,1200,1400]), - #'XGB__max_features': (['auto', 'sqrt', 'log2']), # precomputed,'poly', 
'sigmoid' - #'XGB__max_depth': ([10,20,30,40, 50, 60, 70, 80, 90, 100, None]), - #'XGB__criterion': (['gini', 'entropy']), - #'XGB__min_samples_split': [2,4,6,8], - #'XGB__min_samples_leaf': [2,4,6,8,10]} - self.save_path = '' self.fig_size = (1280, 960) self.fig_dpi = 600 + self.random_state = 0 - + self.save_prediction = True + self.hospital = pd.Series() + self.days_until_outcome = pd.Series() + def define_imputer(self,impute_type): '''Initialize the imputer to be used for every iteration. @@ -146,18 +144,16 @@ def define_imputer(self,impute_type): if impute_type=='simple': self.imputer = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=self.model_args['add_missing_indicator']) - else: - if impute_type=='iterative': - self.imputer = IterativeImputer(missing_values=np.nan, initial_strategy='median', + elif impute_type=='iterative': + self.imputer = IterativeImputer(missing_values=np.nan, initial_strategy='median', add_indicator=self.model_args['add_missing_indicator']) - else: - if impute_type=='forest': - self.imputer = MissForest(random_state=self.random_state,n_jobs=-2) + elif impute_type=='forest': + self.imputer = MissForest(random_state=self.random_state,n_jobs=-2) def get_pipeline(self): - self.define_imputer('iterative') + self.define_imputer(self.model_args['imputer']) steps = [('imputer', self.imputer), - ('scaler', MinMaxScaler())] + ('scaler', RobustScaler())] if self.model_args['apply_polynomials']: steps += [('polynomials', PolynomialFeatures(interaction_only=True))] @@ -176,11 +172,7 @@ def get_pipeline(self): for key in keys: del self.grid[key] - steps += [('LR', LogisticRegression(solver='saga', - penalty='elasticnet', #class_weight='balanced', - l1_ratio=.5, - max_iter=200, - random_state=self.random_state))] + steps += [('XGB', XGBClassifier(random_state=self.random_state))] return Pipeline(steps) def train(self, datasets): @@ -215,43 +207,63 @@ def train(self, datasets): self.learn_size += [{'tr_x': train_x.shape, 'tr_y': 
train_y.shape, 'te_x': test_x.shape, 'te_y': test_y.shape}] - n_best_features = self.model_args['n_best_features'] - plot_feature_graph = self.model_args['plot_feature_graph'] - train_x = self.impute_missing_values(train_x) test_x = self.impute_missing_values(test_x) + # Define pipeline self.pipeline = self.get_pipeline() # Model and feature selection # TODO ideally also the feature selection would take place within a CV pipeline - if self.model_args['select_features']: - if not any(self.n_best_features): - self.importances = self.feature_contribution(clf, train_x, train_y, - plot_graph=plot_feature_graph) - self.n_best_features = np.argsort(self.importances)[-n_best_features:] - print('Select {} best features:\n{}'.format(self.model_args['n_best_features'], - list(train_x.columns[ - self.n_best_features]))) - train_x = train_x.iloc[:, self.n_best_features] - test_x = test_x.iloc[:, self.n_best_features] if self.model_args['grid_search']: # print("Train classfier using grid search for best parameters.") cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=self.random_state) - grid = RandomizedSearchCV(self.pipeline, param_grid=self.grid, cv=cv, - scoring='roc_auc', n_jobs=-2) + grid = RandomizedSearchCV(self.pipeline, param_distributions=self.grid, cv=cv, + scoring='roc_auc', n_jobs=-2, n_iter=50) grid.fit(train_x, train_y) clf = grid.best_estimator_ + self.trained_classifiers += [clf] # print("Best estimator: ", clf) else: # Train classifier without optimization. 
clf = self.pipeline clf.fit(train_x, train_y) + self.coefs.append(clf['XGB'].feature_importances_) + test_y_hat = clf.predict_proba(test_x) # Predict - self.coefs.append(clf['XGB'].feature_importances_) + if 'feature_selection' in clf.named_steps: + columns = train_x.columns[np.argsort(clf.named_steps\ + .feature_selection\ + .pvalues_)][0:self.model_args['n_features']].to_list() + self.n_best_features += [columns] + print(columns) + else: + columns = train_x.columns + + idx_train = train_x.index + idx_test = test_x.index + + if self.model_args['add_missing_indicator']: + missing_cols = columns.to_list()\ + + ['{}_nan'.format(c) + for c in train_x.loc[:, train_x.isna().any()]] + + train_x = pd.DataFrame(clf[:-1].transform(train_x)) + test_x = pd.DataFrame(clf[:-1].transform(test_x)) + + if self.model_args['add_missing_indicator']: + train_x.columns = missing_cols + test_x.columns = missing_cols + else: + train_x.columns = columns + test_x.columns = columns + + train_x.index = idx_train + test_x.index = idx_test + datasets = {"train_x": train_x, @@ -259,7 +271,10 @@ def train(self, datasets): "train_y": train_y, "test_y": test_y} - return clf, datasets, test_y_hat + explainer = shap.TreeExplainer(clf['XGB']) + shap_values = explainer.shap_values(test_x) + + return clf, datasets, test_y_hat, shap_values, test_x def score(self, clf, datasets, test_y_hat, rep): ''' Scores the individual prediction per outcome. @@ -308,7 +323,7 @@ def score(self, clf, datasets, test_y_hat, rep): return score - def evaluate(self, clf, datasets, scores): + def evaluate(self, clf, datasets, scores, hospitals=None): ''' Evaluate the results of the modelling process, such as, feature importances. 
NOTE: No need to show plots here, plt.show is called right @@ -329,9 +344,15 @@ def evaluate(self, clf, datasets, scores): cms = [score['conf_mats'] for score in scores] thresholds = [score['thr'] for score in scores] + if self.model_args['apply_feature_selection']: + self.vote_best_featureset() + self.analyse_fpr(cms, thresholds) - fig, ax = self.plot_model_results([score['roc_auc'] for score in scores]) - fig2, ax2 = self.plot_model_weights(datasets['test_x'].columns, clf, + fig, ax = self.plot_model_results([score['roc_auc'] for score in scores], + hospitals=hospitals) + if not self.model_args['apply_feature_selection']\ + and not self.model_args['add_missing_indicator']: + fig2, ax2 = self.plot_model_weights(datasets['test_x'].columns, clf, show_n_features=self.evaluation_args['show_n_features'], normalize_coefs=self.evaluation_args['normalize_coefs']) if self.save_prediction: @@ -340,6 +361,9 @@ def evaluate(self, clf, datasets, scores): def impute_missing_values(self, data, missing_class=-1): data = data.copy() # Prevents copy warning + vars_binary = get_fields_per_type(data, self.data_struct, 'radio') + data.loc[:, vars_binary] = data[vars_binary].fillna(0, axis=0) + # Categorical vars_categorical = get_fields_per_type(data, self.data_struct, 'category') data.loc[:, vars_categorical] = data[vars_categorical].fillna(missing_class, axis=0) @@ -353,85 +377,29 @@ def impute_missing_values(self, data, missing_class=-1): # data = data.fillna(0).astype(float) return data - def add_engineered_features(self, train_x, test_x): - ''' Generate and add features''' - # TODO: Add PCA to preprocessing pipeline to avoid leakage in CV. 
- columns = train_x.columns - train_x = train_x.reset_index(drop=True) - test_x = test_x.reset_index(drop=True) - - scaler = MinMaxScaler().fit(train_x) - # scaler = StandardScaler().fit(train_x) - train_x = pd.DataFrame(scaler.transform(train_x), columns=columns) - test_x = pd.DataFrame(scaler.transform(test_x), columns=columns) - - # Dimensional transformation - if self.model_args['apply_pca']: - train_x, test_x = self.get_pca(train_x, test_x) - - return train_x, test_x - - def get_pca(self, train_x, test_x): - pca_col_names = ['PC {:d}'.format(i+1) for i in range(self.model_args['pca_n_components'])] - pca = PCA(n_components=self.model_args['pca_n_components']) - pca = pca.fit(train_x) - pcs_train = pd.DataFrame(pca.transform(train_x), columns=pca_col_names) - pcs_test = pd.DataFrame(pca.transform(test_x), columns=pca_col_names) - train_x = pd.concat([train_x, pcs_train], axis=1) - test_x = pd.concat([test_x, pcs_test], axis=1) - - return train_x, test_x - - def feature_contribution(self, clf, x, y, plot_graph=False, plot_n_features=None, - n_cv=2, method='predict_proba'): - - plot_n_features = x.shape[1] if not plot_n_features else plot_n_features - y_hat = cross_val_predict(clf, x, y, cv=n_cv, method=method) - baseline_score = roc_auc_score(y, y_hat[:, 1]) - - importances = np.array([]) - - for col in x.columns: - x_tmp = x.drop(col, axis=1) - y_hat = cross_val_predict(clf, x_tmp, y, cv=n_cv, method=method) - score = roc_auc_score(y, y_hat[:, 1]) - importances = np.append(importances, baseline_score-score) - - if plot_graph: - idc = np.argsort(importances) - columns = x.columns[idc] - fig, ax = plt.subplots(1, 1) - ax.plot(importances[idc[-plot_n_features:]]) - ax.axhline(0, color='k', linewidth=.5) - ax.set_xticks(np.arange(x.shape[1])) - ax.set_xticklabels(columns[-plot_n_features:], rotation=90, fontdict={'fontsize': 6}) - ax.set_xlabel('Features') - ax.set_ylabel('Difference with baseline') - fig.tight_layout() - - return importances - - def 
plot_model_results(self, - aucs): # , classifier='Logistic regression', outcome='ICU admission'): - + def plot_model_results(self, aucs, hospitals=[]): # , classifier='Logistic regression', outcome='ICU admission'): avg = sum(aucs) / max(len(aucs), 1) std = sqrt(sum([(auc - avg) ** 2 for auc in aucs]) / len(aucs)) sem = std / sqrt(len(aucs)) fig, ax = plt.subplots(1, 1) ax.plot(aucs) - ax.set_title('{}\nROC AUC: {:.3f} \u00B1 {:.3f} (95% CI)' - .format('XGB', avg, sem * 1.96)) + # ax.set_title('{}\nROC AUC: {:.3f} \u00B1 {:.3f} (95% CI)' + # .format('Logistic Regression', avg, sem * 1.96)) + ax.set_title('{}\nAUC: {:.2f} ({:.2f} to {:.2f}) (95% CI)' + .format('XGB', avg, avg-(sem*1.96), avg+(sem*1.96))) ax.axhline(sum(aucs) / max(len(aucs), 1), color='g', linewidth=1) ax.axhline(.5, color='r', linewidth=1) ax.set_ylim(0, 1) - ax.set_xlabel('Iteration') + ax.set_xlabel('Test Fold') + if any(hospitals): + ax.set_xticks(list(range(hospitals.size))) + ax.set_xticklabels(hospitals) ax.set_ylabel('AUC') - ax.legend(['ROC AUC', 'Average', 'Chance level'], bbox_to_anchor=(1, 0.5)) + ax.legend(['AUC', 'Average', 'Chance level'], bbox_to_anchor=(1, 0.5)) fig.tight_layout() fig.savefig(self.save_path + '_Performance_roc_auc_{}_{}.png'.format(avg, sem * 1.96), figsize=self.fig_size, dpi=self.fig_dpi) - return fig, ax def analyse_fpr(self, cms, thresholds): @@ -497,72 +465,115 @@ def analyse_fpr(self, cms, thresholds): fig.savefig(self.save_path + '_average_roc.png', figsize=self.fig_size, dpi=self.fig_dpi) - def plot_model_weights(self, feature_labels, clf, - show_n_features=10, normalize_coefs=False): - if self.model_args['add_missing_indicator']: - # ADD featuere labels for missing feature indicators - feature_labels = feature_labels.to_list() \ - + ['m_id_{}'.format(feature_labels[i])\ - for i in clf.named_steps.imputer.indicator_.features_] - - coefs = self.coefs - intercepts = self.intercepts - coefs = np.array(coefs).squeeze() - intercepts = 
np.array(intercepts).squeeze() + def plot_model_weights(self, feature_labels, clf, + show_n_features=10, normalize_coefs=False): - if len(coefs.shape) <= 1: - return + feature_labels = self.get_feature_labels(feature_labels, clf) - show_n_features = coefs.shape[1] if show_n_features is None else show_n_features + # FIXME + if self.model_args['apply_pca']: + print('UNEVEN FEATURE LENGTH. CANT PLOT WEIGHTS') + return None, None - coefs = (coefs - coefs.mean(axis=0)) / coefs.std(axis=0) if normalize_coefs else coefs + coefs = self.coefs + intercepts = self.intercepts + coefs = np.array(coefs).squeeze() + intercepts = np.array(intercepts).squeeze() - avg_coefs = abs(coefs.mean(axis=0)) - mask_not_nan = ~np.isnan(avg_coefs) # Remove non-existent weights - avg_coefs = avg_coefs[mask_not_nan] + if len(coefs.shape) <= 1: + return - var_coefs = coefs.var(axis=0)[mask_not_nan] if not normalize_coefs else None - idx_sorted = avg_coefs.argsort() - n_bars = np.arange(avg_coefs.shape[0]) + np.save('xgb_coefs.npy', coefs) + + with open(self.save_path + '_coefs.txt', 'w') as f: + f.write('{}'.format(coefs)) + + show_n_features = coefs.shape[1] if show_n_features is None else show_n_features + + odds = np.exp(coefs) + odds_avg = odds.mean(axis=0)-1 + odds_var = odds.var(axis=0) + + idx_sorted = odds_avg.argsort() + n_bars = np.arange(odds_avg.size) + labels = np.array([self.var_dict.get(c, c) for c in feature_labels]) + fontsize = 5.75 if labels.size < 50 else 5 + bar_width = .5 # bar width + + fig, ax = plt.subplots() + ax.set_title('XGBoost - Odds ratios') + ax.barh(n_bars, odds_avg[idx_sorted], bar_width, xerr=odds_var[idx_sorted], + label='Weight') + ax.set_yticks(n_bars) + ax.set_yticklabels(labels[idx_sorted], fontdict={ 'fontsize': fontsize }) + ax.set_ylim(n_bars[0] - .5, n_bars[-1] + .5) + ax.set_xlabel('Odds ratio') + fig.tight_layout() + fig.savefig(self.save_path + '_Average_weight_variance.png', + figsize=self.fig_size, dpi=self.fig_dpi) + return fig, ax + + def 
get_feature_labels(self, labels, clf): + steps = clf.named_steps.keys() + labels = np.array(labels) - labels = [self.var_dict.get(c) for c in feature_labels] - has_no_label = labels for i, l in enumerate(labels): if l == None: - labels[i] = feature_labels[i] + labels[i] = labels[i] elif 'chronic cardiac disease' in l.lower(): labels[i] = 'Chronic Cardiac Disease (Not hypertension)' elif 'home medication' in l.lower(): labels[i] = 'Home medication' - labels = np.array(labels) - fontsize = 5.75 if labels.size < 50 else 5 - bar_width = .5 # bar width - fig, ax = plt.subplots() - if normalize_coefs: - ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, label='Weight') - ax.set_title('XGB - Average weight value') - else: - ax.set_title('XGB - Average weight value') - ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, xerr=var_coefs[idx_sorted], - label='Weight') - ax.set_yticks(n_bars) - ax.set_yticklabels(labels[idx_sorted], fontdict={ 'fontsize': fontsize }) - ax.set_ylim(n_bars[0] - .5, n_bars[-1] + .5) - ax.set_xlabel('Weight{}'.format(' (normalized)' if normalize_coefs else '')) - fig.tight_layout() - fig.savefig(self.save_path + '_Average_weight_variance.png', - figsize=self.fig_size, dpi=self.fig_dpi) - return fig, ax + # NOTE: use loop over steps to be able to switch order + if self.model_args['add_missing_indicator']: + labels = labels.to_list() \ + + ['m_id_{}'.format(labels[i])\ + for i in clf.named_steps.imputer.indicator_.features_] + + if 'polynomials' in steps: + # N_features = n_features (n) + # + n_combinations_without_repeat (k) + # + bias (if true) + # = n + (n!)/(k!(n-k)!) 
+ 1 + + labels = np.array(clf.named_steps.polynomials.get_feature_names(labels)) + + if 'feature_selection' in steps: + # TODO: check if also works with adding_missing_indicators + k = self.model_args['n_features'] + labels = labels[np.argsort(clf.named_steps\ + .feature_selection\ + .pvalues_)[0:k]] + + return labels + return labels + + def vote_best_featureset(self): + nansum = lambda x: sum([i for i in x if str(i)!='nan']) + # Get list by voting. i.e sorted list with most occurences + columns_list = [sorted(fset[0]) for fset in self.n_best_features] + result = pd.DataFrame(self.n_best_features, columns=['columns', 'fvalues', 'pvalues']) + result['fsum'] = result['fvalues'].apply(nansum) + result['psum'] = result['pvalues'].apply(nansum) + result['columns'] = result['columns'].apply(sorted) + result.to_excel('_xgb_feature_selection_results.xlsx') + + counts = pd.Series(columns_list).value_counts() + counts.to_excel(self.save_path + '_xgb_feature_selection_votes.xlsx') + print('votes={} for {}'.format(counts.iloc[0], counts.index[0])) def save_prediction_to_file(self, scores): - x = pd.concat([score['x'] for score in scores], axis=0).reset_index(drop=True) - y_hat = pd.concat([pd.Series(score['y_hat']) for score in scores], axis=0) \ - .reset_index(drop=True) - y = pd.concat([score['y'] for score in scores], axis=0).reset_index(drop=True) - - df = pd.concat([x, y, y_hat], axis=1) - df.columns=list(x.columns)+['y', 'y_hat'] + x = pd.concat([score['x'] for score in scores], axis=0) + y_hat = pd.concat([pd.Series(score['y_hat']) for score in scores], axis=0) + y_hat.index = x.index + y = pd.concat([score['y'] for score in scores], axis=0) + self.hospital.index = x.index + self.days_until_outcome.index = self.days_until_outcome.index + df = pd.concat([x, y, y_hat, + self.hospital, + self.days_until_outcome], axis=1) + df.columns=list(x.columns)+['y', 'y_hat', 'hospital', 'days_until_death'] filename = self.save_path + '_prediction.pkl' df.to_pickle(filename) @@ -607,3 
+618,61 @@ def get_fields_per_type(data, data_struct, type): # ax.set_title('FALSE POSITIVE RATE\n Mean mean and median over FPRs at different\nthresholds for all training iterations\nmean: {:.3f} \u00B1 {:.3f}\nmedian: {:.3f} \u00B1 {:.3f}' # .format(means_mets['mean'], means_mets['ci'], median_mets['mean'], median_mets['ci'])) # fig.savefig('FPR_results.png') + + # def plot_model_weights(self, feature_labels, clf, + # show_n_features=10, normalize_coefs=False): + # if self.model_args['add_missing_indicator']: + # # ADD featuere labels for missing feature indicators + # feature_labels = feature_labels.to_list() \ + # + ['m_id_{}'.format(feature_labels[i])\ + # for i in clf.named_steps.imputer.indicator_.features_] + + # coefs = self.coefs + # intercepts = self.intercepts + # coefs = np.array(coefs).squeeze() + # intercepts = np.array(intercepts).squeeze() + + # if len(coefs.shape) <= 1: + # return + + # show_n_features = coefs.shape[1] if show_n_features is None else show_n_features + + # coefs = (coefs - coefs.mean(axis=0)) / coefs.std(axis=0) if normalize_coefs else coefs + + # avg_coefs = abs(coefs.mean(axis=0)) + # mask_not_nan = ~np.isnan(avg_coefs) # Remove non-existent weights + # avg_coefs = avg_coefs[mask_not_nan] + + # var_coefs = coefs.var(axis=0)[mask_not_nan] if not normalize_coefs else None + # idx_sorted = avg_coefs.argsort() + # n_bars = np.arange(avg_coefs.shape[0]) + + # labels = [self.var_dict.get(c) for c in feature_labels] + # has_no_label = labels + # for i, l in enumerate(labels): + # if l == None: + # labels[i] = feature_labels[i] + # elif 'chronic cardiac disease' in l.lower(): + # labels[i] = 'Chronic Cardiac Disease (Not hypertension)' + # elif 'home medication' in l.lower(): + # labels[i] = 'Home medication' + # labels = np.array(labels) + # fontsize = 5.75 if labels.size < 50 else 5 + # bar_width = .5 # bar width + + # fig, ax = plt.subplots() + # if normalize_coefs: + # ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, label='Weight') 
+ # ax.set_title('XGB - Average weight value') + # else: + # ax.set_title('XGB - Average weight value') + # ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, xerr=var_coefs[idx_sorted], + # label='Weight') + # ax.set_yticks(n_bars) + # ax.set_yticklabels(labels[idx_sorted], fontdict={ 'fontsize': fontsize }) + # ax.set_ylim(n_bars[0] - .5, n_bars[-1] + .5) + # ax.set_xlabel('Weight{}'.format(' (normalized)' if normalize_coefs else '')) + # fig.tight_layout() + # fig.savefig(self.save_path + 'XGB_Average_weight_variance.png', + # figsize=self.fig_size, dpi=self.fig_dpi) + # return fig, ax \ No newline at end of file diff --git a/Classifiers/gradboost.py b/Classifiers/gradboost.py deleted file mode 100644 index b3080e4..0000000 --- a/Classifiers/gradboost.py +++ /dev/null @@ -1,50 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Apr 17 16:27:52 2020 - -@author: rriccilopes -""" - -import matplotlib.pyplot as plt -import pandas as pd -import numpy as np - -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.metrics import roc_auc_score -from sklearn.model_selection import GridSearchCV - -import warnings -from sklearn.exceptions import ConvergenceWarning -warnings.filterwarnings(action='ignore', category=ConvergenceWarning) - - -def train_gradient_boosting(train_x, test_x, train_y, test_y, **kwargs): - """ - - kwargs - ---------- - plot_graph : bool - gridsearch : bool - """ - gridsearch = False if not kwargs.get('gridsearch') else kwargs.get('gridsearch') - - # Initialize Classifier - clf = GradientBoostingClassifier(random_state=0) - - if gridsearch: - grid = { - "n_estimators": [100, 500, 1000, 2000], - "min_samples_split": [2, 4], - "min_samples_leaf": [1, 2, 4], - "max_depth": [None, 2, 3], - "max_features": ['auto'] - } - clf = GridSearchCV(estimator=clf, param_grid=grid, cv=5, n_jobs=-2, scoring='roc_auc') - clf.fit(train_x, train_y) # Model - clf = clf.best_estimator_ - else: - clf.fit(train_x, train_y) - - test_y_hat = 
clf.predict_proba(test_x) # Predict - - return clf, train_x, test_x, train_y, test_y, test_y_hat \ No newline at end of file diff --git a/Classifiers/logreg.py b/Classifiers/logreg.py index 1a9e49d..a2b10d5 100644 --- a/Classifiers/logreg.py +++ b/Classifiers/logreg.py @@ -1,37 +1,38 @@ -''' +''' DO NOT OVERWRITE THIS FILE, MAKE A COPY TO EDIT! This file contains blueprint for the model that is used -in covid19_ICU_admission. +in covid19_ICU_admission. This class should take care of training a classifier/regressor, -scoring each iteration and evaluating post training. +scoring each iteration and evaluating post training. -Make sure the names, inputs and outputs of the predefined methods stay +Make sure the names, inputs and outputs of the predefined methods stay the same! Model parameters: Please store all parameters in the defined dicts in __init__(). -This way it is easy to change some parameters without changing -the main code and without the need for extra config files. +This way it is easy to change some parameters without changing +the main code and without the need for extra config files. Most important naming conventions and variables: Model: The whole class in this file. This means that "model" takes care of training, scoring and evaluating Clf: This is the actually classifier/regressor. It can be for example - the trained instance from the Sklearn package + the trained instance from the Sklearn package (e.g. 
LogisticRegression()) Datasets: dictionary containin all training and test sets: train_x, train_y, test_x, test_y, test_y_hat -..._args: Dictionary that holds the parameters that are used as +..._args: Dictionary that holds the parameters that are used as input for train, score or evaluate - + ''' +import os from math import sqrt - +import shap import matplotlib.pyplot as plt import pandas as pd import numpy as np @@ -39,11 +40,15 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import PolynomialFeatures +from sklearn.pipeline import Pipeline from sklearn.feature_selection import SelectFpr from sklearn.feature_selection import SelectKBest from sklearn.model_selection import cross_val_predict from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.model_selection import RandomizedSearchCV +from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression from sklearn.metrics import auc as auc_calc from sklearn.metrics import roc_curve @@ -51,46 +56,46 @@ from sklearn.metrics import confusion_matrix from sklearn.metrics import plot_confusion_matrix -import os -from sklearn.pipeline import Pipeline + from sklearn.model_selection import GridSearchCV -from sklearn.impute import SimpleImputer import warnings from sklearn.exceptions import ConvergenceWarning +# from sklearn.exceptions import UserWarning +from sklearn.experimental import enable_iterative_imputer # Required to enable experimental iterative imputer from sklearn.impute import IterativeImputer from missingpy import MissForest -# warnings.filterwarnings(action='ignore', category=ConvergenceWarning) +warnings.filterwarnings(action='ignore', category=ConvergenceWarning) +# warnings.filterwarnings(action='ignore', category=UserWarning) class LogReg: def __init__(self): ''' Initialize model. 
- Save all model parameters here. + Save all model parameters here. Please don't change the names of the preset parameters, as they might be called outside - this class. + this class. ''' self.goal = None self.data_struct = None self.model_args = { + 'imputer': 'iterative', # Simple, iterative, forest 'add_missing_indicator': False, - 'apply_polynomials': False, - 'apply_feature_selection': False, 'n_features': 10, - 'apply_pca': False, 'pca_n_components': 3, - 'grid_search': True } self.grid = { - 'PCA__n_components': list(range(1, 11)), - 'LR__l1_ratio': [i / 10 for i in range(0, 11, 1)] + 'LR__l1_ratio': [i / 10 for i in range(0, 11, 2)], + 'LR__C': np.logspace(-4, 4, 10), + 'LR__penalty': ['l2','elasticnet'], + 'LR__max_iter': [100, 300, 500] } self.score_args = { @@ -99,13 +104,15 @@ def __init__(self): self.evaluation_args = { 'show_n_features': None, - 'normalize_coefs': True, + 'normalize_coefs': False, 'plot_analyse_fpr': False} self.coefs = [] self.intercepts = [] self.n_best_features = [] + self.trained_classifiers = [] + self.learn_size = [] self.save_path = '' @@ -113,23 +120,24 @@ def __init__(self): self.fig_dpi = 600 self.random_state = 0 - self.save_prediction = False - self.hospital = None + self.save_prediction = True + self.hospital = pd.Series() + self.days_until_outcome = pd.Series() def train(self, datasets): ''' Initialize, train and predict a classifier. This includes: Feature engineering (i.e. PCA) and - selection, training clf, (hyper)parameter optimization, + selection, training clf, (hyper)parameter optimization, and a prediction on the test set. Make sure to save all variables you want to keep track of in the instance. - Input: + Input: datasets:: dict Contains train and test x, y - + Output: clf:: instance, dict, list, None - Trained classifier/regressor instance, such as + Trained classifier/regressor instance, such as sklearn logistic regression. 
Is not used outside this file, so can be left empty datasets:: dict @@ -150,6 +158,7 @@ def train(self, datasets): self.learn_size += [{'tr_x': train_x.shape, 'tr_y': train_y.shape, 'te_x': test_x.shape, 'te_y': test_y.shape}] + # Imputes non numeric train_x = self.impute_missing_values(train_x) test_x = self.impute_missing_values(test_x) @@ -160,10 +169,14 @@ def train(self, datasets): if self.model_args['grid_search']: # print("Train classfier using grid search for best parameters.") cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=self.random_state) - grid = GridSearchCV(self.pipeline, param_grid=self.grid, cv=cv, - scoring='roc_auc', n_jobs=-2) + grid = RandomizedSearchCV(self.pipeline, param_distributions=self.grid, cv=cv, + scoring='roc_auc', n_jobs=-2, + verbose=1, n_iter=50) + # grid = GridSearchCV(self.pipeline, param_grid=self.grid, cv=cv, + # scoring='roc_auc', n_jobs=-2) grid.fit(train_x, train_y) clf = grid.best_estimator_ + self.trained_classifiers += [clf] # print("Best estimator: ", clf) else: # Train classifier without optimization. 
@@ -174,21 +187,47 @@ def train(self, datasets): self.intercepts.append(clf.named_steps['LR'].intercept_) test_y_hat = clf.predict_proba(test_x) # Predict - + if 'feature_selection' in clf.named_steps: - columns = train_x.columns[np.argsort(clf.named_steps\ - .feature_selection\ - .pvalues_)][0:self.model_args['n_features']].to_list() + idx_sorted = np.argsort(clf['feature_selection']\ + .pvalues_) + f_values = clf['feature_selection'].scores_ + p_values = clf['feature_selection'].pvalues_ + columns = train_x.columns[idx_sorted[0:self.model_args['n_features']]].to_list() + self.n_best_features += [[columns, f_values, p_values]] + print(columns) else: columns = train_x.columns - train_x = pd.DataFrame(clf[:-1].transform(train_x), columns=columns) - train_x = pd.DataFrame(clf[:-1].transform(test_x), columns=columns) + + idx_train = train_x.index + idx_test = test_x.index + + if self.model_args['add_missing_indicator']: + missing_cols = columns.to_list()\ + + ['{}_nan'.format(c) + for c in train_x.loc[:, train_x.isna().any()]] + + train_x = pd.DataFrame(clf[:-1].transform(train_x)) + test_x = pd.DataFrame(clf[:-1].transform(test_x)) + + if self.model_args['add_missing_indicator']: + train_x.columns = missing_cols + test_x.columns = missing_cols + else: + train_x.columns = columns + test_x.columns = columns + + train_x.index = idx_train + test_x.index = idx_test datasets = {"train_x": train_x, "test_x": test_x, "train_y": train_y, "test_y": test_y} - return clf, datasets, test_y_hat + explainer = shap.LinearExplainer(clf['LR'], test_x) + shap_values = explainer.shap_values(test_x) + + return clf, datasets, test_y_hat, shap_values, test_x def score(self, clf, datasets, test_y_hat, rep): ''' Scores the individual prediction per outcome. 
@@ -234,10 +273,10 @@ def score(self, clf, datasets, test_y_hat, rep): 'y': datasets['test_y'], 'y_hat': test_y_hat } - + return score - - def evaluate(self, clf, datasets, scores): + + def evaluate(self, clf, datasets, scores, hospitals=None): ''' Evaluate the results of the modelling process, such as, feature importances. NOTE: No need to show plots here, plt.show is called right @@ -253,44 +292,53 @@ def evaluate(self, clf, datasets, scores): List of all scores generated per training iteration. ''' - self.var_dict = dict(zip(self.data_struct['Field Variable Name'], + self.var_dict = dict(zip(self.data_struct['Field Variable Name'], self.data_struct['Field Label'])) cms = [score['conf_mats'] for score in scores] thresholds = [score['thr'] for score in scores] + if self.model_args['apply_feature_selection']: + # self.save_path += 'k{}'.format(self.model_args['n_features']) + self.vote_best_featureset() + self.analyse_fpr(cms, thresholds) - fig, ax = self.plot_model_results([score['roc_auc'] for score in scores]) - fig2, ax2 = self.plot_model_weights(datasets['test_x'].columns, clf, - show_n_features=self.evaluation_args['show_n_features'], - normalize_coefs=self.evaluation_args['normalize_coefs']) + fig, ax = self.plot_model_results([score['roc_auc'] for score in scores], + hospitals=hospitals) + + if not self.model_args['apply_feature_selection']\ + and not self.model_args['add_missing_indicator']: + fig2, ax2 = self.plot_model_weights(datasets['test_x'].columns, clf, + show_n_features=self.evaluation_args['show_n_features'], + normalize_coefs=self.evaluation_args['normalize_coefs']) if self.save_prediction: self.save_prediction_to_file(scores) - - def define_imputer(self,impute_type): + + def define_imputer(self, impute_type): '''Initialize the imputer to be used for every iteration. 
- + Input: - impute_type: string, {'simple': SimpleImputer, + impute_type: string, {'simple': SimpleImputer, 'iterative': IterativeImputer and 'forest': RandomForest imputer} Output: - Imputer: imputer object to be used in the pipeline + Imputer: imputer object to be used in the pipeline ''' if impute_type=='simple': self.imputer = SimpleImputer(missing_values=np.nan, strategy='median', - add_indicator=self.model_args['add_missing_indicator']) - else: - if impute_type=='iterative': - self.imputer = IterativeImputer(missing_values=np.nan, initial_strategy='median', - add_indicator=self.model_args['add_missing_indicator']) - else: - if impute_type=='forest': - self.imputer = MissForest(random_state=self.random_state,n_jobs=-2) - + add_indicator=self.model_args['add_missing_indicator']) + elif impute_type=='iterative': + self.imputer = IterativeImputer(missing_values=np.nan, initial_strategy='median', + add_indicator=self.model_args['add_missing_indicator']) + elif impute_type=='forest': + self.imputer = MissForest(random_state=self.random_state,n_jobs=-2) + def get_pipeline(self): - self.define_imputer('iterative') - steps = [('imputer', self.imputer), - ('scaler', MinMaxScaler())] + self.define_imputer(self.model_args['imputer']) + + steps = [ + ('imputer', self.imputer), + ('scaler', RobustScaler()) + ] if self.model_args['apply_polynomials']: steps += [('polynomials', PolynomialFeatures(interaction_only=True))] @@ -309,30 +357,29 @@ def get_pipeline(self): for key in keys: del self.grid[key] - steps += [('LR', LogisticRegression(solver='saga', - penalty='elasticnet', #class_weight='balanced', + steps += [('LR', LogisticRegression(solver='saga', + penalty='elasticnet', #class_weight='balanced', l1_ratio=.5, max_iter=200, - random_state=self.random_state))] + random_state=self.random_state))] return Pipeline(steps) - def impute_missing_values(self, data, missing_class=-1): + def impute_missing_values(self, data, missing_class=0): + # Imputes all missing values from 
non-numeric columns data = data.copy() # Prevents copy warning + vars_binary = get_fields_per_type(data, self.data_struct, 'radio') + data.loc[:, vars_binary] = data[vars_binary].fillna(0, axis=0) + # Categorical vars_categorical = get_fields_per_type(data, self.data_struct, 'category') data.loc[:, vars_categorical] = data[vars_categorical].fillna(missing_class, axis=0) - - # # Numeric - # vars_numeric = get_fields_per_type(data, self.data_struct, 'numeric') - # data.loc[:, vars_numeric] = data.loc[:, vars_numeric] \ - # .fillna(data.loc[:, vars_numeric] \ - # .median()) - # data = data.fillna(0).astype(float) + + return data - def plot_model_results(self, - aucs): # , classifier='Logistic regression', outcome='ICU admission'): + def plot_model_results(self, aucs, hospitals=[]): + # , classifier='Logistic regression', outcome='ICU admission'): avg = sum(aucs) / max(len(aucs), 1) std = sqrt(sum([(auc - avg) ** 2 for auc in aucs]) / len(aucs)) @@ -340,14 +387,19 @@ def plot_model_results(self, fig, ax = plt.subplots(1, 1) ax.plot(aucs) - ax.set_title('{}\nROC AUC: {:.3f} \u00B1 {:.3f} (95% CI)' - .format('Logistic Regression', avg, sem * 1.96)) + # ax.set_title('{}\nROC AUC: {:.3f} \u00B1 {:.3f} (95% CI)' + # .format('Logistic Regression', avg, sem * 1.96)) + ax.set_title('{}\nAUC: {:.2f} ({:.2f} to {:.2f}) (95% CI)' + .format('Logistic Regression', avg, avg-(sem*1.96), avg+(sem*1.96))) ax.axhline(sum(aucs) / max(len(aucs), 1), color='g', linewidth=1) ax.axhline(.5, color='r', linewidth=1) ax.set_ylim(0, 1) - ax.set_xlabel('Iteration') + ax.set_xlabel('Test Fold') + if any(hospitals): + ax.set_xticks(list(range(hospitals.size))) + ax.set_xticklabels(hospitals) ax.set_ylabel('AUC') - ax.legend(['ROC AUC', 'Average', 'Chance level'], bbox_to_anchor=(1, 0.5)) + ax.legend(['AUC', 'Average', 'Chance level'], bbox_to_anchor=(1, 0.5)) fig.tight_layout() fig.savefig(self.save_path + '_Performance_roc_auc_{}_{}.png'.format(avg, sem * 1.96), figsize=self.fig_size, 
dpi=self.fig_dpi) @@ -417,11 +469,11 @@ def analyse_fpr(self, cms, thresholds): fig.savefig(self.save_path + '_average_roc.png', figsize=self.fig_size, dpi=self.fig_dpi) - def plot_model_weights(self, feature_labels, clf, + def plot_model_weights(self, feature_labels, clf, show_n_features=10, normalize_coefs=False): feature_labels = self.get_feature_labels(feature_labels, clf) - + # FIXME if self.model_args['apply_pca']: print('UNEVEN FEATURE LENGTH. CANT PLOT WEIGHTS') @@ -435,34 +487,32 @@ def plot_model_weights(self, feature_labels, clf, if len(coefs.shape) <= 1: return - show_n_features = coefs.shape[1] if show_n_features is None else show_n_features + # Save Coeffs + np.save('lr_coefs.npy', coefs) - coefs = (coefs - coefs.mean(axis=0)) / coefs.std(axis=0) if normalize_coefs else coefs + with open(self.save_path + '_coefs.txt', 'w') as f: + f.write('{}'.format(coefs)) - avg_coefs = abs(coefs.mean(axis=0)) - mask_not_nan = ~np.isnan(avg_coefs) # Remove non-existent weights - avg_coefs = avg_coefs[mask_not_nan] + show_n_features = coefs.shape[1] if show_n_features is None else show_n_features - var_coefs = coefs.var(axis=0)[mask_not_nan] if not normalize_coefs else None - idx_sorted = avg_coefs.argsort() - n_bars = np.arange(avg_coefs.shape[0]) + odds = np.exp(coefs) + odds_avg = odds.mean(axis=0)-1 + odds_var = odds.var(axis=0) - labels = np.array([self.var_dict.get(c, c) for c in feature_labels]) + idx_sorted = odds_avg.argsort() + n_bars = np.arange(odds_avg.size) + labels = np.array([self.var_dict.get(c, c) for c in feature_labels]) fontsize = 5.75 if labels.size < 50 else 5 bar_width = .5 # bar width - fig, ax = plt.subplots() - if normalize_coefs: - ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, label='Weight') - ax.set_title('Logistic regression - Average weight value') - else: - ax.set_title('Logistic regression - Average weight value') - ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, xerr=var_coefs[idx_sorted], + fig, ax = plt.subplots() + 
ax.set_title('Logistic regression - Odds ratios') + ax.barh(n_bars, odds_avg[idx_sorted], bar_width, xerr=odds_var[idx_sorted], label='Weight') ax.set_yticks(n_bars) ax.set_yticklabels(labels[idx_sorted], fontdict={ 'fontsize': fontsize }) ax.set_ylim(n_bars[0] - .5, n_bars[-1] + .5) - ax.set_xlabel('Weight{}'.format(' (normalized)' if normalize_coefs else '')) + ax.set_xlabel('Odds ratio') fig.tight_layout() fig.savefig(self.save_path + '_Average_weight_variance.png', figsize=self.fig_size, dpi=self.fig_dpi) @@ -472,20 +522,20 @@ def get_feature_labels(self, labels, clf): steps = clf.named_steps.keys() labels = np.array(labels) - for i, l in enumerate(labels): - if l == None: - labels[i] = labels[i] - elif 'chronic cardiac disease' in l.lower(): - labels[i] = 'Chronic Cardiac Disease (Not hypertension)' - elif 'home medication' in l.lower(): - labels[i] = 'Home medication' - - # NOTE: use loop over steps to be able to switch order + # NOTE: use loop over steps to be able to switch order if self.model_args['add_missing_indicator']: - labels = labels.to_list() \ + labels = list(labels) \ + ['m_id_{}'.format(labels[i])\ for i in clf.named_steps.imputer.indicator_.features_] - + else: + for i, l in enumerate(labels): + if l == None: + labels[i] = labels[i] + elif 'chronic cardiac disease' in str(l).lower(): + labels[i] = 'Chronic Cardiac Disease (Not hypertension)' + elif 'home medication' in str(l).lower(): + labels[i] = 'Home medication' + if 'polynomials' in steps: # N_features = n_features (n) # + n_combinations_without_repeat (k) @@ -504,13 +554,31 @@ def get_feature_labels(self, labels, clf): return labels return labels + def vote_best_featureset(self): + nansum = lambda x: sum([i for i in x if str(i)!='nan']) + # Get list by voting. 
i.e sorted list with most occurences + columns_list = [sorted(fset[0]) for fset in self.n_best_features] + result = pd.DataFrame(self.n_best_features, columns=['columns', 'fvalues', 'pvalues']) + result['fsum'] = result['fvalues'].apply(nansum) + result['psum'] = result['pvalues'].apply(nansum) + result['columns'] = result['columns'].apply(sorted) + result.to_excel('_lr_feature_selection_results.xlsx') + + counts = pd.Series(columns_list).value_counts() + counts.to_excel(self.save_path + '_lr_feature_selection_votes.xlsx') + print('votes={} for {}'.format(counts.iloc[0], counts.index[0])) + def save_prediction_to_file(self, scores): x = pd.concat([score['x'] for score in scores], axis=0) y_hat = pd.concat([pd.Series(score['y_hat']) for score in scores], axis=0) y_hat.index = x.index y = pd.concat([score['y'] for score in scores], axis=0) - df = pd.concat([x, y, y_hat, self.hospital], axis=1) - df.columns=list(x.columns)+['y', 'y_hat', 'hospital'] + self.hospital.index = x.index + self.days_until_outcome.index = self.days_until_outcome.index + df = pd.concat([x, y, y_hat, + self.hospital, + self.days_until_outcome], axis=1) + df.columns=list(x.columns)+['y', 'y_hat', 'hospital', 'days_until_death'] filename = self.save_path + '_prediction.pkl' df.to_pickle(filename) @@ -532,8 +600,8 @@ def get_metrics(lst): 'sem': sem, 'ci': ci} -def get_fields_per_type(data, data_struct, type): - fields = data_struct.loc[data_struct['Field Type']=='category', +def get_fields_per_type(data, data_struct, type_): + fields = data_struct.loc[data_struct['Field Type']==type_, 'Field Variable Name'].to_list() return [field for field in fields if field in data.columns] @@ -580,7 +648,7 @@ def get_fields_per_type(data, data_struct, type): # baseline_score = roc_auc_score(y, y_hat[:, 1]) # importances = np.array([]) - + # for col in x.columns: # x_tmp = x.drop(col, axis=1) # y_hat = cross_val_predict(clf, x_tmp, y, cv=n_cv, method=method) diff --git a/covid19_ICU_admission.py 
b/covid19_ICU_admission.py index df5cf27..f7ec90a 100644 --- a/covid19_ICU_admission.py +++ b/covid19_ICU_admission.py @@ -11,6 +11,8 @@ import os import os.path import sys +import shap + path_to_classifiers = r'./Classifiers/' # path_to_settings = r'./' # TODO: add settings sys.path.insert(0, os.path.join(os.path.dirname(__file__), path_to_classifiers)) @@ -51,7 +53,7 @@ # classifiers from logreg import LogReg from XGB import XGB -from gradboost import train_gradient_boosting +#from survival import Survival # data from get_feature_set import get_1_premorbid @@ -64,7 +66,6 @@ is_in_columns = lambda var_list, data: [v for v in var_list if v in data.columns] - def load_data_api(path_credentials): # Try loading objects from disk file; delete saveddata.pkl to force reload @@ -135,6 +136,14 @@ def load_data(path_to_creds): def preprocess(data, data_struct): ''' Processed the data per datatype.''' + # Drop useless data + cols = data_struct.loc[data_struct.loc[:, 'Form Collection Name']\ + .isin(['!!! 
CARDIO (OLD DON’T USE)!!!', + 'Vascular medicine (OPTIONAL)', + 'CARDIOLOGY - CAPACITY (OPTIONAL)']), + 'Field Variable Name'].to_list() + data = data.drop(is_in_columns(cols, data), axis=1) + # Fix single errors data = fix_single_errors(data) @@ -154,7 +163,10 @@ def preprocess(data, data_struct): def prepare_for_learning(data, data_struct, variables_to_incl, variables_to_exclude, goal, group_by_record=True, use_outcome=None, - additional_fn=None, use_imputation=True): + additional_fn=None, + remove_records_threshold_above=1, + remove_features_threshold_above=0.5, + pcr_corona_confirmed_only=True): outcomes, used_columns = calculate_outcomes(data, data_struct) data = pd.concat([data, outcomes], axis=1) @@ -170,34 +182,98 @@ def prepare_for_learning(data, data_struct, variables_to_incl, x, y, all_outcomes = select_x_y(data, outcomes, used_columns, goal) + # Remove samples with missing y + if goal[0] != 'survival': + has_y = y.notna() + x = x.loc[has_y, :] + y = y.loc[has_y] + + # Include only patients with CONFIRMED covid infection (PCR+ or Coronavirus) + # This excludes patients based on CORADS > 4 + if pcr_corona_confirmed_only: + is_confirmed_patient = \ + (data['Coronavirus']==1) |\ + (data['pcr_pos']==1) |\ + (data['corads_admission_cat_4']==1) |\ + (data['corads_admission_cat_5']==1) + x = x.loc[is_confirmed_patient, :] + y = y.loc[is_confirmed_patient] + # Select variables to include in prediction variables_to_incl['Field Variable Name'] += ['hospital'] - x = select_variables(x, data_struct, variables_to_incl) - - # Select variables to exclude - x = x.drop(is_in_columns(variables_to_exclude, x), axis=1) + variables_to_exclude += ['Coronavirus', 'pcr_pos'] + x = select_variables(x, data_struct, + variables_to_incl, + variables_to_exclude) + + # Select time frame + days_until_death = outcomes.loc[x.index, 'Days until death'].copy() + days_until_discharge = outcomes.loc[x.index, 'Days until discharge'].copy() + has_death_week = (days_until_death>=0) & 
(days_until_death<=7) + has_disch_week = (days_until_discharge>=0) & (days_until_discharge<=7) + outcome_in_week = has_death_week | has_disch_week |\ + (outcomes['Levend dag 21 maar nog in het ziekenhuis - totaal']==1) + + # tmp = x.drop(['Record Id', 'hospital'], axis=1) + + # x = x.loc[outcome_in_week, :] + # y = y.loc[outcome_in_week] + days_until_death = days_until_death[outcome_in_week] + + + # Drop features with too many missing + if remove_features_threshold_above is not None: + threshold = remove_features_threshold_above + has_too_many_missing = (x.isna().sum(axis=0)/x.shape[0]) > threshold + x = x.loc[:, ~has_too_many_missing] + print('LOG: dropped features: {}, due to more than {}% missing' + .format(has_too_many_missing.loc[has_too_many_missing].index.to_list(), + threshold*100)) + + # Remove records with too many missing + if remove_records_threshold_above is not None: + threshold = remove_records_threshold_above + has_too_many_missing = ((x.isna().sum(axis=1))/(x.shape[1])) > threshold + print('LOG: Dropped {} records, due to more than {}% missing' + .format(has_too_many_missing.sum(), threshold*100)) + x = x.loc[~has_too_many_missing, :] + y = y.loc[~has_too_many_missing] + + # Combine and Rename hospitals + name_combined = 'Center com' + cutoff = 100 + hospital = x.loc[:, 'hospital'].copy() + counts = hospital.value_counts() + hospital.loc[hospital.isin(counts[counts 1] # Remove columns without information print('LOG: Using <{}:{}> as y.'.format(goal[0], goal[1])) - print('LOG: Class distribution: 1: {}, 0: {}, total: {}'\ - .format(y.value_counts()[1], y.value_counts()[0], y.size)) + # print('LOG: Class distribution: 1: {}, 0: {}, total: {}'\ + # .format(y[y.columns[0]].value_counts()[1], y[y.columns[0]].value_counts()[0], y[y.columns[0]].size)) print('LOG: Selected {} variables for predictive model' - .format(x.columns.size)) + .format(x.columns.size)) - # Remove samples with missing y - if goal[0] != 'survival': - has_y = y.notna() - x = 
x.loc[has_y, :] - y = y.loc[has_y] - - return x, y, data, hospital, records + explore_data(x, y) + + return x, y, data, hospital, records, days_until_death -def train_and_predict(x, y, model, rep, type='subsamp', type_col=None, test_size=0.2): +def train_and_predict(x, y, model, rep, splittype='loho', + hospitals=None, unique_hospitals=None, + days_until_death=None, test_size=0.2): ''' Splits data into train and test set using monte carlo subsampling method, i.e., n random train/test splits using equal class balance. @@ -225,14 +301,17 @@ class instance from on of the clf on test y. ''' - if type == 'loho': + if splittype == 'loho': # Leave-one-hospital-out cross-validation - test_hosp = type_col.unique()[rep] - is_test_hosp = type_col == test_hosp + test_hosp = unique_hospitals[rep] + is_test_hosp = hospitals == test_hosp train_x = x.loc[~is_test_hosp, :] train_y = y.loc[~is_test_hosp] test_x = x.loc[is_test_hosp, :] test_y = y.loc[is_test_hosp] + model.hospital = model.hospital.append(hospitals.loc[is_test_hosp]) + model.days_until_outcome = model.days_until_outcome\ + .append(days_until_death.loc[is_test_hosp]) else: # Default to random subsampling train_x, test_x, train_y, test_y = train_test_split(x, y, @@ -241,8 +320,8 @@ class instance from on of the datasets = {"train_x": train_x, "test_x": test_x, "train_y": train_y, "test_y": test_y} - clf, datasets, test_y_hat = model.train(datasets) - return clf, datasets, test_y_hat + clf, datasets, test_y_hat, shap_values, test_set = model.train(datasets) + return clf, datasets, test_y_hat,shap_values, test_set def score_prediction(model, clf, datasets, test_y_hat, rep): ''' Wrapper for scoring individual predictions made @@ -272,8 +351,10 @@ class instance from on of the score = model.score(clf, datasets, test_y_hat, rep) return score -def evaluate_model(model, clf, datasets, scores): - model.evaluate(clf, datasets, scores) +def evaluate_model(model, clf, datasets, scores, + hospitals=None): + model.evaluate(clf, 
datasets, scores, + hospitals=hospitals) def run(data, data_struct, goal, variables_to_include, variables_to_exclude, train_test_split_method, model_class, @@ -282,10 +363,12 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, model.goal = goal data, data_struct = preprocess(data, data_struct) - x, y, data, hospital, records = prepare_for_learning(data, data_struct, + x, y, data, hospital,\ + records, days_until_death = prepare_for_learning(data, data_struct, variables_to_include, variables_to_exclude, goal) + if goal[0] == 'survival': save_class_dist_per_hospital(save_path, y['event_mortality'], hospital[y.index]) else: @@ -297,32 +380,53 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, model.save_path = '{}_n{}_y{}'.format(save_path, y.size, y.sum()) model.save_prediction = save_prediction - if train_test_split_method == 'loho': # Leave-one-hospital-out - repetitions = hospital.unique().size + unique_hospitals = np.sort(hospital.unique()) + repetitions = unique_hospitals.size else: # Random Subsampling repetitions = 100 scores = [] + shap_list = list() + test_list = list() for rep in range(repetitions): - clf, datasets, test_y_hat = train_and_predict(x, y, model, rep, - type=train_test_split_method, - type_col=hospital) + clf, datasets, test_y_hat,shap_values,test_set = train_and_predict(x, y, model, rep, + splittype=train_test_split_method, + hospitals=hospital, + unique_hospitals=unique_hospitals, + days_until_death=days_until_death) + shap_list.append(shap_values) + test_list.append(test_set) + score = score_prediction(model, clf, datasets, test_y_hat, rep) scores.append(score) model.hospital = hospital[y.index] - evaluate_model(model, clf, datasets, scores) - + evaluate_model(model, clf, datasets, scores, + hospitals=unique_hospitals) + + shap_values = shap_list[0] + for val in shap_list: + shap_values = np.concatenate((shap_values,val),axis=0) + + test_set = test_list[0] + for val in shap_list: + 
test_set = np.concatenate((test_set,val),axis=0) + test_set = pd.DataFrame(test_set,columns=x.columns) + f = plt.figure() + shap.summary_plot(shap_values, test_set) + f.savefig(os.path.join( model.save_path ,"summary_plot1.png"), bbox_inches='tight', dpi=600) + f = plt.figure() + shap.summary_plot(shap_values, test_set, plot_type="bar") + f.savefig(os.path.join( model.save_path ,"summary_plot2.png"), bbox_inches='tight', dpi=600) if not save_figures: plt.show() print('\n', flush=True) - if __name__ == "__main__": ##### START PARAMETERS ##### @@ -355,10 +459,13 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, # NOTE: See get_feature_set.py for preset selections feature_opts = { 'pm': get_1_premorbid(), - 'cp': get_2_clinical_presentation(), - 'lab': get_3_laboratory_radiology_findings(), - 'pmcp': get_4_premorbid_clinical_representation(), - 'all': get_5_premorbid_clin_rep_lab_rad() + # 'cp': get_2_clinical_presentation(), + # 'lab': get_3_laboratory_radiology_findings(), + # 'pmcp': get_4_premorbid_clinical_representation(), + # 'all': get_5_premorbid_clin_rep_lab_rad(), + # 'k10': ['Blood_Urea_Nitrogen_value_1', 'LDH', 'PH_value_1', 'age_yrs', 'ccd', 'fio2_1', 'hypertension', 'irregular', 'rtr', 'uses_n_medicine'] # XGB + # 'k10': ['Blood_Urea_Nitrogen_value_1', 'LDH', 'PH_value_1', 'age_yrs', 'ccd', 'fio2_1', 'hypertension', 'irregular', 'rtr', 'uses_n_medicine'] # LOGREG + # 'paper': ['LDH', 'Lymphocyte_1_1', 'crp_1_1'] } # Options: @@ -368,11 +475,17 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, # Add all 'Field Variable Name' from data_struct to # EXCLUDE variables from analysis - variables_to_exclude = ['microbiology_worker'] + variables_to_exclude = [ + # 'Smoking', + # 'alcohol' + # 'oxygen_saturation_on', + # 'oxygen_saturation' + ] # Options: # see .\Classifiers - model = LogReg # NOTE: do not initialize model here, + model = XGB + # model = LogReg # NOTE: do not initialize model here, # but 
supply the class (i.e. omit # the parentheses) @@ -387,6 +500,7 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, for train_test_split_method in cv_opts: for feat_name, features in feature_opts.items(): + print(feat_name) vars_to_include = { 'Form Collection Name': [], # groups 'Form Name': [], # variable subgroups @@ -401,3 +515,7 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, save_prediction=save_prediction) plt.show() + + + + diff --git a/covid19_ICU_util.py b/covid19_ICU_util.py index 0fa6e93..5dcd978 100644 --- a/covid19_ICU_util.py +++ b/covid19_ICU_util.py @@ -1,3 +1,5 @@ +import time + import pandas as pd import numpy as np import matplotlib.pyplot as plt @@ -28,10 +30,29 @@ 'CRP', 'Albumin', 'CKin', 'LDH_daily', 'Chest_X_Ray', 'CTperf', 'd_dimer_yes_no'] +TIME_FNS = True + format_dt = lambda col: pd.to_datetime(col, format='%d-%m-%Y', errors='coerce').astype('datetime64[ns]') is_in_columns = lambda var_list, data: [v for v in var_list if v in data.columns] +# Function timer decorator +def timeit(method): + def timed(*args, **kw): + if not TIME_FNS: + return method(*args, **kw) + ts = time.time() + result = method(*args, **kw) + te = time.time() + if 'log_time' in kw: + name = kw.get('log_name', 'method.__name__'.upper()) + kw['log_time'][name] = int((te - ts)*1000) + else: + print('TIM: {:s}: {:2.2f}ms'.format(method.__name__, + (te-ts)*1000)) + return result + return timed +@timeit def get_all_field_information(path_to_creds): study_struct, reports_struct, \ optiongroups_struct = import_study_report_structure(path_to_creds) @@ -52,6 +73,7 @@ def get_all_field_information(path_to_creds): return data_struct +@timeit def count_occurrences(col, record_ids, reset_count=False, start_count_at=1): # Make counter of occurences in binary column # NOTE: Make sure to supply sorted array @@ -81,6 +103,7 @@ def count_occurrences(col, record_ids, reset_count=False, start_count_at=1): new_col = 
pd.concat(new_cols, axis=0) return new_col +@timeit def calculate_outcomes(data, data_struct): # CATEGORY EXPLANATION # Discharged alive: 1) Discharged to home @@ -199,6 +222,7 @@ def calculate_outcomes(data, data_struct): used_columns = [col for col in data.columns if 'Outcome' in col] # Keep track of var return df_outcomes, used_columns +@timeit def select_x_y(data, outcomes, used_columns, goal, remove_not_outcome=True): x = data.drop(used_columns, axis=1) @@ -211,12 +235,14 @@ def select_x_y(data, outcomes, used_columns, # TEMP select (Non-ICU patients) # ICU pts: - # was_icu = outcomes.iloc[:, [3, 6, 10, 12, 14]].any(axis=1) - # y[was_icu] = None # ONLY INCLUDE NON ICU PATIENTS + was_icu = outcomes.iloc[:, [3, 6, 10, 12, 14]].any(axis=1) + # was_icu.to_excel('was_icu.xlsx') + # y[was_icu] = None # ONLY INCLUDE NON-ICU PATIENTS (Inverted because set to None) # y[~was_icu] = None # ONLY INCLUDE ICU PATIENTS - + # x['was_icu'] = was_icu return x, y, outcomes_dict +@timeit def get_classification_outcomes(data, outcomes): y_dict = {} @@ -233,15 +259,17 @@ def get_classification_outcomes(data, outcomes): # Alive (=0): Discharged at t <= 21 | # Alive at t=21 y = pd.Series(None, index=data.index) - y.loc[outcomes.loc[:, 'Dood - totaal']==1] = 1 + y.loc[outcomes.loc[:, 'Dood - totaal']==1] = 1 # TODO: with days_until_death <=21d y.loc[outcomes.loc[:, ['Levend ontslagen en niet heropgenomen - totaal', 'Levend dag 21 maar nog in het ziekenhuis - totaal']].any(axis=1)] = 0 y_dict['mortality_with_outcome'] = y return y_dict +@timeit def get_survival_analysis_outcomes(data, outcomes): y_dict = {} + data = data.copy() # 1) Event (=1): ICU admission # Duration (event=1): days until first ICU admission since hospital admission @@ -265,11 +293,29 @@ def get_survival_analysis_outcomes(data, outcomes): # TODO: Think about same exclusion at classification[y2] outcome_name = 'Patient has died' + # start with everyone alive at t=0 event = pd.Series(0, index=data.index) - duration = 
outcomes.loc[:, 'Days until death'] + + # mark all patients with a time to death as death + duration = outcomes.loc[:, 'Days until death'].copy() event.loc[duration.notna()] = 1 + + max_duration = np.nanmax(duration) + + # mark patients that died, but do not have a death date as alive until their last followup duration.loc[duration.isna()] = data.loc[duration.isna(), 'days_since_admission_current_hosp'] + # # now add the people that are ALIVE DISCHARGED - set as no event until t=max_duration + duration.loc[data['Levend ontslagen en niet heropgenomen - totaal']] = max_duration # or 42 days # discharged alive - ALL 3 + event.loc[data['Levend ontslagen en niet heropgenomen - totaal']] = 0 # censor alive at t=max_duration days + + # # now add the people that are UNKNOWN (= not death and not discharged) + duration.loc[data['Onbekend (alle patiënten zonder outcome)']] = data['days_since_admission_current_hosp'][data['Onbekend (alle patiënten zonder outcome)']] # Onbekend (alle patiënten zonder outcome) == UNKNOWN outcome + event.loc[data['Onbekend (alle patiënten zonder outcome)']] = 0 # censor at time of event. + + # remaining data: does not have any outcome or follow-up: we only now these patients are alive at t=0. + duration[duration.isna()] = 0.0 # duration unknown because date is not filled in? set to 0.0 + y = pd.concat([event, duration], axis=1) y.columns = ['event_mortality', 'duration_mortality'] y_dict['mortality_all'] = y @@ -304,6 +350,7 @@ def get_survival_analysis_outcomes(data, outcomes): axis=1) return y_dict +@timeit def fix_single_errors(data): ''' Replaces values on the global dataframe, per column or per column and record Id. @@ -334,8 +381,23 @@ def fix_single_errors(data): data.loc[data['Record Id']==record_id, column] \ = data.loc[data['Record Id']==record_id, column] \ .replace(pair[0], pair[1]) + + # SaO2 and SaO2_1 are incidentally reported as fraction rather than as + # a percentage; fix this by multiplication. 
0 is impossible -> replace by nan + sao2_invalid = data['SaO2'].astype(float) < 1.0 + data.loc[sao2_invalid,'SaO2'] = (100. * data.loc[sao2_invalid,'SaO2'].astype(float)).astype(str) + + sao2_1_invalid = data['SaO2_1'].astype(float) < 1.0 + data.loc[sao2_1_invalid,'SaO2_1'] = (100. * data.loc[sao2_1_invalid,'SaO2_1'].astype(float)).astype(str) + + sao2_zero = data['SaO2'].astype(float) == 0. + data.loc[sao2_zero,'SaO2'] = np.nan + + sao2_1_zero = data['SaO2_1'].astype(float) == 0. + data.loc[sao2_1_zero,'SaO2_1'] = np.nan return data +@timeit def transform_binary_features(data, data_struct): ''' ''' @@ -343,7 +405,6 @@ def transform_binary_features(data, data_struct): dict_yes_no = {0:0, 1:1, 2:0, 3:value_na, 9:value_na, 9999: value_na} dict_yp = {0:0, 1:1, 2:.5, 3:0, 4:value_na} # [1, 2, 3, 4 ] --> [1, .5, 0, -1] dict_yu = {0:0, 1:1, 9999:value_na} - dict_smoke = {0:0, 1:1, 2:0, 3:.5, 4:value_na} # [Yes, no, stopped_smoking] --> [1, 0, .5] # Some fixed for erronuous field types data_struct.loc[data_struct['Field Variable Name']=='MH_HF', 'Field Type'] = 'radio' @@ -377,9 +438,9 @@ def transform_binary_features(data, data_struct): data.loc[:, 'Bacteria'] = data.loc[:, 'Bacteria'].fillna(3) \ .astype(int) \ .apply(lambda x: dict_yes_no.get(x)) - data.loc[:, 'Smoking'] = data.loc[:, 'Smoking'].fillna(4) \ - .astype(int) \ - .apply(lambda x: dict_smoke.get(x)) + # data.loc[:, 'Smoking'] = data.loc[:, 'Smoking'].fillna(4) \ + # .astype(int) \ + # .apply(lambda x: dict_smoke.get(x)) data.loc[:, 'CT_thorax_performed'] = data.loc[:, 'CT_thorax_performed'].fillna(3) \ .astype(int) \ .apply(lambda x: {0:0, 1:0, 2:1, 3:0}.get(x)) @@ -407,6 +468,7 @@ def transform_binary_features(data, data_struct): return data, data_struct +@timeit def transform_categorical_features(data, data_struct): ''' Create dummyvariables for category variables, removes empty variables and attaches column names @@ -418,35 +480,41 @@ def transform_categorical_features(data, data_struct): # Extract 
variables that can contain multiple answers OR need to be # dummified to be used in a later stage - vars_to_dummy = ['oxygen_saturation_on', 'dept', 'Outcome'] + vars_to_dummy = ['dept', 'Outcome'] # 'oxygen_saturation_on' - is_one_hot_encoded = data_struct['Field Type'].isin(['checkbox']) | \ - data_struct['Field Variable Name'].isin(vars_to_dummy) - data_struct.loc[is_one_hot_encoded, 'Field Type'] = 'category_one_not_encoded' + is_one_hot_encoded = data_struct['Field Type'].isin(['checkbox', 'category']) | \ + data_struct['Field Variable Name'].isin(vars_to_dummy) + data_struct.loc[is_one_hot_encoded, 'Field Type'] = 'category_one_hot_encoded' cat_struct_ohe = data_struct.loc[is_one_hot_encoded, - ['Field Variable Name', 'Option Name', - 'Option Value']] + ['Field Variable Name', 'Option Name', + 'Option Value']] category_columns_ohe = is_in_columns(cat_struct_ohe['Field Variable Name'], data) + # Exclude some vars + category_columns_ohe = [c for c in category_columns_ohe if 'PaO2_sample_type' not in c] + get_name = lambda c, v: '{:s}_cat_{:s}'.format(col, str(v)) # Dummify variables dummies_list = [] for col in category_columns_ohe: + # Get all unique categories in the column unique_categories = pd.unique([cat for value in data[col].values for cat in str(value).split(';')]) - unique_categories = [cat for cat in unique_categories - if cat.lower() not in ['nan', 'none']] + # unique_categories = [cat for cat in unique_categories + # if cat.lower() not in ['nan', 'none']] + if not any(unique_categories): continue # TODO: Make column names to actual name instead of numeric answer - dummy_column_names = [get_name(col, v) for v in unique_categories - if v.lower() not in ['nan', 'none']] + dummy_column_names = [get_name(col, v) for v in unique_categories] + # if v.lower() not in ['nan', 'none']] # Create new dataframe with the dummies dummies = pd.DataFrame(0, index=data.index, columns=dummy_column_names) + # Insert the data for cat in unique_categories: # TODO: Filter 
specific categories that are nan/na/none/unknown @@ -455,8 +523,15 @@ def transform_categorical_features(data, data_struct): data[col] = data[col].fillna('') regex_str = '(?:;|^){}(?:;|$)'.format(cat) - dummies.loc[data[col].str.contains(regex_str, regex=True), - get_name(col, cat)] = 1 + has_cat = data[col].str.contains(regex_str, regex=True) + if has_cat.sum() > 1: + dummies.loc[has_cat, get_name(col, cat)] = 1 + + nan_cols = [c for c in dummies.columns if '_nan' in c or '_None' in c] + missing_col = dummies.loc[:, nan_cols].max(axis=1) + dummies = dummies.drop(nan_cols, axis=1) + dummies = pd.concat([dummies, missing_col], axis=1) + dummies.columns = dummies.columns[:-1].to_list() + [col+'_cat_missing'] dummies_list += [dummies] @@ -471,9 +546,12 @@ def transform_categorical_features(data, data_struct): return data, data_struct +@timeit def transform_numeric_features(data, data_struct): # Calculates all variables to the same unit, # according to a handmade mapping in unit_lookup.py + data = data.copy() + unit_dict, var_numeric = get_unit_lookup_dict() numeric_columns = is_in_columns(var_numeric.keys(), data) @@ -500,8 +578,33 @@ def transform_numeric_features(data, data_struct): data = data.drop(is_in_columns(unit_dict.keys(), data), axis=1) + # PaO2_1 and PaO2 are measured in 3 ways + # as described in PaO2_sample\_type_1 and PaO2_sample_type + # divide these variables in four dummies. 
+ for p in ['', '_1']: + options = data_struct.loc[ + data_struct['Field Variable Name']==('PaO2_sample_type'+p), + ['Option Name', 'Option Value']].iloc[0, :] + + col_dict = dict(zip(options['Option Value'], options['Option Name'])) + # Non-Strings are missing values + colnames = ['PaO2'+p+'_'+v for v in col_dict.values() if type(v)==str] + df = pd.DataFrame(0, columns=colnames, index=data.index) + + for value, name in col_dict.items(): + if str(name) == 'nan': + # occurs only once, so better drop it + continue + colname = 'PaO2'+p+'_'+str(name) + is_measure_type = data['PaO2_sample_type'+p] == value + df.loc[is_measure_type, colname] = data.loc[is_measure_type, 'PaO2'+p] + df.loc[data['PaO2'+p].isna(), :] = None + + data = data.drop(columns=['PaO2'+p, 'PaO2_sample_type'+p]) + data = pd.concat([data, df], axis=1) return data, data_struct +@timeit def transform_time_features(data, data_struct): ''' TODO: Check difference hosp_admission and Outcome_dt @@ -594,6 +697,7 @@ def transform_time_features(data, data_struct): return data, data_struct +@timeit def transform_string_features(data, data_struct): # TODO: Why it med_specify not in data_struct? 
@@ -615,6 +719,7 @@ def transform_string_features(data, data_struct): index=data_struct.columns), ignore_index=True) return data, data_struct +@timeit def transform_calculated_features(data, data_struct): struct_calc = data_struct.loc[data_struct['Field Type']=='calculation', :] @@ -625,6 +730,7 @@ def transform_calculated_features(data, data_struct): data = data.drop(cols_to_drop, axis=1) return data, data_struct +@timeit def select_data(data, data_struct): cols_to_keep = [col for col in data.columns if col not in IS_MEASURED_COLUMNS] @@ -640,39 +746,46 @@ def select_data(data, data_struct): return data, data_struct -def select_variables(data, data_struct, variables_to_include_dict): +@timeit +def select_variables(data, data_struct, variables_to_include_dict, + variables_to_exclude): # Get all variables variables_to_include = [] for k, v in variables_to_include_dict.items(): + v = [i for i in v if i not in variables_to_exclude] if k == 'Field Variable Name': - variables_to_include += variables_to_include_dict[k] + variables_to_include += v else: variables_to_include += data_struct.loc[data_struct[k].isin(v), 'Field Variable Name'] \ .to_list() # Retrieve the corresponding categorical 1-hot encoded column names - category_vars = data_struct.loc[data_struct['Field Type'] == 'category', + category_vars = data_struct.loc[(data_struct['Field Type'] == 'category') | + (data_struct['Field Type'] == 'category_one_hot_encoded'), 'Field Variable Name'].to_list() variables_to_include += [c for var in variables_to_include - for c in data.columns - if (var in category_vars) and (var in c)] + for c in data.columns + if ((var in category_vars)\ + and (var not in variables_to_exclude))\ + and (var in c)] variables_to_include += ['Record Id'] variables_to_include = list(np.unique(is_in_columns(variables_to_include, data))) return data.loc[:, variables_to_include] +@timeit def impute_missing_values(data, data_struct): - ''' + ''' NOTE: DEPRECATED in upcoming versions NOTE: Only 
impute values that not leak data. - i.e. only use single field or + i.e. only use single field or record based imputations. ''' missing_class = -1 # Categorical --> Add a missing class vars_categorical = is_in_columns(data_struct.loc[data_struct['Field Type']=='category', 'Field Variable Name'].to_list(), data) - + data.loc[:, vars_categorical] = data.loc[:, vars_categorical].fillna(missing_class) ## MODE should be moved to Pipeline in Classifier to prevent data leakage @@ -692,6 +805,7 @@ def impute_missing_values(data, data_struct): return data +@timeit def plot_feature_importance(importances, features, show_n_features=5): show_n_features = features.shape[0] if not show_n_features else show_n_features @@ -703,7 +817,8 @@ def plot_feature_importance(importances, features, show_n_features=5): return fig, ax -def save_class_dist_per_hospital(path, y, hospital): +@timeit +def save_class_dist_per_hospital(path, y, hospital): with open(path + '_class_dist_per_hosp.txt', 'w') as f: f.write('{}\t{}\t{}\n'.format('hospital', '0', '1')) for h in hospital.unique(): @@ -712,31 +827,28 @@ def save_class_dist_per_hospital(path, y, hospital): f.write(line) print('LOG: Written class distribution per hospital to file.') - def explore_data(x, y): - data = pd.concat([x, y], axis=1) - corr = data.corr(method='spearman') - plt.matshow(corr) + # VIF + from statsmodels.stats.outliers_influence import variance_inflation_factor + from statsmodels.tools.tools import add_constant + x = x.fillna(0).astype(float) + # x = add_constant(x) + vif = pd.Series([variance_inflation_factor(x.values, i) + for i in range(x.shape[1])], index=x.columns)\ + .sort_values(ascending=True) -# beademd geweest op IC -# outcome_ventilation_any = data['patient_interventions_cat_1'] == 1.0 \ -# | data['patient_interventions_cat_2'] == 1.0 \ -# | data['Invasive_ventilation_1'] == 1.0 -# outcome_ventilation_daily = data['patient_interventions_cat_1'] == 1.0 | \ -# data['patient_interventions_cat_2'] == 1.0 + fig, ax 
= plt.subplots() + ax.set_title('Variance inflation factor') + vif.nlargest(25).plot(kind='barh') + fig.tight_layout() -# Orgaanfalen lever, nier -# outcome_organfailure_any = data['patient_interventions_cat_3'] == 1.0 | \ -# data['patient_interventions_cat_5'] == 1.0 | \ -# data['Extracorporeal_support_1'] == 1.0 | \ -# data['Liver_dysfunction_1_1'] == 1.0 | \ -# data['INR_1_1'].astype('float') > 1.5 | \ -# data['Acute_renal_injury_Acute_renal_failure_1_1'] == 1.0 + # CORRELATION + # xc = x.corr('spearman') # 'spearmon' + # fig, ax = plt.subplots() + # ax.set_title('Correlation matrix') + # ax.matshow(xc) + return -# has_not_died = outcomes.loc[:, ['Levend ontslagen en niet heropgenomen - totaal', -# 'Levend dag 21 maar nog in het ziekenhuis - totaal']].any(axis=1) -# y_duration.loc[has_not_died] = data.loc[has_not_died, 'days_since_admission_current_hosp'] -# y_event.loc[has_not_died] = 0 \ No newline at end of file diff --git a/data_query_scripts/ALL_castor2excel_variables.py b/data_query_scripts/ALL_castor2excel_variables.py index 1b52c09..2377387 100755 --- a/data_query_scripts/ALL_castor2excel_variables.py +++ b/data_query_scripts/ALL_castor2excel_variables.py @@ -9,59 +9,117 @@ @author: wouterpotters """ import os -import site import configparser import pandas as pd -site.addsitedir('./../') # add directory to path to enable local imports -from covid19_import import import_study_report_structure # noqa: E402 - -config = configparser.ConfigParser() -config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) - -# the excel file with all variables and answer options is stored here -target_excel = config['exportresults']['excel_file_variables'] - -# # Get all data from Castor database (without any selection criterium) -# Note that you need export rights for every individual center. 
-study_struct, reports_struct, optiongroups_struct = \ - import_study_report_structure( - config['CastorCredentials']['local_private_path']) - -# ## Export all variables to excel -writer = pd.ExcelWriter(target_excel, engine='xlsxwriter') - -readme = 'This excel sheet contains an overview of the variables that are ' +\ - 'used in the Castor EDC database for the COVID 19 project. \nThere are' +\ - ' three tabs; \n(1) Admission variables; to be entered once and ' +\ - 'updated incidentally. \n(2) Daily reports are created once per day.' -readme = pd.DataFrame([x for x in readme.split('\n')]) - -# Write each dataframe to a different worksheet. -readme.to_excel(writer, sheet_name='README', index=False) -optiongroups_struct.to_excel(writer, sheet_name='OptionGroups', index=False) - -# added answeroptions to the excel file -answeroptions = pd.pivot_table(optiongroups_struct, - index='Option Group Id', - values=['Option Name', 'Option Value'], - aggfunc=lambda x: list(x)) -answeroptions.rename(columns={'Option Name': 'Field Options Names', - 'Option Value': 'Field Options Values'}) -study_struct_withoptions = pd.merge(study_struct, - answeroptions, how='left', - left_on='Field Option Group', - right_on='Option Group Id') -study_struct_withoptions.to_excel(writer, sheet_name='AdmissionVariables', - index=False) - -reports_struct_withoptions = pd.merge(reports_struct, - answeroptions, - how='left', - left_on='Field Option Group', - right_on='Option Group Id') -reports_struct_withoptions.to_excel(writer, - sheet_name='DailyUpdateVariables', - index=False) - -writer.save() # save excel file -print('excel file was saved to : ' + target_excel) +import castorapi as ca + + +def import_study_report_structure(c): + # STEP 0: collect answer options from optiongroups + # get answer option groups + optiongroups_struct = c.request_study_export_optiongroups() + + # STEP 1: collect data from study + # get the main study structure (i.e. 
questions) + structure = c.request_study_export_structure() + + # sort on form collection order and field order + # (this matches how data is filled in castor) + structure_filtered = structure \ + .sort_values(['Form Collection Name', 'Form Collection Order', + 'Form Order', 'Field Order']) + + # filter variables that have no Field Variable name; these field do not + # record data + structure_filtered[~structure_filtered['Field Variable Name'].isna()] + + # select only study variables + study_structure = structure_filtered[ + structure_filtered['Form Type'].isin(['Study'])] + + # select only report variables (may contain duplicates) + reports_structure = structure_filtered[ + structure_filtered['Form Type'].isin(['Report'])] + + return study_structure, reports_structure, optiongroups_struct + + +if __name__ == "__main__": + config = configparser.ConfigParser() + config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) + + path_to_api_creds = config['CastorCredentials']['local_private_path'] + + # input: private folder where client & secret files (no extension, + # 1 string only per file) from castor are saved by the user + # see also: + # https://helpdesk.castoredc.com/article/124-application-programming-interface-api + c = ca.CastorApi(path_to_api_creds) # e.g. in user dir outside of GIT repo + + # get study ID for COVID study + if False: + name = 'COVID-19 NL' + excel_postfix = '' + else: + name = 'Clinical features of COVID-19 positive patients in VieCuri' + excel_postfix = '_viecurie.xlsx' + + study_id = c.select_study_by_name(name) + + study_name = c.request_study(study_id=study_id)['name'] + + # # Get all data from Castor database (without any selection criterium) + # Note that you need export rights for every individual center. 
+ study_structure, reports_structure, optiongroups_struct = \ + import_study_report_structure(c) + + # STEP 2: convert to excel file + # the excel file with all variables and answer options is stored here + target_excel = config['exportresults']['excel_file_variables'] + excel_postfix + + # ## Export all variables to excel + writer = pd.ExcelWriter(target_excel, engine='xlsxwriter') + + readme = 'This excel sheet contains an overview of the variables that are ' +\ + 'used in the Castor EDC database for the study: ' + study_name +\ + 'project. \nThere are multiple tabs; \n(1) Study variables; ' +\ + 'to be entered once (and updated incidentally). ' +\ + '\n(2, 3 ..., n) Reports tabs could be created more often.' + readme = pd.DataFrame([x for x in readme.split('\n')]) + + # Write each dataframe to a different worksheet. + readme.to_excel(writer, sheet_name='README', index=False) + optiongroups_struct.to_excel(writer, sheet_name='OptionGroups', index=False) + + # added answeroptions to the excel file + answeroptions = pd.pivot_table(optiongroups_struct, + index='Option Group Id', + values=['Option Name', 'Option Value'], + aggfunc=lambda x: list(x)) + answeroptions.rename(columns={'Option Name': 'Field Options Names', + 'Option Value': 'Field Options Values'}) + + study_struct_withoptions = pd.merge(study_structure, + answeroptions, how='left', + left_on='Field Option Group', + right_on='Option Group Id') + study_struct_withoptions.to_excel(writer, sheet_name='Study', + index=False) + + reports_struct_withoptions = pd.merge(reports_structure, + answeroptions, + how='left', + left_on='Field Option Group', + right_on='Option Group Id') + + for report_name in reports_structure['Form Collection Name'].unique(): + selection = reports_struct_withoptions[ + reports_struct_withoptions['Form Collection Name'] == report_name] + report_name = 'Report' + report_name + for r in ':*?/[]\\': + report_name = report_name.replace(r, '_') + + selection.to_excel(writer, 
sheet_name=report_name[:31], index=False) + + writer.save() # save excel file + print('\n excel file for study ' + study_name + ' was saved to : ' + target_excel) diff --git a/data_query_scripts/ALL_completeness_category.py b/data_query_scripts/ALL_completeness_category.py new file mode 100644 index 0000000..923b7a4 --- /dev/null +++ b/data_query_scripts/ALL_completeness_category.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Apr 3 12:46:07 2020 + +@author: wouterpotters +""" +import os +import numpy as np +import pickle +import configparser +import castorapi as ca + +config = configparser.ConfigParser() +config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) + +c = ca.CastorApi(config['CastorCredentials']['local_private_path']) +study_id = c.select_study_by_name(config['CastorCredentials']['study_name']) + +# select AMC + VUmc +institutes = c.request_institutes() +study_info = c.request_study_export_structure() + +try: + path_save = os.path.join(config['CastorCredentials']['local_private_path'], 'saveddata.pkl') + with open(path_save, 'rb') as f: + df_study, df_structure, df_report, df_report_structure, \ + df_optiongroup_structure = pickle.load(f) +except Exception as ex: + print('error'+str(ex)) + +# %% per hospital % +variables = df_structure[df_structure['Form Collection Name'] == 'INTERNAL MED/ GERIATRICS']['Field Variable Name'].reset_index(drop=True) +varcount = len(variables) + +for i in institutes: + inst = i['name'] + sel = df_study['hospital'] == inst + if sum(sel) > 0: + print('{}:\t{} records, records with any INTERNAL MED/ GERIATRICS data {}, {}% complete' + .format(inst, sum(sel), + sum(np.sum(~df_study[sel][variables].isna(),axis=1) > 0), + round(100-100*np.sum(np.sum(df_study[sel][variables].isna()))/(sum(sel)*varcount),0))) + else: + if False: + print('{}:\t{} entries'.format(inst,sum(sel))) + +# %% per hospital +# cfs_yesno CFS evaluated during admission? 
+ +# %% per hospital % +variables = df_structure[df_structure['Form Collection Name'] == 'INTERNAL MED/ GERIATRICS']['Field Variable Name'].reset_index(drop=True) +varcount = len(variables) + +variables = ['cfs_yesno'] +varcount = 1 + +subgroep_70 = df_study['age_yrs'] >= 70 + +for i in institutes: + inst = i['name'] + sel = np.logical_and(df_study['hospital'] == inst, subgroep_70) + if sum(sel) > 0: + if False: + print('{}:\t{} records, records with any INTERNAL MED/ GERIATRICS data {}, {}% complete' + .format(inst, sum(sel), + sum(np.sum(df_study[sel][variables] == '1',axis=1) > 0), + round(100-100*np.sum(np.sum(df_study[sel][variables] == '1'))/(sum(sel)*varcount),0))) + else: + print('{}:\t{}/{} records' + .format(inst, + sum(np.sum(df_study[sel][variables] == '1',axis=1) > 0), + sum(sel))) + else: + if False: + print('{}:\t{} entries'.format(inst,sum(sel))) \ No newline at end of file diff --git a/data_query_scripts/ALL_convert_to_age_in_years.py b/data_query_scripts/ALL_convert_to_age_in_years.py index 541fcc1..f0e62df 100644 --- a/data_query_scripts/ALL_convert_to_age_in_years.py +++ b/data_query_scripts/ALL_convert_to_age_in_years.py @@ -13,7 +13,7 @@ import datetime import castorapi as ca -LIVE = True # be very carefull. This updates a huge amount of data!!! +LIVE = False # be very carefull. This updates a huge amount of data!!! 
config = configparser.ConfigParser() config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) diff --git a/data_query_scripts/survival_death.py b/data_query_scripts/survival_death.py index ed5d8aa..3c6e26d 100755 --- a/data_query_scripts/survival_death.py +++ b/data_query_scripts/survival_death.py @@ -20,6 +20,9 @@ from lifelines import * import pandas as pd import numpy as np +from sklearn.experimental import enable_iterative_imputer # Required to enable experimental iterative imputer +from sklearn.impute import SimpleImputer, IterativeImputer +from sklearn.preprocessing import StandardScaler, RobustScaler plt.rcParams["font.family"] = "Times New Roman" matplotlib.rcParams.update({'font.size': 14}) @@ -38,16 +41,52 @@ 'oxygen_saturation':'Saturatie (%)','Temperature':'Temperatuur (ºC)','Antiviral_agent_1':'Behandeling met antivirale middelen (ja)','Oxygen_therapy_1':'zuurstof therapie (ja)','Non_invasive_ventilation_1':'non-invasieve beademing (ja)','Invasive_ventilation_1':'Invasieve beademing (ja)','resucitate':'Niet reanimeren (ja)', 'intubate_yesno':'Niet intuberen (ja)','CT_thorax_performed':'CT thorax verricht op SEH (ja)','corads_admission':'CO-RAD score (1-5)','pcr_pos':'Corona virus PCR + op SEH (ja)','Coagulation_disorder_1_1':'Stollingsstoornis (ja)', 'Acute_renal_injury_Acute_renal_failure_1_1':'Nierfunctie stoornis waarvoor dialyse (ja)','diabetes_complications':'Diabetes met complicaties (ja)','cpd':'chronische longziekte (ja)','Smoking':'Actieve roker (ja)','obesity':'Obesitas (ja)','immune_sup':'immunosupressiva (ja)','ace_i_spec_1':'Gebruik ACE remmer (ja)', - 'sys_bp':'Systolische bloeddruk (mmHg)','dias_bp':'Diastolische bloeddruk (mmHg)','HtR':'hart frequentie (1/min)','capillary_refill':'Verlengde capillaire refill (ja)','rtr':'Ademhalingsfrequentie (1/min)','oxygen_saturation':'Zuurstof saturatie (%)','SaO2_1':'SaO2 (%)','PaO2_1':'PaO2 (kPa)','PCO2_1':'pCO2 (kPa)', - 'fio2_1':'FiO2 (%)','Temperature':'Temperatuur 
(ºC)','eye':'EMV','Antiviral_agent_1':'Anti-virale middelen (ja)','Antibiotic_1':'Antibiotica (ja)','Corticosteroid_1':'Corticosteroiden voor ARDS (ja)','Corticosteroid_type_1':'gebruik corticosteroiden (ja)','Oxygen_therapy_1':'Zuurstof behandeling (ja)', + 'sys_bp':'Systolische bloeddruk (mmHg)','dias_bp':'Diastolische bloeddruk (mmHg)','HtR':'hart frequentie (1/min)','capillary_refill':'Verlengde capillaire refill (ja)','rtr':'Ademhalingsfrequentie (1/min)','oxygen_saturation':'Zuurstof saturatie (%)','SaO2_1':'SaO2 (%)','PaO2_1_Arterial':'PaO2 arteriëel (kPa)', + 'PaO2_1_Venous':'PaO2 veneus (kPa)', 'PaO2_1_Capillary':'PaO2 capillair (kPa)', 'PaO2_1_nan':'PaO2 onbekend (kPa)', 'PCO2_1':'pCO2 (kPa)','fio2_1':'FiO2 (%)','Temperature':'Temperatuur (ºC)','eye':'EMV','Antiviral_agent_1':'Anti-virale middelen (ja)','Antibiotic_1':'Antibiotica (ja)', + 'Corticosteroid_1':'Corticosteroiden voor ARDS (ja)','Corticosteroid_type_1':'gebruik corticosteroiden (ja)','Oxygen_therapy_1':'Zuurstof behandeling (ja)', 'Non_invasive_ventilation_1':'Niet-invasieve beademing (ja)','Invasive_ventilation_1':'Invasieve beademing (ja)','resucitate':'Niet reanimeren (ja)','intubate_yesno':'Niet intuberen (ja)','auxiliary_breathing_muscles':'Gebruik van extra ademhalingsspieren (ja)','fever':'Koorts gehad sinds eerste klachten (ja)', 'Anosmia':'Anosmie (ja)','Rhinorrhoea':'Rhinorrhoea (ja)','Sore_throat':'Keelpijn (ja)','cough_sputum':'Hoest met sputum (ja)','cough_sputum_haemoptysis':'Hoest met bloederig slijm/ haemoptoë (ja)','Arthralgia':'Arthralgie (ja)','Myalgia':'Spierpijn (ja)','Fatigue_Malaise':'Vermoeidheid/ algehele malaise (ja)', 'Abdominal_pain':'Buikpijn (ja)','Vomiting_Nausea':'Misselijkheid/ overgeven (ja)','Diarrhoea':'diarree (ja)','Dyspnea':'Dyspneu (ja)','Wheezing':'Piepende ademhaling (ja)','Chest_pain':'Pijn op de borst (ja)','ear_pain':'oorpijn (ja)','Bleeding_Haemorrhage':'bloeding (ja)','Headache':'Hoofdpijn (ja)', 'confusion':'Veranderd bewustzijn 
(ja)','Seizures':'Insulten (ja)','infiltrates_2':'Infiltraat op X-thorax (ja)','corads_admission':'CO-RADS (1-5)','Glucose_unit_1_1':'glucose (mmol/L)','Sodium_1_1':'Natrium (mmol/L)','Potassium_1_1':'Kalium (mmol/L)','Blood_Urea_Nitrogen_value_1':'Ureum (mmol/L)', 'Creatinine_value_1':'Creatinine (µmol/L)','Calcium_adm':'Ca (totaal) (mmol/L)','ferritine_admin_spec':'ferritine (mg/L)','creatininekinase':'CK (U/L)','d_dimer':'D-Dimeer (nmol/L)','AST_SGOT_1_1':'ASAT (U/L)','ALT_SGPT_1_1':'ALAT (U/L)','Total_Bilirubin_2_1':'totaal Bilirubine (IE)','LDH':'LDH (U/L)', 'PH_value_1':'Zuurgraad (pH)','Lactate_2_1':'Lactaat (mmol/L)','crp_1_1':'CRP (mg/L)','Haemoglobin_value_1':'Hb (mmol/L)','Platelets_value_1':'trombocyten (x10^9/L)','WBC_2_1':'Leukocyten (x10^3/µL)','Lymphocyte_1_1':'Lymfocyten (x10^9/L)','Neutrophil_unit_1':'Neutrofielen (x10^9/L)','INR_1_1':'INR', - 'pt_spec':'PT (sec)','fibrinogen_admin':'Fibrinogeen (g/L)','pcr_pos':'Corona PCR + (ja)','Adenovirus':'Adenovirus (ja)','RSV_':'RS virus (ja)','Influenza':'Influenza virus (ja)','Bacteria':'Sputumkweek (ja)','days_since_onset':'Tijd sinds eerste klachten (dagen)','irregular':'Onregelmatige hartslag (ja)', - 'DNR_yesno':' (ja)','healthcare_worker':'Zorgmedewerker (ja)','infec_resp_diagnosis':'Andere infectieuze ademhalingsdiagnose','Blood_albumin_value_1':'Albumine (g/L)','culture':'Bloedkweek (positief)','APT_APTR_1_1':'APTT (sec)'} + 'pt_spec':'PT (sec)','fibrinogen_admin':'Fibrinogeen (g/L)','pcr_pos':'Corona PCR + (ja)','Adenovirus':'Adenovirus (ja)','RSV_':'RS virus (ja)','Influenza':'Influenza virus (ja)','Bacteria':'Sputumkweek (ja)','days_untreated':'Tijd sinds eerste klachten (dagen)','irregular':'Onregelmatige hartslag (ja)', + 'DNR_yesno':' (ja)','healthcare_worker':'Zorgmedewerker (ja)','infec_resp_diagnosis':'Andere infectieuze ademhalingsdiagnose','Blood_albumin_value_1':'Albumine (g/L)','culture':'Bloedkweek (positief)','APT_APTR_1_1':'APTT (sec)','uses_n_medicine':'Aantal thuismedicamenten'} 
+ + +class StandardScaler_min2cols(StandardScaler): + def __init__(self,columns,copy=True, with_mean=False, with_std=True): + self.scaler = StandardScaler(copy, with_mean=with_mean, with_std=with_std) + self.columns = columns + + def fit(self, X, y=None): + self.scaler.fit(X[self.columns], y) + return self + + def transform(self, X, y=None, copy=None): + init_col_order = X.columns + X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) + X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)] + return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order] + +class RobustScaler_min2cols(RobustScaler): + def __init__(self, columns, copy=True, with_centering=True, + with_scaling=True, quantile_range=(25.0, 75.0)): + self.scaler = RobustScaler(with_centering=with_centering, + with_scaling=with_scaling, + quantile_range=quantile_range, + copy=copy) + self.columns = columns + + def fit(self, X, y=None): + self.scaler.fit(X[self.columns], y) + return self + + def transform(self, X, y=None, copy=None): + init_col_order = X.columns + X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) + X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)] + return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order] def simple_estimates(features): @@ -55,7 +94,6 @@ def simple_estimates(features): E = features['Event death'] fig, axes = plt.subplots(1, 1) - kmf = KaplanMeierFitter().fit(T, E, label='KaplanMeierFitter') kmf.plot_survival_function() axes.set_xlim(0, 21) @@ -89,7 +127,7 @@ def KM_age_groups(features): print('ICU no: n=', np.nansum(was_not_on_icu)) icu = False - special = True + special = False fig, axes = plt.subplots(1, 1) if special: @@ -140,34 +178,76 @@ def KM_age_groups(features): plt.savefig('KM_survival_curve_DEATH.png', format='png', dpi=300, figsize=(20,20), pad_inches=0, bbox_inches='tight') plt.show() + return kmf1, kmf2, kmf3 + + +def cox_ph_loho(features, hazard_ratios=False, + 
imputation_method='median', + minimal_n=10, + pvalue=0.05): + cindex = [] + n = [] + for test_hospital in features['hospital'].unique(): + print(test_hospital) + train_set = features[features['hospital'] != test_hospital] + test_set = features[features['hospital'] == test_hospital] + + if len(test_set) >= minimal_n: + test_set = test_set.drop(columns=['hospital']) + + cph, p, imputer, scaler = cox_ph(train_set, hazard_ratios=False, + imputation_method=imputation_method, + return_scaler_imputer=True) + + test_set = pd.DataFrame(imputer.transform(test_set), columns=test_set.columns) + test_set = scaler.transform(test_set) + + hazard_ratios_sum = cph.summary + print(hazard_ratios_sum) + sel_corr = hazard_ratios_sum[hazard_ratios_sum['p'] <= pvalue] + c = cph.score(test_set, scoring_method="concordance_index") + print('test hospital:', test_hospital, 'test c-index', c) + print(sel_corr) + cindex += [c] + n += [len(test_set)] + else: + print('test hospital:', test_hospital, ' has <= ',minimal_n,' records. 
Cannot determine concordance.') + return cindex, n + +def cox_ph(features, hazard_ratios=False, + imputation_method='median', return_scaler_imputer=False, + show_progress=True, + penalizer=0., + l1_ratio=0.): + train_set = features.drop(columns=['hospital']) # NOTE aids_hiv is an outlier for plotting coefs -def cox_ph_concordance_test(features): - test_hospital = 'MUMC' # this can be MUMC, Zuyderland, or AUMC - AMC - train_set = features[features['hospital'] != test_hospital] - test_set = features[features['hospital'] == test_hospital] - - train_set = train_set.drop(columns=['hospital']) - test_set = test_set.drop(columns=['hospital']) - - cph = CoxPHFitter() - cph.fit(train_set, duration_col='Time to event', event_col='Event death', show_progress=True, step_size=0.1) - - print('with and without ICU') - print('test hospital:', test_hospital) - print('test c-index', cph.score(test_set, scoring_method="concordance_index")) + # impute nan with median; note that time to event and event death cannot contain nan + if imputation_method == 'iterative': + imputer = IterativeImputer(missing_values=np.nan, + initial_strategy='median') + else: + imputer = SimpleImputer(missing_values=np.nan, strategy=imputation_method) + train_set = pd.DataFrame(imputer.fit_transform(train_set), columns=train_set.columns) + # scale data using zscore + scaler = RobustScaler_min2cols(columns=train_set.columns) + train_set = scaler.fit_transform(train_set) -def cox_ph(features, hazard_ratios=False): - train_set = features.drop(columns=['hospital']) # NOTE aids_hiv is an outlier for plotting coefs + cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio) + cph.fit(train_set, duration_col='Time to event', event_col='Event death', show_progress=show_progress, step_size=0.1) - cph = CoxPHFitter() - cph.fit(train_set, duration_col='Time to event', event_col='Event death', show_progress=True, step_size=0.1) + if True: + cph.print_summary() - cph.print_summary() + fig, ax = plt.subplots(figsize=(5, 
25)) + p = cph.plot(hazard_ratios=hazard_ratios) + else: + p = None - fig, ax = plt.subplots(figsize=(5, 25)) - p = cph.plot(hazard_ratios=hazard_ratios) - return cph, p + if return_scaler_imputer: + return cph, p, imputer, scaler + else: + return cph, p def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): hazard_ratios_sum = cph.summary @@ -200,11 +280,14 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): variables_to_exclude += ['Coronavirus'] # duplicate of PCR information data, data_struct = load_data(path_creds) data, data_struct = preprocess(data, data_struct) - features, outcomes, data, hospital, record_id = prepare_for_learning(data, data_struct, - variables_to_include, - variables_to_exclude, - goal, - use_imputation=False) + # %% + features, outcomes, data, hospital, record_id, days_until_death = prepare_for_learning(data, data_struct, + variables_to_include, + variables_to_exclude, + goal, + remove_records_threshold_above=1.0, + remove_features_threshold_above=0.5, + pcr_corona_confirmed_only=False) del outcomes # not relevant in this form. Use features and hospital. # %% STEP 2: CALCULATE and CORRECT ALL TIMINGS. @@ -215,6 +298,9 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): times.loc[data['Levend ontslagen en niet heropgenomen - totaal']] = 21. # or 365 # discharged alive - ALL 3 events.loc[data['Levend ontslagen en niet heropgenomen - totaal']] = False # censor at t=21 days + times.loc[data['Levend dag 21 maar nog in het ziekenhuis - totaal']] = 21. + events.loc[data['Levend dag 21 maar nog in het ziekenhuis - totaal']] = False + # now add the people that are UNKNOWN at 21 days. times.loc[data['Onbekend (alle patiënten zonder outcome)']] = data['days_since_admission_current_hosp'][data['Onbekend (alle patiënten zonder outcome)']] # Onbekend (alle patiënten zonder outcome) == UNKNOWN outcome events.loc[data['Onbekend (alle patiënten zonder outcome)']] = False # censor at time of event. 
@@ -223,6 +309,19 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): times.loc[data['Days until death'] > 21.] = 21. events.loc[data['Days until death'] > 21.] = False # censor at time of event. + # fix remaining issues with events + sel = np.logical_and(times.isna(),data['Outcome_cat_4'] == 1) # transfer + events[sel] = np.nan + sel = np.logical_and(times.isna(),data['Outcome_cat_8'] == 1) # death + events[sel] = False # no information on timing present, hence state as alive at t=0 + times[sel] = 0. + sel = np.logical_and(times.isna(),data['Outcome_cat_10'] == 1) # discharged readmitted + events[sel] = np.nan + + # remaining data: does not have any outcome or follow-up: we only now these patients are alive at t=0. + times[times.isna()] = 0.0 + times[events.isna()] = np.nan + features = pd.concat([features, pd.Series(name='Time to event', data=times), @@ -232,11 +331,11 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): data=hospital)], axis=1) - pd.Series(name='Record Id',data=record_id)[[d not in ['VieCuri'] for d in data['hospital']]] + # pd.Series(name='Record Id',data=record_id)[[d not in ['VieCuri'] for d in data['hospital']]] # drop hospitals without enough data - data.drop(index=features.index[[d in ['VieCuri'] for d in features['hospital']]], inplace=True) - features.drop(index=features.index[[d in ['VieCuri'] for d in features['hospital']]], inplace=True) + # data.drop(index=features.index[[d in ['VieCuri'] for d in features['hospital']]], inplace=True) + # features.drop(index=features.index[[d in ['VieCuri'] for d in features['hospital']]], inplace=True) # now drop the people that have no information regarding durations (very new records). # now drop n/a's in time to event; these are incomplete records that cannot be used. 
@@ -246,25 +345,31 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): # translate feature columns features.rename(columns=englishvar_to_dutchnames, inplace=True) - - # CORRECT SaO2 - features['SaO2 (%)'][features['SaO2 (%)']<= 1.0] = features['SaO2 (%)'][features['SaO2 (%)']<= 1.0] * 100. - - # fi_sa_ratio = (features['SaO2 (%)']/100) / features['FiO2 (%)'] - # fi_sa_ratio[fi_sa_ratio == np.inf] = np.nan - # fi_sa_ratio[fi_sa_ratio == 0.0] = np.nan + # do not model data with > 50% missing data + drop_vars = features.columns[np.sum(features.isnull())/len(features) > 0.5] + features = features.drop(columns=drop_vars) # %% STEP 3: SHOW KM CURVE # simple_estimates(features) # events: True = death, False = censor, None = ? - KM_age_groups(features) + kmf = KM_age_groups(features) + + # %% step 5: validate concordance with LOHO + imputation_method = 'iterative' + hazard_ratios = False + cindex, testsetn = cox_ph_loho(features, hazard_ratios=hazard_ratios, + imputation_method=imputation_method, + minimal_n=25) + print('concordances (mean: {}) = {}'.format(round(np.mean(cindex),3), [round(c,3) for c in cindex])) # %% STEP 4: COX REGRESSION. 
# USE HERE: features, hospital, times, events - cph, plot = cox_ph(features, hazard_ratios=False) + cph, plot = cox_ph(features, hazard_ratios=True, + imputation_method=imputation_method) plot.figure.savefig('Cox_coef_all.png', format='png', dpi=300, figsize=(3,23), pad_inches=0, bbox_inches='tight') # %% - cphplt = cox_plot_pvalue_only(cph, 0.05, hazard_ratios=False) + cphplt = cox_plot_pvalue_only(cph, 0.05, hazard_ratios=True) + # run on command line: cphplt.figure.set_figheight(6) - cphplt.figure.savefig('Cox_coef_pvalue_.05.png', format='png', dpi=300, figsize=(7,9), pad_inches=0, bbox_inches='tight') \ No newline at end of file + cphplt.figure.savefig('Cox_coef_pvalue_.05.png', format='png', dpi=300, figsize=(7,9), pad_inches=0, bbox_inches='tight') diff --git a/data_query_scripts/survival_death_km.py b/data_query_scripts/survival_death_km.py new file mode 100644 index 0000000..540bf2d --- /dev/null +++ b/data_query_scripts/survival_death_km.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Apr 29 20:19:18 2020 +Example to run survival model. 
+@author: wouterpotters +""" +import configparser +import matplotlib +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), './../')) +from covid19_ICU_admission import load_data, preprocess, prepare_for_learning +from get_feature_set import get_1_premorbid, \ + get_2_clinical_presentation, \ + get_3_laboratory_radiology_findings, \ + get_5_premorbid_clin_rep_lab_rad + +import matplotlib.pyplot as plt +from lifelines import KaplanMeierFitter +from lifelines.statistics import logrank_test, pairwise_logrank_test +import pandas as pd +import numpy as np +from sklearn.experimental import enable_iterative_imputer # Required to enable experimental iterative imputer +from sklearn.impute import SimpleImputer, IterativeImputer +from sklearn.preprocessing import StandardScaler, RobustScaler + +plt.rcParams["font.family"] = "Times New Roman" +matplotlib.rcParams.update({'font.size': 14}) + +c1 = ( 57/255, 106/255, 177/255) # blue +c2 = (218/255, 124/255, 48/255) # orange +c3 = ( 83/255, 81/255, 84/255) # black +c4 = ( 62/255, 150/255, 81/255) # green +c5 = (204/255, 37/255, 41/255) # red + +englishvar_to_dutchnames = {'age_yrs':'Leeftijd (jaren)','gender_male':'Geslacht (man)','gender':'Geslacht (man)','hypertension':'Hypertensie (ja)','Coagulation_disorder_1_1':'Stollingsstoornis (ja)','autoimm_disorder_1':'Auto-immuunziekte (ja)','rheuma_disorder':'Reumatologische ziekte (ja)','organ_transplant_1':'Orgaan donatie ondergaan (ja)', + 'diabetes_without_complications':'DM (zonder complicaties) (ja)','diabetes_complications':'DM (met complicaties) (ja)','ccd':'Chronische hartziekte (ja)','ckd':'Chronische nierziekte (ja)','cnd':'Chronische neurologische aandoening (ja)','cpd':'chronische longziekte (ja)','asthma':'Asthma (ja)','Dementia':'dementie (ja)', + 'mld':'Milde leveraandoening (ja)','live_disease':'Matig-ernstige leveraandoening (ja)','mneoplasm':'Maligniteit (ja)','chd':'Chronische hematologische ziekte (ja)','alcohol':'Alcohol abusus 
(ja)','Smoking in History or Active':'Roken actief of in verleden (ja)','obesity':'Obesitas (BMI > 30) (ja)', + 'home_medication':'Thuismedicatie (ja)','immune_sup':'Immunosupressiva (ja)','ace_i_spec_1':'Gebruik ACE inhibitor (ja)','Corticosteroid_type_1':'Corticosteroiden gebruik (ja)','sys_bp':'Systolische bloeddruk (mmHg)','dias_bp':'Diastolische bloeddruk (mmHg)','HtR':'hart frequentie (1/min)','rtr':'Ademhalingsfrequentie (1/min)', + 'oxygen_saturation':'Saturatie (%)','Temperature':'Temperatuur (ºC)','Antiviral_agent_1':'Behandeling met antivirale middelen (ja)','Oxygen_therapy_1':'zuurstof therapie (ja)','Non_invasive_ventilation_1':'non-invasieve beademing (ja)','Invasive_ventilation_1':'Invasieve beademing (ja)','resucitate':'Niet reanimeren (ja)', + 'intubate_yesno':'Niet intuberen (ja)','CT_thorax_performed':'CT thorax verricht op SEH (ja)','corads_admission':'CO-RAD score (1-5)','pcr_pos':'Corona virus PCR + op SEH (ja)','Coagulation_disorder_1_1':'Stollingsstoornis (ja)', + 'Acute_renal_injury_Acute_renal_failure_1_1':'Nierfunctie stoornis waarvoor dialyse (ja)','diabetes_complications':'Diabetes met complicaties (ja)','cpd':'chronische longziekte (ja)','Smoking':'Actieve roker (ja)','obesity':'Obesitas (ja)','immune_sup':'immunosupressiva (ja)','ace_i_spec_1':'Gebruik ACE remmer (ja)', + 'sys_bp':'Systolische bloeddruk (mmHg)','dias_bp':'Diastolische bloeddruk (mmHg)','HtR':'hart frequentie (1/min)','capillary_refill':'Verlengde capillaire refill (ja)','rtr':'Ademhalingsfrequentie (1/min)','oxygen_saturation':'Zuurstof saturatie (%)','SaO2_1':'SaO2 (%)','PaO2_1_Arterial':'PaO2 arteriëel (kPa)', + 'PaO2_1_Venous':'PaO2 veneus (kPa)', 'PaO2_1_Capillary':'PaO2 capillair (kPa)', 'PaO2_1_nan':'PaO2 onbekend (kPa)', 'PCO2_1':'pCO2 (kPa)','fio2_1':'FiO2 (%)','Temperature':'Temperatuur (ºC)','eye':'EMV','Antiviral_agent_1':'Anti-virale middelen (ja)','Antibiotic_1':'Antibiotica (ja)', + 'Corticosteroid_1':'Corticosteroiden voor ARDS 
(ja)','Corticosteroid_type_1':'gebruik corticosteroiden (ja)','Oxygen_therapy_1':'Zuurstof behandeling (ja)', + 'Non_invasive_ventilation_1':'Niet-invasieve beademing (ja)','Invasive_ventilation_1':'Invasieve beademing (ja)','resucitate':'Niet reanimeren (ja)','intubate_yesno':'Niet intuberen (ja)','auxiliary_breathing_muscles':'Gebruik van extra ademhalingsspieren (ja)','fever':'Koorts gehad sinds eerste klachten (ja)', + 'Anosmia':'Anosmie (ja)','Rhinorrhoea':'Rhinorrhoea (ja)','Sore_throat':'Keelpijn (ja)','cough_sputum':'Hoest met sputum (ja)','cough_sputum_haemoptysis':'Hoest met bloederig slijm/ haemoptoë (ja)','Arthralgia':'Arthralgie (ja)','Myalgia':'Spierpijn (ja)','Fatigue_Malaise':'Vermoeidheid/ algehele malaise (ja)', + 'Abdominal_pain':'Buikpijn (ja)','Vomiting_Nausea':'Misselijkheid/ overgeven (ja)','Diarrhoea':'diarree (ja)','Dyspnea':'Dyspneu (ja)','Wheezing':'Piepende ademhaling (ja)','Chest_pain':'Pijn op de borst (ja)','ear_pain':'oorpijn (ja)','Bleeding_Haemorrhage':'bloeding (ja)','Headache':'Hoofdpijn (ja)', + 'confusion':'Veranderd bewustzijn (ja)','Seizures':'Insulten (ja)','infiltrates_2':'Infiltraat op X-thorax (ja)','corads_admission':'CO-RADS (1-5)','Glucose_unit_1_1':'glucose (mmol/L)','Sodium_1_1':'Natrium (mmol/L)','Potassium_1_1':'Kalium (mmol/L)','Blood_Urea_Nitrogen_value_1':'Ureum (mmol/L)', + 'Creatinine_value_1':'Creatinine (µmol/L)','Calcium_adm':'Ca (totaal) (mmol/L)','ferritine_admin_spec':'ferritine (mg/L)','creatininekinase':'CK (U/L)','d_dimer':'D-Dimeer (nmol/L)','AST_SGOT_1_1':'ASAT (U/L)','ALT_SGPT_1_1':'ALAT (U/L)','Total_Bilirubin_2_1':'totaal Bilirubine (IE)','LDH':'LDH (U/L)', + 'PH_value_1':'Zuurgraad (pH)','Lactate_2_1':'Lactaat (mmol/L)','crp_1_1':'CRP (mg/L)','Haemoglobin_value_1':'Hb (mmol/L)','Platelets_value_1':'trombocyten (x10^9/L)','WBC_2_1':'Leukocyten (x10^3/µL)','Lymphocyte_1_1':'Lymfocyten (x10^9/L)','Neutrophil_unit_1':'Neutrofielen (x10^9/L)','INR_1_1':'INR', + 'pt_spec':'PT 
(sec)','fibrinogen_admin':'Fibrinogeen (g/L)','pcr_pos':'Corona PCR + (ja)','Adenovirus':'Adenovirus (ja)','RSV_':'RS virus (ja)','Influenza':'Influenza virus (ja)','Bacteria':'Sputumkweek (ja)','days_untreated':'Tijd sinds eerste klachten (dagen)','irregular':'Onregelmatige hartslag (ja)', + 'DNR_yesno':' (ja)','healthcare_worker':'Zorgmedewerker (ja)','infec_resp_diagnosis':'Andere infectieuze ademhalingsdiagnose','Blood_albumin_value_1':'Albumine (g/L)','culture':'Bloedkweek (positief)','APT_APTR_1_1':'APTT (sec)','uses_n_medicine':'Aantal thuismedicamenten'} + +import unicodedata +import string + +valid_filename_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) +char_limit = 255 + +def clean_filename(filename, whitelist=valid_filename_chars, replace=' '): + # replace spaces + for r in replace: + filename = filename.replace(r,'_') + + # keep only valid ascii chars + cleaned_filename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore').decode() + + # keep only whitelisted chars + cleaned_filename = ''.join(c for c in cleaned_filename if c in whitelist) + if len(cleaned_filename)>char_limit: + print("Warning, filename truncated because it was over {}. 
Filenames may no longer be unique".format(char_limit)) + return cleaned_filename[:char_limit] + + +class StandardScaler_min2cols(StandardScaler): + def __init__(self,columns,copy=True, with_mean=False, with_std=True): + self.scaler = StandardScaler(copy, with_mean=with_mean, with_std=with_std) + self.columns = columns + + def fit(self, X, y=None): + self.scaler.fit(X[self.columns], y) + return self + + def transform(self, X, y=None, copy=None): + init_col_order = X.columns + X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) + X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)] + return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order] + +class RobustScaler_min2cols(RobustScaler): + def __init__(self, columns, copy=True, with_centering=True, + with_scaling=True, quantile_range=(25.0, 75.0)): + self.scaler = RobustScaler(with_centering=with_centering, + with_scaling=with_scaling, + quantile_range=quantile_range, + copy=copy) + self.columns = columns + + def fit(self, X, y=None): + self.scaler.fit(X[self.columns], y) + return self + + def transform(self, X, y=None, copy=None): + init_col_order = X.columns + X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) + X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)] + return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order] + + +def KM_age_groups(features, outcomes): + T = outcomes['duration_mortality'] + E = outcomes['event_mortality'] + + features['Leeftijd (jaren)'] = features['Leeftijd (jaren)'].astype(float) + age_70_plus = features['Leeftijd (jaren)'] >= 70. 
+ print('Alle patiënten: n=', len(age_70_plus)) + print('Leeftijd ≥ 70: n=', sum(age_70_plus)) + print('Leeftijd < 70: n=', sum(~age_70_plus)) + + was_not_on_icu = data[['Levend ontslagen en niet heropgenomen - waarvan niet opgenomen geweest op IC', + 'Levend dag 21 maar nog in het ziekenhuis - niet op IC geweest', + 'Dood op dag 21 - niet op IC geweest']].any(axis=1) + was_on_icu = data[['Levend ontslagen en niet heropgenomen - waarvan opgenomen geweest op IC', + 'Levend dag 21 maar nog in het ziekenhuis - op IC geweest', + 'Levend dag 21 maar nog in het ziekenhuis - waarvan nu nog op IC', + 'Dood op dag 21 - op IC geweest']].any(axis=1) + print('Alle patiënten: n=', (np.nansum(was_on_icu)+np.nansum(was_not_on_icu))) + print('ICU yes: n=', np.nansum(was_on_icu)) + print('ICU no: n=', np.nansum(was_not_on_icu)) + + icu = False + fig, axes = plt.subplots(1, 1) + + if icu: + kmf1 = KaplanMeierFitter().fit(T[was_on_icu], E[was_on_icu], label='Wel op ICU geweest') + kmf2 = KaplanMeierFitter().fit(T[was_not_on_icu], E[was_not_on_icu], label='Niet op ICU geweest') + else: # age 70 + kmf1 = KaplanMeierFitter().fit(T[age_70_plus], E[age_70_plus], label='Leeftijd ≥ 70 jaar') + kmf2 = KaplanMeierFitter().fit(T[~age_70_plus], E[~age_70_plus], label='Leeftijd < 70 jaar') + + kmf3 = KaplanMeierFitter().fit(T, E, label='Alle patienten') + if icu: + kmf1.plot_survival_function(color=c4) + kmf2.plot_survival_function(color=c5) + kmf3.plot_survival_function(color=c3) + else: + kmf1.plot_survival_function(color=c1) + kmf2.plot_survival_function(color=c2) + kmf3.plot_survival_function(color=c3) + + + axes.set_xticks([1, 5, 9, 13, 17, 21]) + axes.set_xticklabels(['1','5','9','13','17','21']) + axes.set_xlabel('Aantal dagen sinds opnamedag') + axes.set_ylabel('Proportie overlevend') + + axes.set_xlim(0, 21) + axes.set_ylim(0, 1) + titledict = {'fontsize': 18, + 'fontweight': 'bold', + 'verticalalignment': 'baseline', + 'horizontalalignment': 'center'} + # plt.title('COVID-PREDICT 
survival functie tot t = 21 dagen (n='+str(len(T))+')',fontdict=titledict) + plt.tight_layout() + if icu: + plt.savefig('KM_survival_curve_ICU.png', format='png', dpi=300, figsize=(20,20), pad_inches=0, bbox_inches='tight') + else: + plt.savefig('KM_survival_curve_DEATH.png', format='png', dpi=300, figsize=(20,20), pad_inches=0, bbox_inches='tight') + plt.show() + + return kmf1, kmf2, kmf3 + + +def KM_all_vars(features, outcomes): + T = outcomes['duration_mortality'] + E = outcomes['event_mortality'] + + is_binary = features.columns[['ja' in f for f in features.columns]] + is_binary = is_binary.append(features.columns[['positief' in f for f in features.columns]]) + is_binary = is_binary.append(features.columns[['Andere infectieuze ademhalingsdiagnose' in f for f in features.columns]]) + is_binary = is_binary.append(features.columns[['oxygen_saturation_on_cat_' in f for f in features.columns]]) + is_float = ['ALAT (U/L)', 'ASAT (U/L)', 'Ureum (mmol/L)', 'Albumine (g/L)', + 'Ca (totaal) (mmol/L)', 'Creatinine (µmol/L)', 'glucose (mmol/L)', + 'Hb (mmol/L)', 'hart frequentie (1/min)', 'LDH (U/L)', 'Lactaat (mmol/L)', + 'Lymfocyten (x10^9/L)', 'Neutrofielen (x10^9/L)', 'pCO2 (kPa)', + 'Zuurgraad (pH)', 'PaO2 arteriëel (kPa)', 'trombocyten (x10^9/L)', + 'Kalium (mmol/L)', 'SaO2 (%)', 'Natrium (mmol/L)', 'Temperatuur (ºC)', + 'totaal Bilirubine (IE)', 'Leukocyten (x10^3/µL)', 'CK (U/L)', + 'CRP (mg/L)', 'Tijd sinds eerste klachten (dagen)', 'Diastolische bloeddruk (mmHg)', + 'ferritine (mg/L)', 'FiO2 (%)', 'Zuurstof saturatie (%)', + 'Ademhalingsfrequentie (1/min)', 'Systolische bloeddruk (mmHg)', + 'Aantal thuismedicamenten'] + is_categorical_corads = ['corads_admission_cat_1', 'corads_admission_cat_2', + 'corads_admission_cat_3', 'corads_admission_cat_4', + 'corads_admission_cat_5'] + is_categorical_male_female = ['gender_cat_1', 'gender_cat_2'] + ngroups = 3 + statres = [] + for f in features.columns: + print(f) + kmfs = None + fig, axes = plt.subplots(1, 1) + 
features[f] = features[f].astype(float) + if f in is_binary: + yes = features[f] == 1 + if sum(yes) > 0: + kmf_yes = KaplanMeierFitter().fit(T[yes], E[yes], label='Ja (n={})'.format(np.nansum(yes))) + + no = features[f] == 0 + if sum(no) > 0: + kmf_no = KaplanMeierFitter().fit(T[no], E[no], label='Nee (n={})'.format(np.nansum(no))) + + na = features[f].isna() + if sum(na) > 0: + kmf_na = KaplanMeierFitter().fit(T[na], E[na], label='Onbekend (n={})'.format(np.nansum(na))) + + kmfs = [kmf_yes, kmf_no, kmf_na] + # event_durations (iterable) – a (n,) list-like representing the (possibly partial) durations of all individuals + # groups (iterable) – a (n,) list-like of unique group labels for each individual. + # event_observed (iterable, optional) – a (n,) list-like of event_observed events: 1 if observed death, 0 if censored. Defaults to all observed. + # t_0 (float, optional (default=-1)) – the period under observation, -1 for all time. + groups = features[f].astype("category") + event_durations = T + event_observed = E + + elif f in is_float: + ff = pd.qcut(features[f], ngroups, labels=np.arange(ngroups)+1) + kmfs = [] + for q in ff.unique(): + sel = ff == q + if sum(sel) > 0: + kmf_q = KaplanMeierFitter().fit(T[sel], E[sel], label='{} (n={})'.format(q, np.nansum(sel))) + kmfs += [kmf_q] + sel = ff.isna() + if sum(sel) > 0: + q = 'Onbekend' + kmf_q = KaplanMeierFitter().fit(T[sel], E[sel], label='{} (n={})'.format(q, np.nansum(sel))) + kmfs += [kmf_q] + groups = ff + event_durations = T + event_observed = E + + elif f in is_categorical_corads: + kmfs = [] + for f in is_categorical_corads: + sel = features[f] + kmf_f = KaplanMeierFitter().fit(T[sel], E[sel], label='{} (n={})'.format(f, np.nansum(sel))) + kmfs += [kmf_f] + na = features[is_categorical_corads].any(axis=1) == False + if sum(na) > 0: + kmf_na = KaplanMeierFitter().fit(T[na], E[na], label='Onbekend (n={})'.format(np.nansum(na))) + groups = is_categorical_corads#.astype("category") + event_durations = T + 
event_observed = E + + statres += [None] # FIXME + continue # FIXME + + elif f in is_categorical_male_female: + kmfs = [] + for f in is_categorical_male_female: + sel = features[f] == 1.0 + kmf_f = KaplanMeierFitter().fit(T[sel], E[sel], label='{} (n={})'.format(f, np.nansum(sel))) + kmfs += [kmf_f] + na = features[is_categorical_male_female].any(axis=1) == False + if sum(na) > 0: + kmf_na = KaplanMeierFitter().fit(T[na], E[na], label='Onbekend (n={})'.format(np.nansum(na))) + groups = is_categorical_male_female#.astype("category") + event_durations = T + event_observed = E + + statres += [None] # FIXME + continue # FIXME + + else: + # split data in X groups + print('{} not implemented'.format(f)) + + groups = groups.cat.add_categories(-1).fillna(-1) # treat nan as separate group. + + if len(np.unique(groups)) < 5: + ss = [pairwise_logrank_test(event_durations, groups, event_observed, t_0=21.)] + p = np.min([s.p_value for s in ss]) + print(p) + if p * 289 < 0.05: + print('{} < 0.05 (p: {}, corrected p: {})'.format(f, p, p*289)) + statres += ss + else: + print('error in groups: {} - {}'.format(f, groups)) + + if kmfs: + [k.plot_survival_function() for k in kmfs] + + axes.set_xticks([1, 5, 9, 13, 17, 21]) + axes.set_xticklabels(['1', '5', '9', '13', '17', '21']) + axes.set_xlabel('Aantal dagen sinds opnamedag') + axes.set_ylabel('Proportie overlevend') + + axes.set_xlim(0, 21) + axes.set_ylim(0, 1) + plt.title('Kaplan-Meier survival - {}'.format(f)) + plt.tight_layout() + + filename = clean_filename('KM_{}.png'.format(f)) + plt.show() + fig.savefig(os.path.join('km_curves', filename), format='png', dpi=300, figsize=(20, 20), pad_inches=0, bbox_inches='tight') + + return statres + + +if __name__ == '__main__': + config = configparser.ConfigParser() + config.read('../user_settings.ini') + path_creds = config['CastorCredentials']['local_private_path'] + + # %% STEP 1: GET THE DATA USING MAARTEN'S FUNCTIONS + # For more info: please check covid19_ICU_util.py:select_x_y() + 
goal = ['survival', 'mortality_all'] + + # Add all 'Field Variable Name' from data_struct to + # INCLUDE variables from analysis + # NOTE: See get_feature_set.py for preset selections + variables_to_include = { + 'Form Collection Name': [], # groups + 'Form Name': [], # variable subgroups + 'Field Variable Name': get_5_premorbid_clin_rep_lab_rad() # single variables + } + + # Add all 'Field Variable Name' from data_struct to + # EXCLUDE variables from analysis, because <10 yes or no answers are present. + variables_to_exclude = ['Cardiac_consultation','Meningitis_Encephalitis_1_1','CAR_MH_AF','susp_pe','FH_Cor','CP_oedema','Seizure_1_1','MH_cardiacdevice','ear_pain','MH_SVT_AF_1','CAR_T0_PALP','Inhaled_Nitric_Oxide_1','MH_SVT_AFL','Outcome_7d_hosp','Endocarditis_Myocarditis_Pericarditis_1_1','MH_SVT_AF','Adenovirus','VAP','pregyn_rptestcd','carmed_ami','CAR_CM_heartfailure','CP_ortho','vasc_lipidlowering','discharge_AKI','MH_CABG','carmed_cipro','microbiology_worker','discharge_ards','post_partum','MH_HF','CAR_PM','cvrisk_famhist','dx_his_acq_LQT_1','CAR_MH_aLQT','MH_ACS','discharge_CT','vasc_creat','same_id','Med_New_QT_Prol','carmed_arb_chronic','CP_sync','Adm_CT','MH_SVT_AFL_1','CRF_QT','echo_FU','MH_PCI','CAR_T0_PE','ECG_adm','aids_hiv','discharge_swap','CP_PE','MH_ischemia_det','vasc_antifxa_measured','ECG_adm_ST','carmed_cita','corona_ieorres','Pregnancy'] + variables_to_exclude += ['Coronavirus'] # duplicate of PCR information + data, data_struct = load_data(path_creds) + data, data_struct = preprocess(data, data_struct) + + features, outcomes, data, hospital, record_id, days_until_death = prepare_for_learning(data, data_struct, + variables_to_include, + variables_to_exclude, + goal, + remove_records_threshold_above=None, + remove_features_threshold_above=0.75, + pcr_corona_confirmed_only=False) + # # translate feature columns + features.rename(columns=englishvar_to_dutchnames, inplace=True) + + # %% STEP 2: SHOW KM CURVE + # kmf = KM_age_groups(features, 
outcomes) + kmf_all = KM_all_vars(features, outcomes) diff --git a/error_replacements.py b/error_replacements.py index acdc1fd..3a52c5c 100644 --- a/error_replacements.py +++ b/error_replacements.py @@ -31,7 +31,9 @@ def get_column_fix_dict(): ''' return { 'oxygentherapy_1': [[-98, None]], - 'Smoking': [[-99, None]] + 'Smoking': [[-99, None]], + 'whole_admission_yes_no': [['1;1', None], ['1;2', None]], + 'whole_admission_yes_no_1': [['1;1', None], ['1;2', None]] } diff --git a/generate_figures.py b/generate_figures.py new file mode 100644 index 0000000..3f756d4 --- /dev/null +++ b/generate_figures.py @@ -0,0 +1,469 @@ +import os + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from sklearn.metrics import confusion_matrix +from sklearn.metrics import roc_auc_score + +import seaborn as sns + +DPI = 600 + +def get_distance_to_corner(y_hat, y): + # np.sqrt((1-sensitivity)**2 + (1-specificity)**2) + # sensitivity (TPR) = TP / (TP+FN) + # specificity (TNR) = TN / (TN+FP) + y_hat = get_cm_label(y, y_hat) + sens = (y_hat==1).sum() / ((y_hat==1).sum() + (y_hat==4).sum()) + spec = (y_hat==2).sum() / ((y_hat==2).sum() + (y_hat==3).sum()) + d = np.sqrt((1-sens)**2 + (1-spec)**2) + return d + +def get_fpr_thr(y, y_hat, fpr_goal=.1, steps=100): + # Finds the threshold corresponding with the + # goal FPR + best_error = 100 + best_thr = 100 + for thr in np.linspace(0, 1, steps+1): + yh = pd.Series(0, index=y.index) + yh.loc[y_hat>thr] = 1 + yh = get_cm_label(y, yh) + fpr = yh[yh==3].count() / (yh[yh==3].count() + yh[yh==2].count()) + error = abs(fpr_goal - fpr) + if error < best_error: + best_error = error + best_thr = thr + result = pd.Series(0, index=y.index, name='y_hat_{}'.format(fpr_goal)) + result.loc[y_hat>best_thr] = 1 + return result, best_thr + +def get_confusion_matrix(y_hat): + # Y_hat labeled by tp/tn/fp/fn + # [[tn, fp], [fn, tp]] + return [[(y_hat==2).sum(), (y_hat==3).sum()], + [(y_hat==4).sum(), (y_hat==1).sum()]] + +def get_cm_label(y, 
y_hat): + result = pd.Series(None, index=y.index) + result.loc[(y==1) & (y_hat==1)] = 1 # TP + result.loc[(y==0) & (y_hat==0)] = 2 # TN + result.loc[(y==0) & (y_hat==1)] = 3 # FP + result.loc[(y==1) & (y_hat==0)] = 4 # FN + return result + +def get_yhats(y, y_hat, fpr_goals): + yhs = [] + thresholds = [] + for goal in fpr_goals: + yh, threshold = get_fpr_thr(y, y_hat, fpr_goal=goal, steps=100) + yh = get_cm_label(y, yh) + yhs += [yh] + thresholds += [threshold] + + # Include optimal threshold + thresholds = np.linspace(0, 1, 101) + best_thr = thresholds[np.argmin([get_distance_to_corner(y_hat>thr, y) + for thr in thresholds])] + yhs += [get_cm_label(y, y_hat>best_thr)] + threshold += [best_thr] + return pd.concat(yhs, axis=1), thresholds + +def get_avg_ci(arr): + avg = arr.mean(axis=0) + std = arr.std(axis=0) + ci = 1.96*std/np.sqrt(arr.shape[0]) + return np.append(avg, ci) + +def get_results(y, y_hat): + auc = roc_auc_score(y, y_hat) + cm = get_confusion_matrix(get_cm_label(y, y_hat)) + return [auc, cm[0][0], cm[0][1], cm[1][0], cm[1][1]] + +def get_simple_model_performance(y, y_hat_lr, y_hat_xgb, hospital): + age = pd.read_excel('age.xlsx').iloc[:, 1] + y_hat_70 = age > 70 + y_hat_80 = age > 80 + sz = min(y_hat_lr.size, y_hat_xgb.size, y_hat_70.size, y_hat_80.size, y.size, hospital.size) + hospital = hospital[:sz].reset_index(drop=True) + y = y.iloc[:sz].reset_index(drop=True) + y_hat_lr = y_hat_lr.iloc[:sz].reset_index(drop=True) + y_hat_xgb = y_hat_xgb.iloc[:sz].reset_index(drop=True) + y_hat_70 = y_hat_70.iloc[:sz].reset_index(drop=True) + y_hat_80 = y_hat_80.iloc[:sz].reset_index(drop=True) + + + # TODO: AUC CHANGES BY TURNING BINARY WITH THRESHOLD + thresholds = np.linspace(0, 1, 101) + best_thr = thresholds[np.argmin([get_distance_to_corner(y_hat_xgb>thr, y) + for thr in thresholds])] + y_hat_xgb = y_hat_xgb>best_thr + best_thr = thresholds[np.argmin([get_distance_to_corner(y_hat_lr>thr, y) + for thr in thresholds])] + y_hat_lr = y_hat_lr>best_thr + + 
unique_hospitals = hospital.unique() + result_70 = [] + result_80 = [] + result_lr = [] + result_xgb = [] + for h in unique_hospitals: + mask = hospital==h + result_70 += [get_results(y[mask], y_hat_70[mask])] + result_80 += [get_results(y[mask], y_hat_80[mask])] + result_lr += [get_results(y[mask], y_hat_lr[mask])] + result_xgb += [get_results(y[mask], y_hat_xgb[mask])] + + result = pd.DataFrame(index=['result_70', 'result_80', 'result_lr', 'result_xgb'], + columns=['auc', 'tn', 'fp', 'fn', 'tp', + 'auc_ci', 'tn_ci', 'fp_ci', 'fn_ci', 'tp_ci']) + result_70 = get_avg_ci(np.asarray(result_70)) + result_80 = get_avg_ci(np.asarray(result_80)) + result_lr = get_avg_ci(np.asarray(result_lr)) + result_xgb = get_avg_ci(np.asarray(result_xgb)) + + +def plot_conf_mats(y, ys, hospitals, name, savepath): + fprs = [] + # Overal + + fig, ax = plt.subplots(1, ys.shape[1], sharey=True, figsize=(8, 4)) + for i, thr in enumerate(ys.columns): + y_hat = ys[thr] + tp = (y_hat==1).sum() + tn = (y_hat==2).sum() + fp = (y_hat==3).sum() + fn = (y_hat==4).sum() + df_cm = pd.DataFrame([[tn, fp], [fn, tp]], + index=[0, 1], columns=[0, 1]) + fpr = fp/(fp+tn) + fprs += [fpr] + sns.heatmap(df_cm, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=ax[i]) + # ax[i].set_xlabel('Threshold: {}'.format(thr.split('_')[2])) + ax[i].set_title('FPR: {:.2f}'.format(fpr)) + ax[i].set_aspect('equal', 'box') + if i==0: + ax[i].set_ylabel('True label'.format(y_hat.size)) + + plt.suptitle('Confusion matrix\n{} features'.format(name)) + fig.text(0.5, 0.2, 'Predicted label', ha='center', fontstyle='normal') + # fig.text(0.04, 0.5, 'True label\nn={}'.format(y_hat.size), + # va='center', rotation='vertical') + + fig.savefig(savepath + 'confusionmatrices_total.png', dpi=DPI) + + # Per hospital + hosps = hospitals.unique() + fig, ax = plt.subplots(hosps.size, ys.shape[1], + sharex=True, sharey=True, + figsize=(5, 9)) + for i, hosp in enumerate(hosps): + # y_h = y.loc[hospitals==hosp] + ys_h = 
ys.loc[hospitals==hosp, :] + for j, thr in enumerate(ys.columns): + y_hat = ys_h[thr] + tp = (y_hat==1).sum() + tn = (y_hat==2).sum() + fp = (y_hat==3).sum() + fn = (y_hat==4).sum() + df_cm = pd.DataFrame([[tn, fp], [fn, tp]], + index=[0, 1], columns=[0, 1]) + # fpr = fp/(fp + tn) # == fpr per hospital per threshold + sns.heatmap(df_cm, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=ax[i, j]) + + if i == 0: + ax[i, j].set_title('Overall FPR: {:.2f}'.format(fprs[j]), fontsize=8) + if j == 0: + ax[i, j].set_ylabel('{}\nn={}'.format(hosp, y_hat.size), fontsize=6) + ax[i, j].set_aspect('equal', 'box') + + plt.suptitle('Confusion matrix per hospital per threshold\nFeatures: {}'.format(name)) + fig.text(0.5, 0.04, 'Predicted label', ha='center') + fig.text(0.04, 0.5, 'True label', va='center', rotation='vertical') + fig.savefig(savepath + 'confusion_matrices_per_hosptal.png', dpi=DPI) + +def plot_dists(x, y, thresholds, savepath, + auc=None, histogram=False, kde=True): + + title = 'Class distribution per variable' + if auc != None: + title += ' - auc: {:.3f}'.format(auc) + + n_cols = 5 + n_rows = 2 + for i, column in enumerate(x.columns.to_list()): + row = (i//n_cols)%2 + col = i%12 + page = i%(n_rows*n_cols) + + if i%(n_rows*n_cols)==0: + fig, axes = plt.subplots(n_rows, n_cols, sharex=False, sharey=False) + fig.suptitle('{} (page {}/{})'.format(title, page, len(x.columns)//(n_cols*n_rows)+1)) + + sns.distplot(x.loc[y==0, column], hist=histogram, kde=kde, ax=axes[row, col], color='b', label='Alive') + sns.distplot(x.loc[y==1, column], hist=histogram, kde=kde, ax=axes[row, col], color='r', label='Deceased') + + for k, thr in enumerate(thresholds): + axes[row, col].axvline(thr, linewidth=1, label='threshold: {:.2f}'.format(thr)) + + axes[row, col].set_title(column, fontsize=7) + axes[row, col].set_xlabel('') + axes[row, col].set_ylabel('') + + # axes[row, col].get_legend().set_visible(False) + if row == n_rows-1 & col==0: + handles, labels = axes[row, 
col].get_legend_handles_labels() + fig.legend(handles, labels, loc='upper right') + fig.savefig(savepath + 'feature_distribution_p{}'.format(page), dpi=DPI) + + plt.show() + +def plot_correct_per_day(y, y_hat, dto, thresholds, name='', savepath='./'): + fig, ax = plt.subplots(1, 1, figsize=(16, 9)) + for i, col in enumerate(y_hat.columns): + if i < y_hat.columns.size-1: + continue + y = pd.Series(0, index=y_hat.index) + y[y_hat[col].isin([1, 2])] = 1 + can_use = dto.notna() & (dto <= 21) & (dto >= 0) + + pos = pd.Series(dto[(y==1) & can_use].value_counts().sort_index(), + index=list(range(0, 22))).fillna(0) + neg = pd.Series(dto[(y==0) & can_use].value_counts().sort_index(), + index=list(range(0, 22))).fillna(0) + rel = pos / (pos + neg) + + df = pd.concat([pos, neg, rel], axis=1) + df.columns = ['Correct', 'Incorrect', 'Relative'] + + df.iloc[:, 0:2].plot.bar(ax=ax, legend=False) + ax2 = ax.twinx() + df.iloc[:, -1].plot.bar(ax=ax2, color='g', alpha=0.2, label='Relative') + + ax.set_xlabel('Day') + ax.set_ylabel('Count') + ax2.set_ylabel('Relative correct') + ax.set_title('Prediction per day - {} features'.format(name)) + + bar, labels = ax.get_legend_handles_labels() + bar2, labels2 = ax2.get_legend_handles_labels() + ax2.legend(bar+bar2, labels+labels2) + fig.savefig(savepath+'prediction_per_day_{}'.format(name), dpi=DPI) + +def plot_cm_simple_baseline(y, y_hat, y_hat_xgb, feat, cutoff, name, savepath): + age = pd.read_excel('age.xlsx').iloc[:, 1] + y_hat_s = age > 70 + y_hat_s2 = age > 80 + sz = min(y_hat_s.size, y_hat_xgb.size, y_hat_s2.size, y_hat.size, y.size) + y = y.iloc[:sz] + y_hat = y_hat.iloc[:sz] + y_hat_xgb = y_hat_xgb.iloc[:sz] + y_hat_s = y_hat_s.iloc[:sz] + y_hat_s2 = y_hat_s2.iloc[:sz] + + auc_s = roc_auc_score(y, y_hat_s) + auc_s2 = roc_auc_score(y, y_hat_s2) + auc_p = roc_auc_score(y, y_hat) + auc_xgb = roc_auc_score(y, y_hat_xgb) + + # What happens here? 
+ cm_s = get_confusion_matrix(get_cm_label(y, y_hat_s)) + cm_s2 = get_confusion_matrix(get_cm_label(y, y_hat_s2)) + fpr_s = cm_s[0][1] / (cm_s[0][1] + cm_s[0][0]) + fpr_s2 = cm_s2[0][1] / (cm_s2[0][1] + cm_s2[0][0]) + + _, best_thr_s = get_fpr_thr(y, y_hat, fpr_goal=fpr_s) # this is unecessary? + + + thresholds = np.linspace(0, 1, 101) + + # LR + best_thr = thresholds[np.argmin([get_distance_to_corner(y_hat>thr, y) + for thr in thresholds])] + y_hat = y_hat>best_thr + cm_p = get_confusion_matrix(get_cm_label(y, y_hat)) + fpr = cm_p[0][1] / (cm_p[0][1] + cm_p[0][0]) + + # XGB + best_thr_xgb = thresholds[np.argmin([get_distance_to_corner(y_hat_xgb>thr, y) + for thr in thresholds])] + y_hat_xgb = y_hat_xgb>best_thr + cm_xgb = get_confusion_matrix(get_cm_label(y, y_hat_xgb)) + fpr_xgb = cm_xgb[0][1] / (cm_xgb[0][1] + cm_xgb[0][0]) + + fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(12, 12)) + + # LR + sns.heatmap(cm_p, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=axes[0, 0]) + # optimal threshold (shortest distance to upperleft corner) + axes[0, 0].set_title('LR \n AUC: {:.2f} - FPR:{:.2f}'.format(auc_p, fpr)) + axes[1, 1].set_ylabel('True label') + # axes[0, 0].set_xlabel('Predicted label') + axes[0, 0].set_aspect('equal', 'box') + + # XGB + sns.heatmap(cm_xgb, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=axes[0, 1]) + # optimal threshold (shortest distance to upperleft corner) + axes[0, 1].set_title('XGB \n AUC: {:.2f} - FPR:{:.2f}'.format(auc_xgb, fpr_xgb)) + # axes[0, 1].set_xlabel('Predicted label') + axes[0, 1].set_aspect('equal', 'box') + + # age>70 + sns.heatmap(cm_s, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=axes[1, 0]) + axes[1, 0].set_title('Age>70\nAUC: {:.2f} - FPR:{:.2f}'.format(auc_s, fpr_s)) + axes[1, 0].set_ylabel('True label') + axes[1, 0].set_xlabel('Predicted label') + axes[1, 0].set_aspect('equal', 'box') + + # age>80 + sns.heatmap(cm_s2, annot=True, fmt="d", 
+ cmap=plt.get_cmap('Blues'), cbar=False, + ax=axes[1, 1]) + axes[1, 1].set_title('Age>80\nAUC: {:.2f} - FPR:{:.2f}'.format(auc_s2, fpr_s2)) + axes[1, 1].set_xlabel('Predicted label') + axes[1, 1].set_aspect('equal', 'box') + + plt.suptitle('Featureset: {}'.format(name)) + fig.savefig(savepath+'compared_with_simple_baseline_{}'.format(name), dpi=DPI) + +feature_sets = { + 'pm': 'Premorbid', + 'cp': 'Clinical Presentation', + 'lab': 'Laboratory and Radiology', + 'pmcp': 'Premorbid + Clinical Presentation', + 'all': 'All', + 'k10': '10 best' + } + +path = r'C:\Users\p70066129\Projects\COVID-19 CDSS\results_output\logreg/' +path = r'C:\Users\p70066129\Projects\COVID-19 CDSS\results/' +path = r'C:\Users\p70066129\Projects\COVID-19 CDSS\results_final\results_lr_pipeline_random_50/' +path = r'C:\Users\p70066129\Projects\COVID-19 CDSS\FINAL\LR/' +path_xgb = r'C:\Users\p70066129\Projects\COVID-19 CDSS\FINAL\XGB/' +files = [file for file in os.listdir(path) if '.pkl' in file] +folders = [fol for fol in os.listdir(path) if os.path.isdir(path + fol)] +for file in files: + # if 'k10' not in file: + # continue + fullpath = path + file + fullpath_xgb = path_xgb + file + file = file.split('_') + # fset = file[1][-3:] if 'k10' in file else file[1] + fset = file[1] + n = file[2] + y = file[3] + name = feature_sets[fset] + + if name not in folders: + os.mkdir(path+name) + savepath = path + name + '/' + + df = pd.read_pickle(fullpath) + df_xgb = pd.read_pickle(fullpath_xgb) + + hospital = df['hospital'] + dto = df['days_until_death'] + y = df['y'] + y_hat = df['y_hat'] + y_hat_xgb = df_xgb['y_hat'] + x = df.drop(['hospital', 'days_until_death', 'y', 'y_hat'], axis=1) + + + get_simple_model_performance(y, y_hat, y_hat_xgb, hospital) + + # df = df.fillna(-99) + auc = roc_auc_score(y, y_hat) + + if 'age_yrs' in x.columns: + plot_cm_simple_baseline(y, y_hat, y_hat_xgb, x['age_yrs'], .65, name, savepath) # NOT COMPLETELY TRUE + + + + # GET FPRs + fpr_goals = [0, 0.05, 0.1] + y_hats, 
thresholds = get_yhats(y, y_hat, fpr_goals) + + # PLOT + plot_correct_per_day(y, y_hats, dto, thresholds, name=name, savepath=savepath) + plot_conf_mats(y, y_hats, hospital, name, savepath=savepath) + # plot_dists(x, y, thresholds, savepath, + # auc=auc, histogram=True, kde=False) + + + #### Decision boundaries + # plot_dbs(x, df.y, df.y_hat, ys) + + + # plt.show() + +### GENERAL PLOTS + +with open(path+'results.txt', 'r') as f: + lines = f.readlines() + +scores = [] +errors = [] +names = [] +for l in lines[2:]: + l = l.split(' ') + scores += [float(l[4])] + errors += [float(l[6][:-1])] + names += [l[2]] +print('done') + +fset_order = ['pm', 'cp', 'lab', 'pmcp', 'all']#, 'k10'] +idx = [names.index(i) for i in fset_order] +scores = [scores[i] for i in idx] +errors = [errors[i] for i in idx] +names = [feature_sets[names[i]] for i in idx] +n_bars = list(range(len(scores))) + +fig, ax = plt.subplots(figsize=(9,5)) +ax.bar(n_bars, scores, yerr=errors) +ax.set_ylim(0, 1) +ax.set_xticks(n_bars) +ax.set_xticklabels(names, fontsize=6) +ax.set_ylabel('ROC AUC') +ax.set_title('Performance\nError = 95% CI') +fig.savefig(path+'AUC_performance_per_featureset', dpi=DPI) + +plt.show() + +# def plot_correct_per_day(y, y_hat, dto, thresholds, name='', savepath='./'): +# fig, ax = plt.subplots(1, y_hat.shape[1], figsize=(16, 9)) +# for i, col in enumerate(y_hat.columns): +# y = pd.Series(0, index=y_hat.index) +# y[y_hat[col].isin([1, 2])] = 1 +# can_use = dto.notna() & (dto <= 21) & (dto >= 0) + +# pos = pd.Series(dto[(y==1) & can_use].value_counts().sort_index(), +# index=list(range(0, 22))).fillna(0) +# neg = pd.Series(dto[(y==0) & can_use].value_counts().sort_index(), +# index=list(range(0, 22))).fillna(0) +# rel = pos / (pos + neg) + +# df = pd.concat([pos, neg, rel], axis=1) +# df.columns = ['Correct', 'Wrong', 'Relative'] + +# df.iloc[:, 0:2].plot.bar(ax=ax[i]) +# ax2 = ax[i].twinx() +# df.iloc[:, -1].plot.bar(ax=ax2, color='g', alpha=0.2) + +# ax[i].set_xlabel('Day') +# if 
i==0: +# ax[i].set_ylabel('count') +# if i==(y_hat.columns.size-1): +# ax2.set_ylabel('Relative (P/(P+N))') +# ax[i].set_title('Thr: {:.3f}'.format(thresholds[i])) +# fig.suptitle('Prediction per day - {}'.format(name)) +# fig.savefig(savepath+'prediction_per_day_{}'.format(name), dpi=DPI) diff --git a/get_feature_set.py b/get_feature_set.py index 12d6801..a1c3b8f 100644 --- a/get_feature_set.py +++ b/get_feature_set.py @@ -21,7 +21,8 @@ def get_3_laboratory_radiology_findings(): 'Lactate_2_1','Creatinine_value_1','Sodium_1_1','Potassium_1_1', 'Calcium_adm','crp_1_1','Blood_albumin_value_1','creatininekinase', 'LDH','d_dimer','ferritine_admin_spec','fibrinogen_admin', - 'oxygentherapy_1','fio2_1','SaO2_1','PaO2_1','PCO2_1','PH_value_1', + 'oxygentherapy_1','fio2_1','SaO2_1','PaO2_1_Arterial', 'PaO2_1_Venous', + 'PaO2_1_Capillary','PCO2_1','PH_value_1', 'Influenza','Coronavirus','Adenovirus','Bacteria','culture', 'infec_resp_diagnosis','infiltrates_2','CT_thorax_performed', 'corads_admission'] @@ -52,7 +53,8 @@ def get_6_all(): 'Lactate_2_1','Creatinine_value_1','Sodium_1_1','Potassium_1_1', 'Calcium_adm','crp_1_1','Blood_albumin_value_1','creatininekinase', 'LDH','d_dimer','ferritine_admin_spec','fibrinogen_admin', - 'oxygentherapy_1','fio2_1','SaO2_1','PaO2_1','PCO2_1','PH_value_1', + 'oxygentherapy_1','fio2_1','SaO2_1','PaO2_1_Arterial', 'PaO2_1_Venous', + 'PaO2_1_Capillary','PCO2_1','PH_value_1', 'Influenza','Coronavirus','RSV_','Adenovirus','Bacteria','culture', 'infec_resp_diagnosis','infiltrates_2','CT_thorax_performed', 'corads_admission', diff --git a/lr_coefs.npy b/lr_coefs.npy new file mode 100644 index 0000000000000000000000000000000000000000..98273b8dda9a09eb9dc446b781f7bd064364e773 GIT binary patch literal 1928 zcmdT@c}&x17%l=stQ>-bIYf&bg;9pZ$dDtiBNNnEj)5GRq8xQ-gf<0)r)Z1_>Yuzv==m7xJ3b~dlumtz ztq+h04*P9LgapFhtu45}%4q1p1Kc({1rz zo5R2SsGnn0^FQwIpGb@3wJX3hrGk#XB!IVDeoM9Dzfk3TZc4vwYMh5Cn|r0Mntfq<-T*^paQmHqpbxv2^j1c56@B%RXsjFhG$!u-%wK~xE%$wT}P 
zJ$Z;45LKkb#=xIqo8ey1gotMvS-h6QC6NIjq@zLbBMU64$g{U|ev01s*V7}Q;X=x|`O?tW8WsTG5OJBS> zZk3MuXC21EMSkd~``ZG#4&H%|?i8Z>EEkG)eH1HT+=J^w7AiVx? zz)V9c3b5n~ho!Zsi|!gOcHskOh(4Vmxq)Uf^%=vS$&h1kXe5X}H=T8%0PdQY z*;N=~@WcLR!xt+F=+PvS8Oxr6OV-muMokMaO?t=nrp`nB#8JLjT8W~^46AmV-$A-> zvd*SfNl@Kh#c=2OHuPi18D=k`8C)>>(kQD4Qq)~$Lun-OE?4VTSYt~lZ%c23y=B+-u(E38nsl zG+P6?vI^S3^Fi>}uk^97mBNHj^D*!HPH?=gmM@$_=!M3i@_n>=l^@I5b&&N*;fDP8 e|Dx>5to6<+DOgbw+v9gKRejfmdi^}*5c~luJ0@EI literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt index b073469..f5d09ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ pandas==0.25.2 progressbar2==3.50.1 sklearn==0.0 requests==2.22.0 -castorapi==0.1.3 +castorapi==0.2 +xgboost==1.0.2 +missingpy=0.2.0 \ No newline at end of file diff --git a/results/loho_pm_class_dist_per_hosp.txt b/results/loho_pm_class_dist_per_hosp.txt new file mode 100644 index 0000000..2960dd5 --- /dev/null +++ b/results/loho_pm_class_dist_per_hosp.txt @@ -0,0 +1,10 @@ +hospital 0 1 +Center 1 145 24 +Center 3 229 98 +Center 0 211 50 +Center 7 322 118 +Center com 105 24 +Center 5 315 86 +Center 4 97 16 +Center 2 106 12 +Center 6 237 88 diff --git a/results/loho_pm_n2283_y516.0_coefs.txt b/results/loho_pm_n2283_y516.0_coefs.txt new file mode 100644 index 0000000..232eb50 --- /dev/null +++ b/results/loho_pm_n2283_y516.0_coefs.txt @@ -0,0 +1,45 @@ +[[0.02724774 0.07796375 0.04514499 0.037072 0.03693948 0.04072643 + 0.04699144 0.03451858 0.03676676 0.04358888 0.03437743 0.05223805 + 0.04450634 0.03076349 0.02880035 0.04174154 0.03355281 0.02878598 + 0.04131289 0.04650693 0.02872644 0.04124029 0.03936394 0.03553464 + 0.04558886] + [0.02852098 0.11933233 0.02049731 0.03196372 0.0402501 0.04791315 + 0.04758906 0.0272247 0.03244538 0.05959017 0.03316517 0.05397624 + 0.04702694 0.04040785 0.03965527 0.02883011 0.03357292 0.03211337 + 0.03795148 0.02385984 0.03098355 0.03889681 0.02707311 0.02900842 + 0.0481519 ] + [0.03824867 0.20201536 0.03722147 0.02238339 0.03902486 
0.04041912 + 0.04165535 0.02382601 0.03506192 0.03819693 0.03310945 0.04155134 + 0.04301092 0.03435294 0.03162336 0.03340763 0.02047763 0.03296867 + 0.03788981 0.02431517 0.02674964 0.01537701 0.02278797 0. + 0.08432537] + [0.02365478 0.15148643 0.01284037 0.02902209 0.03568465 0.03697684 + 0.05815557 0.0291574 0.03424063 0.05271753 0.03415936 0.04179135 + 0.0447507 0.02028611 0.02205772 0.04093557 0.0316455 0.02927355 + 0.03154784 0.02852412 0.03397431 0.03592318 0.03366717 0.03448628 + 0.07304095] + [0.03153612 0.16012011 0.02650238 0.02958032 0.03428465 0.03534453 + 0.07054944 0.01924068 0.03674266 0.04099085 0.03405198 0.03728574 + 0.03685509 0.03123184 0.03651385 0.03296662 0.03361399 0.04126919 + 0.03322199 0.02767796 0.02392729 0.02306266 0.02242733 0.02182049 + 0.07918217] + [0.02297538 0.2050213 0.02518855 0.02765787 0.03315873 0.03289452 + 0.09503685 0. 0.03322658 0.02815784 0.04470608 0.03382082 + 0.03129571 0.02981318 0.03155567 0.03594936 0.05403385 0.0245353 + 0.03034715 0.02750624 0. 
0.0184159 0.02812885 0.02183791 + 0.0847363 ] + [0.03794999 0.11937173 0.02292331 0.03781984 0.03594646 0.03787136 + 0.04503115 0.02828448 0.0322755 0.04052298 0.0344321 0.04110741 + 0.04858724 0.04217534 0.03850076 0.03208326 0.03291799 0.03559599 + 0.03548445 0.03230241 0.03144418 0.03115782 0.03739164 0.02933442 + 0.05948817] + [0.03765591 0.1244607 0.02361755 0.03341233 0.03704211 0.03986889 + 0.05130611 0.0245237 0.0476805 0.04277378 0.03831241 0.03791314 + 0.04802676 0.03886366 0.03721868 0.02699121 0.03517297 0.03607577 + 0.03673963 0.02632892 0.03438185 0.02605204 0.0285626 0.02555271 + 0.06146609] + [0.03061492 0.08841494 0.03246649 0.03778347 0.03862301 0.04902279 + 0.04338187 0.03391563 0.03703794 0.05063674 0.0386482 0.04710913 + 0.04242903 0.03407801 0.0321296 0.03596683 0.02974905 0.02953567 + 0.04011565 0.03769885 0.02688617 0.03670201 0.03521456 0.04155528 + 0.05028412]] \ No newline at end of file diff --git a/unit_lookup.py b/unit_lookup.py index f8c3d21..5c861be 100644 --- a/unit_lookup.py +++ b/unit_lookup.py @@ -4,14 +4,14 @@ def get_unit_lookup_dict(): 'units_neutro': {1: 0.001, 2: -999.0, 3: 1.0 }, 'Platelets_unit_1': {1: 1.0, 2: 1.0 }, 'Total_Bilirubin_1_1': {1: 1.0, 2: 17.1 }, - 'Haemoglobin_unit_1': {1: 1.6114, 2: 16.1140, 3: 1 }, + 'Haemoglobin_unit_1': {1: 1.6114, 2: 16.1140, 3: 1.0 }, 'Glucose_unit_2': {1: 1.0, 2: 0.0555 }, 'Blood_Urea_Nitrogen_unit_1': {1: 1.0, 2: 0.3571 }, 'Lactate_1_1': {1: 1.0, 2: 0.1110 }, 'Creatinine_unit_1': {1: 1.0, 2: 88.4956 }, 'sodium_units': {1: 1.0, 2: 1.0 }, 'pot_units_1': {1: 1.0, 2: 1.0 }, - 'd_dimer_units_adm': {1: 1000.0, 2: 0.18, 3: 1 }, + 'd_dimer_units_adm': {1: 5.5556, 2: 0.0055, 3: 1.0 }, 'WBC_1_1': {1: 1.0, 2: 1.0 }, 'units_lymph': {1: 0.001, 2: -999.0, 3: 1.0 }, 'PCO2_unit_1': {1: 1.0, 2: 0.1333 }, @@ -28,8 +28,8 @@ def get_unit_lookup_dict(): 'Creatinine_unit': {1: 1.0, 2: 88.4956 }, 'sodium_units_1': {1: 1.0, 2: 1.0 }, 'pot_units_2': {1: 1.0, 2: 1.0 }, - 'Haemoglobin_unit': {1: 1.6114, 2: 
16.1140, 3: 1 }, - 'units_d_dimer': {1: 1000.0, 2: 0.18, 3: 1 }, + 'Haemoglobin_unit': {1: 1.6114, 2: 16.1140, 3: 1.0 }, + 'units_d_dimer': {1: 5.5556, 2: 0.0055, 3: 1.0 }, 'WBC_1': {1: 1.0, 2: 1.0 }} diff --git a/update_slack/covid19_update_slack_all.py b/update_slack/covid19_update_slack_all.py index f29de19..b777c8d 100755 --- a/update_slack/covid19_update_slack_all.py +++ b/update_slack/covid19_update_slack_all.py @@ -7,10 +7,9 @@ @author: wouterpotters """ -import time, statistics, os, site, sys -site.addsitedir('./../') # add directory to path to enable import of castor_api -from castor_api import Castor_api - +import time +import statistics +import castorapi as ca import configparser config = configparser.ConfigParser() config.read('../user_settings.ini') @@ -18,10 +17,11 @@ # put both the secret, client and the tokens_slack file here location_castor_slack_api_data = config['SlackAPI']['local_private_path'] -c = Castor_api(location_castor_slack_api_data) # e.g. in user dir outside of GIT repo +c = ca.CastorApi(location_castor_slack_api_data) # e.g. 
in user dir outside of GIT repo # get study ID for COVID study -study_id = c.request_study_id('COVID')[0] +study_id = c.select_study_by_name('COVID-19 NL') + # Posting to a Slack channel def send_message_to_slack(text): @@ -38,65 +38,91 @@ def send_message_to_slack(text): for t_url in tokens: req = request.Request(t_url, - data=json_data.encode('ascii'), - headers={'Content-Type': 'application/json'}) + data=json_data.encode('ascii'), + headers={'Content-Type': 'application/json'}) request.urlopen(req) except Exception as em: print("EXCEPTION: " + str(em)) + records = c.request_study_records(study_id) -count = len([x['_embedded']['institute']['name'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) +count = len([x['_embedded']['institute']['name'] for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False]) first_update = True while True: try: # renew session with api to avoid timeout errors - c = Castor_api(location_castor_slack_api_data) # e.g. in user dir outside of GIT repo + c = ca.CastorApi(location_castor_slack_api_data) # e.g. 
in user dir outside of GIT repo records = c.request_study_records(study_id) institutes = [x['_embedded']['institute']['name'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False] institutesUnique = [] for inst in institutes: if inst not in institutesUnique: institutesUnique.append(inst) - + count_sub = {} completion_rate_sub = {} completion_rate_100_sub = {} completion_rate_90_sub = {} completion_rate_0_sub = {} - completion_rate_avg_sub = {} + completion_rate_avg_sub = {} sub_messages = {} for inst in institutesUnique: - count_sub[inst] = (len([x['_embedded']['institute']['name'] for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False])) - completion_rate_sub[inst] = [x['progress'] for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False] - completion_rate_100_sub[inst] = sum([x['progress']==100 for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) - completion_rate_90_sub[inst] = sum([x['progress']>90 for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) - completion_rate_0_sub[inst] = sum([x['progress']==0 for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) - completion_rate_avg_sub[inst] = statistics.mean([x['progress'] for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) - sub_messages[inst] = str(count_sub[inst]) + ' (avg completion rate: ' + str(round(completion_rate_avg_sub[inst],2)) + '%)' - - count_nw = (len([x['_embedded']['institute']['name'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False])) - completion_rate = [x['progress'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False] - completion_rate_100 = sum([x['progress']==100 for x in records if 
x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) - completion_rate_90 = sum([x['progress']>90 for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) - completion_rate_0 = sum([x['progress']==0 for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) - completion_rate_avg = statistics.mean([x['progress'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) + count_sub[inst] = (len([x['_embedded']['institute']['name'] for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False])) + completion_rate_sub[inst] = [x['progress'] for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False] + completion_rate_100_sub[inst] = sum([x['progress']==100 for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) + completion_rate_90_sub[inst] = sum([x['progress']>90 for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) + completion_rate_0_sub[inst] = sum([x['progress']==0 for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) + completion_rate_avg_sub[inst] = statistics.mean([x['progress'] for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) + sub_messages[inst] = str(count_sub[inst]) + ' (Castor completion rate: ' + \ + str(int(completion_rate_avg_sub[inst])) + '%)' + + count_nw = (len([x['_embedded']['institute']['name'] for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] is False])) + completion_rate = [x['progress'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False] + completion_rate_100 = sum([x['progress']==100 for x in records + if x['_embedded']['institute']['name'] != 
'Test Institute' + and x['archived'] == False]) + completion_rate_90 = sum([x['progress']>90 for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False]) + completion_rate_0 = sum([x['progress']==0 for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False]) + completion_rate_avg = statistics.mean([x['progress'] for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False]) if count_nw - count >= 5 or first_update: count = count_nw - message = 'Total number of inclusions is now: ' + str(count) +' records'+\ - '\nTotal number of institutions with data: '+ str(len(institutesUnique)) +\ - '\n - 100% completed: ' + str(completion_rate_100)+'/'+ str(count) +\ - '\n - >90% completed: ' + str(completion_rate_90)+'/'+ str(count) +\ - '\n - 0% completed: ' + str(completion_rate_0)+'/'+ str(count) +\ - '\n - average completion: ' + str(round(completion_rate_avg,2))+'% (n='+ str(count) +')\n' - for c in sub_messages: - message = message + '\n>' + c + ': ' + (max([len(x) for x in institutesUnique])-len(c))*' ' + sub_messages[c] - + message = '```Total # of inclusions : ' + str(count) + ' records' + \ + '\nTotal # of institutions: ' + str(len(institutesUnique)) +\ + '\n- > 90% completed: ' + str(completion_rate_90) + '/' + str(count) + \ + '\n- 0% completed: ' + str(completion_rate_0) + '/' + str(count) + \ + '\n- avg completion rate: ' + str(int(completion_rate_avg)) + '%\n' + sorted_subs = sorted(sub_messages, key=count_sub.get, reverse=True) + for c in sorted_subs: + n = 0 + if (count_sub[c]) < 10: + n += 1 + if (count_sub[c]) < 100: + n += 1 + message = message + '\n> ' + c + ': ' + (max([len(x) for x in institutesUnique])-len(c)+n)*' ' + sub_messages[c] + message += '\n\nNB: Castor completion rates can be unreliable as these are only updated after records are opened/edited.```' send_message_to_slack(message) print(message) - + 
first_update = False - + else: print('No new entries found; try again in 5 minutes') time.sleep(60*5) # 5 minute updates, only if count increases by 10 or more. @@ -106,4 +132,3 @@ def send_message_to_slack(text): time.sleep(60) # in case of errors; wait 1 minute and try again. - \ No newline at end of file diff --git a/xgb_coefs.npy b/xgb_coefs.npy new file mode 100644 index 0000000000000000000000000000000000000000..05cb0e6153998c55ee7b1e1489e79027835d83cf GIT binary patch literal 1028 zcmbV~{ZkVJ9L5=2Fkl?a#Woo*6mT$y<4qXc_Zbs>36D(yDZG=E8+h}jNgH(WQX0~M z<`toEMu8nsT7X`(1tca21e!09VQ{dxD9AMik<$uD!~TRmzdfJdp68y^noRZ1-3+H! zoJu6Qc}A^4k`O4N{V@_*pk)6MLxCYjzxRkCH}9$6nseBgx8jYuocz3%92zeRjM^9` z3w$l`|Kffjc9II4&*6bpf!m`3jH?Cc8CnNnu?l;%p169J57o&iWZmJQ`p-wUR1ew66c+~1*Of45*yxNc6h5>D{inFlUQWRGO;#$)bzsSQp0u;h zNU;Q?MztB;vq{KsnIz*h3*S9kfS6HNMDnwdc20#a%0lq&OKY(ve3p8SyFqh;31_1S z=E7c54F5yP>0;;yEyVrEPWFT?X!^_#{uYh$twJGkvfJszs2JsMhu~I&6nz_dNl@s`?mDTP87ffh3d$0``ezsH8D`#ee9U9me#g!dx`h`V;2ew;6%k=C!sX@!Bd z7PJiXP?A`rJg_gFzP@TFu5=aHg#uK@=utL4M>B~WJU{mWF8$^KUHEz&PmP0@{1ei) znaHa}f-f5w@VPTj8U+KoW-EOh>Vb%09%^RXP`b$zM{8@qk}**6C`ef@8l(L5KgdhZ zfxh1x38htNY^bNAqkKFwBY~jU1M?k1y!en0>(~=I^^Omm*{-;I(G62>VYtp?};oBy4xsO+6YT*7l{_ql<`@&skeHH3Wi;w>vF-x#%p-m^*tdQTbpO7 z(?1TT3U|!Qo$yV9gUVG5G$ry-m!6HJ$AfgH?l0=y_0xa*EKJl=v3`aQPPb9~$yOL% z7GZsn0~fAerp5cdI7>V*#3Ixh6qqpmN@>w799{}R3)>Tk`$BPuxec6d85Z08apeFL z%n=u?zQsjI78jK}Soq*uC$uV`(4u5Du4sOz$xn^&fiG0kk5OCHLwm>BDC!iTN*#nf z>(VhdJxCv>CZj4^j$*qXhPJPKr+qVy$;D_e2+`_ip-Qed(hszJmjmIU z5fUw1X*Oe+CVNW|baRoWSD8_>#KOI6zK})-Kz#E#$Ul!nfXy93vjDH^cHp!+0B<~0 w!TUxyv^5^ctYgBF#zTtf0o`yckY;BvL Date: Tue, 16 Jun 2020 12:35:27 +0200 Subject: [PATCH 2/2] fix path --- covid19_ICU_admission.py | 4 +- results/loho_pm_n2283_y516.0_coefs.txt | 90 ++++++++++++------------- xgb_coefs.npy | Bin 1028 -> 1028 bytes 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/covid19_ICU_admission.py b/covid19_ICU_admission.py index f7ec90a..3b36bd0 100644 --- a/covid19_ICU_admission.py +++ 
b/covid19_ICU_admission.py @@ -418,10 +418,10 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, test_set = pd.DataFrame(test_set,columns=x.columns) f = plt.figure() shap.summary_plot(shap_values, test_set) - f.savefig(os.path.join( model.save_path ,"summary_plot1.png"), bbox_inches='tight', dpi=600) + f.savefig((model.save + "_summary_plot1.png"), bbox_inches='tight', dpi=600) f = plt.figure() shap.summary_plot(shap_values, test_set, plot_type="bar") - f.savefig(os.path.join( model.save_path ,"summary_plot2.png"), bbox_inches='tight', dpi=600) + f.savefig((model.save_path + "_summary_plot2.png"), bbox_inches='tight', dpi=600) if not save_figures: plt.show() diff --git a/results/loho_pm_n2283_y516.0_coefs.txt b/results/loho_pm_n2283_y516.0_coefs.txt index 232eb50..6ac06c8 100644 --- a/results/loho_pm_n2283_y516.0_coefs.txt +++ b/results/loho_pm_n2283_y516.0_coefs.txt @@ -1,45 +1,45 @@ -[[0.02724774 0.07796375 0.04514499 0.037072 0.03693948 0.04072643 - 0.04699144 0.03451858 0.03676676 0.04358888 0.03437743 0.05223805 - 0.04450634 0.03076349 0.02880035 0.04174154 0.03355281 0.02878598 - 0.04131289 0.04650693 0.02872644 0.04124029 0.03936394 0.03553464 - 0.04558886] - [0.02852098 0.11933233 0.02049731 0.03196372 0.0402501 0.04791315 - 0.04758906 0.0272247 0.03244538 0.05959017 0.03316517 0.05397624 - 0.04702694 0.04040785 0.03965527 0.02883011 0.03357292 0.03211337 - 0.03795148 0.02385984 0.03098355 0.03889681 0.02707311 0.02900842 - 0.0481519 ] - [0.03824867 0.20201536 0.03722147 0.02238339 0.03902486 0.04041912 - 0.04165535 0.02382601 0.03506192 0.03819693 0.03310945 0.04155134 - 0.04301092 0.03435294 0.03162336 0.03340763 0.02047763 0.03296867 - 0.03788981 0.02431517 0.02674964 0.01537701 0.02278797 0. 
- 0.08432537] - [0.02365478 0.15148643 0.01284037 0.02902209 0.03568465 0.03697684 - 0.05815557 0.0291574 0.03424063 0.05271753 0.03415936 0.04179135 - 0.0447507 0.02028611 0.02205772 0.04093557 0.0316455 0.02927355 - 0.03154784 0.02852412 0.03397431 0.03592318 0.03366717 0.03448628 - 0.07304095] - [0.03153612 0.16012011 0.02650238 0.02958032 0.03428465 0.03534453 - 0.07054944 0.01924068 0.03674266 0.04099085 0.03405198 0.03728574 - 0.03685509 0.03123184 0.03651385 0.03296662 0.03361399 0.04126919 - 0.03322199 0.02767796 0.02392729 0.02306266 0.02242733 0.02182049 - 0.07918217] - [0.02297538 0.2050213 0.02518855 0.02765787 0.03315873 0.03289452 - 0.09503685 0. 0.03322658 0.02815784 0.04470608 0.03382082 - 0.03129571 0.02981318 0.03155567 0.03594936 0.05403385 0.0245353 - 0.03034715 0.02750624 0. 0.0184159 0.02812885 0.02183791 - 0.0847363 ] - [0.03794999 0.11937173 0.02292331 0.03781984 0.03594646 0.03787136 - 0.04503115 0.02828448 0.0322755 0.04052298 0.0344321 0.04110741 - 0.04858724 0.04217534 0.03850076 0.03208326 0.03291799 0.03559599 - 0.03548445 0.03230241 0.03144418 0.03115782 0.03739164 0.02933442 - 0.05948817] - [0.03765591 0.1244607 0.02361755 0.03341233 0.03704211 0.03986889 - 0.05130611 0.0245237 0.0476805 0.04277378 0.03831241 0.03791314 - 0.04802676 0.03886366 0.03721868 0.02699121 0.03517297 0.03607577 - 0.03673963 0.02632892 0.03438185 0.02605204 0.0285626 0.02555271 - 0.06146609] - [0.03061492 0.08841494 0.03246649 0.03778347 0.03862301 0.04902279 - 0.04338187 0.03391563 0.03703794 0.05063674 0.0386482 0.04710913 - 0.04242903 0.03407801 0.0321296 0.03596683 0.02974905 0.02953567 - 0.04011565 0.03769885 0.02688617 0.03670201 0.03521456 0.04155528 - 0.05028412]] \ No newline at end of file +[[0.02900513 0.10485817 0.0189172 0.03721691 0.03553807 0.04540571 + 0.04054026 0.03142751 0.03802205 0.04737291 0.03735742 0.04617571 + 0.04504537 0.03630833 0.03214922 0.04576809 0.03009612 0.02972055 + 0.04109957 0.03762121 0.02534612 0.03589006 0.03655426 
0.03809978 + 0.05446427] + [0.03371864 0.07376102 0.03255043 0.03968529 0.03761422 0.04802226 + 0.04134564 0.03310054 0.03776442 0.05359318 0.03574292 0.04927718 + 0.04543485 0.03519416 0.03937469 0.03208647 0.03053788 0.03389869 + 0.04050232 0.03536687 0.03212772 0.04184384 0.03644386 0.03788628 + 0.04312659] + [0.03507941 0.15744089 0.02616969 0.02849024 0.03828554 0.04440631 + 0.05145083 0.02374118 0.03698253 0.03523904 0.03241921 0.04719491 + 0.03995088 0.03506144 0.04007663 0.02934017 0.03156112 0.03145391 + 0.03467921 0.02535327 0.02256914 0.02781971 0.02548802 0.03285579 + 0.06689094] + [0.03372161 0.08439521 0.0289013 0.04076211 0.03650614 0.05089137 + 0.03374405 0.04437228 0.03895336 0.05374929 0.03740871 0.04490269 + 0.04559904 0.02854626 0.03117101 0.03326504 0.02982679 0.03199616 + 0.0395859 0.02939419 0.02762785 0.04320026 0.04013652 0.04495482 + 0.04638802] + [0.02926653 0.24708247 0. 0.02734006 0.03898028 0.03827675 + 0.08777084 0. 0.0423913 0.0383439 0.02220213 0.04523094 + 0.05577661 0.04846299 0. 0.02791958 0. 0.02541763 + 0.03712561 0.02293024 0.02168907 0.01740101 0.03307582 0. + 0.0933162 ] + [0.029925 0.20497677 0. 
0.02398588 0.03220888 0.03252695 + 0.08346679 0.01035944 0.03922718 0.02866674 0.03422621 0.03407473 + 0.03592969 0.03183847 0.02384912 0.02781185 0.04592736 0.04671792 + 0.03090115 0.0223245 0.01321638 0.01738103 0.02445605 0.027792 + 0.09820986] + [0.03334145 0.12808916 0.0167826 0.03656108 0.03405709 0.0379472 + 0.05089486 0.02370403 0.0318248 0.03878355 0.03259754 0.0418744 + 0.04873329 0.04059746 0.04309903 0.0293893 0.03169436 0.04251581 + 0.03647418 0.02938977 0.02605443 0.02558089 0.03959654 0.03090884 + 0.06950831] + [0.03093497 0.06281067 0.04603253 0.03931557 0.0433754 0.04624838 + 0.04940653 0.03764849 0.04165215 0.05599385 0.03610231 0.03991234 + 0.0440371 0.03005461 0.03021684 0.03320304 0.03827928 0.03343385 + 0.03905055 0.03715051 0.03372669 0.03598797 0.03964458 0.03446046 + 0.04132135] + [0.02725543 0.10402164 0.03387656 0.03820056 0.04408296 0.0424456 + 0.04272338 0.03363778 0.03688686 0.0500395 0.03885584 0.04751011 + 0.04281425 0.03273204 0.02841813 0.03399113 0.03231791 0.02896029 + 0.04179978 0.03131918 0.02726615 0.03499264 0.03622282 0.03890293 + 0.05072653]] \ No newline at end of file diff --git a/xgb_coefs.npy b/xgb_coefs.npy index 05cb0e6153998c55ee7b1e1489e79027835d83cf..170c305cff601b26af7af71531ab48c9bd272e88 100644 GIT binary patch delta 914 zcmW;CeM}St9L90r4mgf?@dA#Ei{s7nl9vK`@%KHDF_dB|k%=cSX`qplD3+L(2Mwl7 zM95y$s6Vh8I)Yi2Z%gEAh(rN7C}v53x-cjdB4+`K&!6AtQ>m&{)s?NG`)6&$~Bd^)}Fn&%$m4}WAO+K|cHA=7dIhy5&+d%_)-dg59PsQD0wDR&h&RlQN zQdPN$$G0tFXds;%{)t=)_abLi5ZWCI-d%cMHV&$>-3(>$mk0fbf>#Sn7}GR-8WF%- zpG2{v)sx8hAal>=0m&&ClZ-wcFO0^bz4#2jzG0Ku-M;ib=Y>O;lG3#+sXiXedkt1k z3d2MA&N3;IX3vlPc67OV&~&Ot#zxMH>DH*2yL+YUuoHoE`)Ssz+4@s6MYeG<&4f|5 zPs8h_>xhX{bGj{-+JkzQFUesz!Jm;o;_&Q9WN>gyES7m0Pgaq+VOokj6kH84py{5J zZfn+v*zQL0#MTgM4DsaJo3UK_OQIBSn(ZDe<=&87IiW|>spHA^FIZ^S#${MN9z}+~ z6isRZ%AExSRir)qj-0t63+ChaUUByJBl_nHlJ?mtdAp`oDt9|SH1)bRR)_vA#Wyp} z)&;EE`>pJpP;fudk>ismX*zF|mgdK(7`QE$qujVt?~2mogL{z!EAqdQ*8WjhA1FM% 
zDT(bNvvPg-E2*l^mJ?fQ#Vfm0+?q!v*>DB7dKEty73QWr)#B5xq15WizTyB3W$T!Z zu*o@#gO&1Yf!I|!Vy!fBs9_aV>0x}hKZ>jElQPw3k7sBE(}fy(Y?Iop7OUIvgRFqBW2{QRHAEHM8q7OM+47?QP&f#PuU1>v-;nGj2O-=MJ z9qa32D7m(bGjkDiTy~-55ntkFGRO$mGjlYB`P@iUi<}7l{&z`Fbfq(2NsxI$l9!t} au-cB)+#xA>)|1Vyd5dYy*7H%vTK)%*JW=KV delta 914 zcmW-f{ZkVJ9ETZOY-8hKF1E>lodSXr9Eic>zMrwcUhrVX(4spryY&NV?TnYZK&}z4 z1I;TU#~B57L}{_ft}%$5RzRBd{0Gl-kC9{Kv^E+}<#?#J z_XO_xmAE!3#&n4o&arin)T^;e7lrf3MNl8vg!1crwEejgMa4PjXRc7+e1JCoEr&(V zMcY__Y`N5pVECv2y0dw>)#In8S3+3!pCsK}IEviKFsZjgcnvmBvXLdKK(CVnv6Me9^yiCn2GReDeSc_QjY&ag<2W( zqizy>8l+&}Q|S3J9tm!Z>YZ8%Uash)!;>-?-%Y{QE;)uax=7p^4P8w9Ha@$w-ok>5vv*I5hoSMq^iAp zwDjHiAPMBFz^N6ZWwRbl)AQuZ=i~YL7jWh`9x5|7;81Zcq7|Q$&T6A*b1J^+Vj%YV z0%?>CRQCGmGea5=naM(!d|Wg<83jYT39NJmn(rs6jM6DOp#6iQ^?c}uW02R-itesX zsxyf2m@gILdL9<~CD?sW1pm}SI{IEL!Z~c*Jk7-{cRel%Ir!NzOLFEA-3dDA?SVe} zWzb56@@}|u93)*VRT;LFUn5uBEY*y&QR#@l>Fx{o&-OzJ2T%9h(U_=e5$7BNkM>F2#nr5Kdh>OG|g+ zaGZo-$fRhmQewvTE0t!m40v@p1!hhZ^7o`+Kl2&z2h*`+O~ARmOfV-Ru==V1DdhsR zY-i!)@55kGJ*1`7)i|g5ojhMuBNlN`dmo^^&Pgv#b5J)RMr%nDcCFK5-aAU46c?g3 zTY>suJjS*?`loLz8Wl2hRY_opH@K-q5Cg@K5Zf9=aGv>rR&Mek**`(j6+g|Djg!Z9 z5J^{-$h*o8(=rRUFT_Efoe0^LXQB8i3yFb9NbF*~QMnx-lqBM