From 88a55392be10c02d1741e2a975e96f9eba1db649 Mon Sep 17 00:00:00 2001 From: laramos Date: Tue, 16 Jun 2020 12:23:15 +0200 Subject: [PATCH 1/2] shap features --- .gitignore | 6 +- Classifiers/XGB.py | 421 +++++++++------- Classifiers/gradboost.py | 50 -- Classifiers/logreg.py | 298 ++++++----- covid19_ICU_admission.py | 200 ++++++-- covid19_ICU_util.py | 214 ++++++-- .../ALL_castor2excel_variables.py | 166 +++++-- .../ALL_completeness_category.py | 76 +++ .../ALL_convert_to_age_in_years.py | 2 +- data_query_scripts/survival_death.py | 199 ++++++-- data_query_scripts/survival_death_km.py | 350 +++++++++++++ error_replacements.py | 4 +- generate_figures.py | 469 ++++++++++++++++++ get_feature_set.py | 6 +- lr_coefs.npy | Bin 0 -> 1928 bytes requirements.txt | 4 +- results/loho_pm_class_dist_per_hosp.txt | 10 + results/loho_pm_n2283_y516.0_coefs.txt | 45 ++ unit_lookup.py | 8 +- update_slack/covid19_update_slack_all.py | 101 ++-- xgb_coefs.npy | Bin 0 -> 1028 bytes 21 files changed, 2047 insertions(+), 582 deletions(-) delete mode 100644 Classifiers/gradboost.py create mode 100644 data_query_scripts/ALL_completeness_category.py create mode 100644 data_query_scripts/survival_death_km.py create mode 100644 generate_figures.py create mode 100644 lr_coefs.npy create mode 100644 results/loho_pm_class_dist_per_hosp.txt create mode 100644 results/loho_pm_n2283_y516.0_coefs.txt create mode 100644 xgb_coefs.npy diff --git a/.gitignore b/.gitignore index eee52b3..d8a6989 100644 --- a/.gitignore +++ b/.gitignore @@ -146,4 +146,8 @@ covid19_variables.xlsx *.xlsx *.xls *.png -.vscode/ \ No newline at end of file +.vscode/ +*.zip +*.gif +*.tiff +*.pxm diff --git a/Classifiers/XGB.py b/Classifiers/XGB.py index 370badc..cca853d 100644 --- a/Classifiers/XGB.py +++ b/Classifiers/XGB.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ Created on Thu Apr 30 13:25:50 2020 @@ -40,7 +39,7 @@ from xgboost import XGBClassifier from sklearn.feature_selection import SelectKBest from 
sklearn.preprocessing import PolynomialFeatures -#import shap +import shap from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import StratifiedShuffleSplit from sklearn.feature_selection import SelectFromModel @@ -53,6 +52,7 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import RobustScaler from sklearn.linear_model import LogisticRegression from sklearn.model_selection import cross_val_predict from sklearn.metrics import auc as auc_calc @@ -83,22 +83,30 @@ def __init__(self): ''' self.goal = None + self.data_struct = None + self.model_args = { + 'imputer': 'iterative', # Simple, iterative, forest 'add_missing_indicator': False, + 'apply_polynomials': False, + 'apply_feature_selection': False, + 'n_features': 10, 'apply_pca': False, 'pca_n_components': 3, - - 'select_features': False, - 'n_best_features': 10, - 'plot_feature_graph': True, - 'grid_search': True } + self.grid = { + 'XGB__learning_rate': ([0.1, 0.01, 0.001]), + 'XGB__gamma': ([0.1, 0.01, 0.001]), + 'XGB__n_estimators': ([100, 200, 300, 500, 700]), + 'XGB__subsample': ([0.5, 0.7, 0.9]), + 'XGB__colsample_bytree': ([0.5, 0.6, 0.7, 0.8]), + 'XGB__max_depth': ([2, 4, 6,8])} + self.score_args = { - 'plot_n_rocaucs': 5, + 'plot_n_rocaucs': 0, 'n_thresholds': 50, - } self.evaluation_args = { @@ -111,29 +119,19 @@ def __init__(self): self.intercepts = [] self.n_best_features = [] + self.trained_classifiers = [] + self.learn_size = [] - self.grid = { - 'XGB__learning_rate': ([0.1, 0.01, 0.001]), - 'XGB__gamma': ([0.1, 0.01, 0.001]), - 'XGB__n_estimators': ([100, 200, 300, 500, 700]), - 'XGB__subsample': ([0.5, 0.7, 0.9]), - 'XGB__colsample_bytree': ([0.5, 0.6, 0.7, 0.8]), - 'XGB__max_depth': ([2, 4, 6,8])} - - #self.grid = { - #'XGB__n_estimators': ([200,400,500,600,800,1000,1200,1400]), - #'XGB__max_features': (['auto', 'sqrt', 'log2']), # precomputed,'poly', 
'sigmoid' - #'XGB__max_depth': ([10,20,30,40, 50, 60, 70, 80, 90, 100, None]), - #'XGB__criterion': (['gini', 'entropy']), - #'XGB__min_samples_split': [2,4,6,8], - #'XGB__min_samples_leaf': [2,4,6,8,10]} - self.save_path = '' self.fig_size = (1280, 960) self.fig_dpi = 600 + self.random_state = 0 - + self.save_prediction = True + self.hospital = pd.Series() + self.days_until_outcome = pd.Series() + def define_imputer(self,impute_type): '''Initialize the imputer to be used for every iteration. @@ -146,18 +144,16 @@ def define_imputer(self,impute_type): if impute_type=='simple': self.imputer = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=self.model_args['add_missing_indicator']) - else: - if impute_type=='iterative': - self.imputer = IterativeImputer(missing_values=np.nan, initial_strategy='median', + elif impute_type=='iterative': + self.imputer = IterativeImputer(missing_values=np.nan, initial_strategy='median', add_indicator=self.model_args['add_missing_indicator']) - else: - if impute_type=='forest': - self.imputer = MissForest(random_state=self.random_state,n_jobs=-2) + elif impute_type=='forest': + self.imputer = MissForest(random_state=self.random_state,n_jobs=-2) def get_pipeline(self): - self.define_imputer('iterative') + self.define_imputer(self.model_args['imputer']) steps = [('imputer', self.imputer), - ('scaler', MinMaxScaler())] + ('scaler', RobustScaler())] if self.model_args['apply_polynomials']: steps += [('polynomials', PolynomialFeatures(interaction_only=True))] @@ -176,11 +172,7 @@ def get_pipeline(self): for key in keys: del self.grid[key] - steps += [('LR', LogisticRegression(solver='saga', - penalty='elasticnet', #class_weight='balanced', - l1_ratio=.5, - max_iter=200, - random_state=self.random_state))] + steps += [('XGB', XGBClassifier(random_state=self.random_state))] return Pipeline(steps) def train(self, datasets): @@ -215,43 +207,63 @@ def train(self, datasets): self.learn_size += [{'tr_x': train_x.shape, 'tr_y': 
train_y.shape, 'te_x': test_x.shape, 'te_y': test_y.shape}] - n_best_features = self.model_args['n_best_features'] - plot_feature_graph = self.model_args['plot_feature_graph'] - train_x = self.impute_missing_values(train_x) test_x = self.impute_missing_values(test_x) + # Define pipeline self.pipeline = self.get_pipeline() # Model and feature selection # TODO ideally also the feature selection would take place within a CV pipeline - if self.model_args['select_features']: - if not any(self.n_best_features): - self.importances = self.feature_contribution(clf, train_x, train_y, - plot_graph=plot_feature_graph) - self.n_best_features = np.argsort(self.importances)[-n_best_features:] - print('Select {} best features:\n{}'.format(self.model_args['n_best_features'], - list(train_x.columns[ - self.n_best_features]))) - train_x = train_x.iloc[:, self.n_best_features] - test_x = test_x.iloc[:, self.n_best_features] if self.model_args['grid_search']: # print("Train classfier using grid search for best parameters.") cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=self.random_state) - grid = RandomizedSearchCV(self.pipeline, param_grid=self.grid, cv=cv, - scoring='roc_auc', n_jobs=-2) + grid = RandomizedSearchCV(self.pipeline, param_distributions=self.grid, cv=cv, + scoring='roc_auc', n_jobs=-2, n_iter=50) grid.fit(train_x, train_y) clf = grid.best_estimator_ + self.trained_classifiers += [clf] # print("Best estimator: ", clf) else: # Train classifier without optimization. 
clf = self.pipeline clf.fit(train_x, train_y) + self.coefs.append(clf['XGB'].feature_importances_) + test_y_hat = clf.predict_proba(test_x) # Predict - self.coefs.append(clf['XGB'].feature_importances_) + if 'feature_selection' in clf.named_steps: + columns = train_x.columns[np.argsort(clf.named_steps\ + .feature_selection\ + .pvalues_)][0:self.model_args['n_features']].to_list() + self.n_best_features += [columns] + print(columns) + else: + columns = train_x.columns + + idx_train = train_x.index + idx_test = test_x.index + + if self.model_args['add_missing_indicator']: + missing_cols = columns.to_list()\ + + ['{}_nan'.format(c) + for c in train_x.loc[:, train_x.isna().any()]] + + train_x = pd.DataFrame(clf[:-1].transform(train_x)) + test_x = pd.DataFrame(clf[:-1].transform(test_x)) + + if self.model_args['add_missing_indicator']: + train_x.columns = missing_cols + test_x.columns = missing_cols + else: + train_x.columns = columns + test_x.columns = columns + + train_x.index = idx_train + test_x.index = idx_test + datasets = {"train_x": train_x, @@ -259,7 +271,10 @@ def train(self, datasets): "train_y": train_y, "test_y": test_y} - return clf, datasets, test_y_hat + explainer = shap.TreeExplainer(clf['XGB']) + shap_values = explainer.shap_values(test_x) + + return clf, datasets, test_y_hat, shap_values, test_x def score(self, clf, datasets, test_y_hat, rep): ''' Scores the individual prediction per outcome. @@ -308,7 +323,7 @@ def score(self, clf, datasets, test_y_hat, rep): return score - def evaluate(self, clf, datasets, scores): + def evaluate(self, clf, datasets, scores, hospitals=None): ''' Evaluate the results of the modelling process, such as, feature importances. 
NOTE: No need to show plots here, plt.show is called right @@ -329,9 +344,15 @@ def evaluate(self, clf, datasets, scores): cms = [score['conf_mats'] for score in scores] thresholds = [score['thr'] for score in scores] + if self.model_args['apply_feature_selection']: + self.vote_best_featureset() + self.analyse_fpr(cms, thresholds) - fig, ax = self.plot_model_results([score['roc_auc'] for score in scores]) - fig2, ax2 = self.plot_model_weights(datasets['test_x'].columns, clf, + fig, ax = self.plot_model_results([score['roc_auc'] for score in scores], + hospitals=hospitals) + if not self.model_args['apply_feature_selection']\ + and not self.model_args['add_missing_indicator']: + fig2, ax2 = self.plot_model_weights(datasets['test_x'].columns, clf, show_n_features=self.evaluation_args['show_n_features'], normalize_coefs=self.evaluation_args['normalize_coefs']) if self.save_prediction: @@ -340,6 +361,9 @@ def evaluate(self, clf, datasets, scores): def impute_missing_values(self, data, missing_class=-1): data = data.copy() # Prevents copy warning + vars_binary = get_fields_per_type(data, self.data_struct, 'radio') + data.loc[:, vars_binary] = data[vars_binary].fillna(0, axis=0) + # Categorical vars_categorical = get_fields_per_type(data, self.data_struct, 'category') data.loc[:, vars_categorical] = data[vars_categorical].fillna(missing_class, axis=0) @@ -353,85 +377,29 @@ def impute_missing_values(self, data, missing_class=-1): # data = data.fillna(0).astype(float) return data - def add_engineered_features(self, train_x, test_x): - ''' Generate and add features''' - # TODO: Add PCA to preprocessing pipeline to avoid leakage in CV. 
- columns = train_x.columns - train_x = train_x.reset_index(drop=True) - test_x = test_x.reset_index(drop=True) - - scaler = MinMaxScaler().fit(train_x) - # scaler = StandardScaler().fit(train_x) - train_x = pd.DataFrame(scaler.transform(train_x), columns=columns) - test_x = pd.DataFrame(scaler.transform(test_x), columns=columns) - - # Dimensional transformation - if self.model_args['apply_pca']: - train_x, test_x = self.get_pca(train_x, test_x) - - return train_x, test_x - - def get_pca(self, train_x, test_x): - pca_col_names = ['PC {:d}'.format(i+1) for i in range(self.model_args['pca_n_components'])] - pca = PCA(n_components=self.model_args['pca_n_components']) - pca = pca.fit(train_x) - pcs_train = pd.DataFrame(pca.transform(train_x), columns=pca_col_names) - pcs_test = pd.DataFrame(pca.transform(test_x), columns=pca_col_names) - train_x = pd.concat([train_x, pcs_train], axis=1) - test_x = pd.concat([test_x, pcs_test], axis=1) - - return train_x, test_x - - def feature_contribution(self, clf, x, y, plot_graph=False, plot_n_features=None, - n_cv=2, method='predict_proba'): - - plot_n_features = x.shape[1] if not plot_n_features else plot_n_features - y_hat = cross_val_predict(clf, x, y, cv=n_cv, method=method) - baseline_score = roc_auc_score(y, y_hat[:, 1]) - - importances = np.array([]) - - for col in x.columns: - x_tmp = x.drop(col, axis=1) - y_hat = cross_val_predict(clf, x_tmp, y, cv=n_cv, method=method) - score = roc_auc_score(y, y_hat[:, 1]) - importances = np.append(importances, baseline_score-score) - - if plot_graph: - idc = np.argsort(importances) - columns = x.columns[idc] - fig, ax = plt.subplots(1, 1) - ax.plot(importances[idc[-plot_n_features:]]) - ax.axhline(0, color='k', linewidth=.5) - ax.set_xticks(np.arange(x.shape[1])) - ax.set_xticklabels(columns[-plot_n_features:], rotation=90, fontdict={'fontsize': 6}) - ax.set_xlabel('Features') - ax.set_ylabel('Difference with baseline') - fig.tight_layout() - - return importances - - def 
plot_model_results(self, - aucs): # , classifier='Logistic regression', outcome='ICU admission'): - + def plot_model_results(self, aucs, hospitals=[]): # , classifier='Logistic regression', outcome='ICU admission'): avg = sum(aucs) / max(len(aucs), 1) std = sqrt(sum([(auc - avg) ** 2 for auc in aucs]) / len(aucs)) sem = std / sqrt(len(aucs)) fig, ax = plt.subplots(1, 1) ax.plot(aucs) - ax.set_title('{}\nROC AUC: {:.3f} \u00B1 {:.3f} (95% CI)' - .format('XGB', avg, sem * 1.96)) + # ax.set_title('{}\nROC AUC: {:.3f} \u00B1 {:.3f} (95% CI)' + # .format('Logistic Regression', avg, sem * 1.96)) + ax.set_title('{}\nAUC: {:.2f} ({:.2f} to {:.2f}) (95% CI)' + .format('XGB', avg, avg-(sem*1.96), avg+(sem*1.96))) ax.axhline(sum(aucs) / max(len(aucs), 1), color='g', linewidth=1) ax.axhline(.5, color='r', linewidth=1) ax.set_ylim(0, 1) - ax.set_xlabel('Iteration') + ax.set_xlabel('Test Fold') + if any(hospitals): + ax.set_xticks(list(range(hospitals.size))) + ax.set_xticklabels(hospitals) ax.set_ylabel('AUC') - ax.legend(['ROC AUC', 'Average', 'Chance level'], bbox_to_anchor=(1, 0.5)) + ax.legend(['AUC', 'Average', 'Chance level'], bbox_to_anchor=(1, 0.5)) fig.tight_layout() fig.savefig(self.save_path + '_Performance_roc_auc_{}_{}.png'.format(avg, sem * 1.96), figsize=self.fig_size, dpi=self.fig_dpi) - return fig, ax def analyse_fpr(self, cms, thresholds): @@ -497,72 +465,115 @@ def analyse_fpr(self, cms, thresholds): fig.savefig(self.save_path + '_average_roc.png', figsize=self.fig_size, dpi=self.fig_dpi) - def plot_model_weights(self, feature_labels, clf, - show_n_features=10, normalize_coefs=False): - if self.model_args['add_missing_indicator']: - # ADD featuere labels for missing feature indicators - feature_labels = feature_labels.to_list() \ - + ['m_id_{}'.format(feature_labels[i])\ - for i in clf.named_steps.imputer.indicator_.features_] - - coefs = self.coefs - intercepts = self.intercepts - coefs = np.array(coefs).squeeze() - intercepts = 
np.array(intercepts).squeeze() + def plot_model_weights(self, feature_labels, clf, + show_n_features=10, normalize_coefs=False): - if len(coefs.shape) <= 1: - return + feature_labels = self.get_feature_labels(feature_labels, clf) - show_n_features = coefs.shape[1] if show_n_features is None else show_n_features + # FIXME + if self.model_args['apply_pca']: + print('UNEVEN FEATURE LENGTH. CANT PLOT WEIGHTS') + return None, None - coefs = (coefs - coefs.mean(axis=0)) / coefs.std(axis=0) if normalize_coefs else coefs + coefs = self.coefs + intercepts = self.intercepts + coefs = np.array(coefs).squeeze() + intercepts = np.array(intercepts).squeeze() - avg_coefs = abs(coefs.mean(axis=0)) - mask_not_nan = ~np.isnan(avg_coefs) # Remove non-existent weights - avg_coefs = avg_coefs[mask_not_nan] + if len(coefs.shape) <= 1: + return - var_coefs = coefs.var(axis=0)[mask_not_nan] if not normalize_coefs else None - idx_sorted = avg_coefs.argsort() - n_bars = np.arange(avg_coefs.shape[0]) + np.save('xgb_coefs.npy', coefs) + + with open(self.save_path + '_coefs.txt', 'w') as f: + f.write('{}'.format(coefs)) + + show_n_features = coefs.shape[1] if show_n_features is None else show_n_features + + odds = np.exp(coefs) + odds_avg = odds.mean(axis=0)-1 + odds_var = odds.var(axis=0) + + idx_sorted = odds_avg.argsort() + n_bars = np.arange(odds_avg.size) + labels = np.array([self.var_dict.get(c, c) for c in feature_labels]) + fontsize = 5.75 if labels.size < 50 else 5 + bar_width = .5 # bar width + + fig, ax = plt.subplots() + ax.set_title('XGBoost - Odds ratios') + ax.barh(n_bars, odds_avg[idx_sorted], bar_width, xerr=odds_var[idx_sorted], + label='Weight') + ax.set_yticks(n_bars) + ax.set_yticklabels(labels[idx_sorted], fontdict={ 'fontsize': fontsize }) + ax.set_ylim(n_bars[0] - .5, n_bars[-1] + .5) + ax.set_xlabel('Odds ratio') + fig.tight_layout() + fig.savefig(self.save_path + '_Average_weight_variance.png', + figsize=self.fig_size, dpi=self.fig_dpi) + return fig, ax + + def 
get_feature_labels(self, labels, clf): + steps = clf.named_steps.keys() + labels = np.array(labels) - labels = [self.var_dict.get(c) for c in feature_labels] - has_no_label = labels for i, l in enumerate(labels): if l == None: - labels[i] = feature_labels[i] + labels[i] = labels[i] elif 'chronic cardiac disease' in l.lower(): labels[i] = 'Chronic Cardiac Disease (Not hypertension)' elif 'home medication' in l.lower(): labels[i] = 'Home medication' - labels = np.array(labels) - fontsize = 5.75 if labels.size < 50 else 5 - bar_width = .5 # bar width - fig, ax = plt.subplots() - if normalize_coefs: - ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, label='Weight') - ax.set_title('XGB - Average weight value') - else: - ax.set_title('XGB - Average weight value') - ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, xerr=var_coefs[idx_sorted], - label='Weight') - ax.set_yticks(n_bars) - ax.set_yticklabels(labels[idx_sorted], fontdict={ 'fontsize': fontsize }) - ax.set_ylim(n_bars[0] - .5, n_bars[-1] + .5) - ax.set_xlabel('Weight{}'.format(' (normalized)' if normalize_coefs else '')) - fig.tight_layout() - fig.savefig(self.save_path + '_Average_weight_variance.png', - figsize=self.fig_size, dpi=self.fig_dpi) - return fig, ax + # NOTE: use loop over steps to be able to switch order + if self.model_args['add_missing_indicator']: + labels = labels.to_list() \ + + ['m_id_{}'.format(labels[i])\ + for i in clf.named_steps.imputer.indicator_.features_] + + if 'polynomials' in steps: + # N_features = n_features (n) + # + n_combinations_without_repeat (k) + # + bias (if true) + # = n + (n!)/(k!(n-k)!) 
+ 1 + + labels = np.array(clf.named_steps.polynomials.get_feature_names(labels)) + + if 'feature_selection' in steps: + # TODO: check if also works with adding_missing_indicators + k = self.model_args['n_features'] + labels = labels[np.argsort(clf.named_steps\ + .feature_selection\ + .pvalues_)[0:k]] + + return labels + return labels + + def vote_best_featureset(self): + nansum = lambda x: sum([i for i in x if str(i)!='nan']) + # Get list by voting. i.e sorted list with most occurences + columns_list = [sorted(fset[0]) for fset in self.n_best_features] + result = pd.DataFrame(self.n_best_features, columns=['columns', 'fvalues', 'pvalues']) + result['fsum'] = result['fvalues'].apply(nansum) + result['psum'] = result['pvalues'].apply(nansum) + result['columns'] = result['columns'].apply(sorted) + result.to_excel('_xgb_feature_selection_results.xlsx') + + counts = pd.Series(columns_list).value_counts() + counts.to_excel(self.save_path + '_xgb_feature_selection_votes.xlsx') + print('votes={} for {}'.format(counts.iloc[0], counts.index[0])) def save_prediction_to_file(self, scores): - x = pd.concat([score['x'] for score in scores], axis=0).reset_index(drop=True) - y_hat = pd.concat([pd.Series(score['y_hat']) for score in scores], axis=0) \ - .reset_index(drop=True) - y = pd.concat([score['y'] for score in scores], axis=0).reset_index(drop=True) - - df = pd.concat([x, y, y_hat], axis=1) - df.columns=list(x.columns)+['y', 'y_hat'] + x = pd.concat([score['x'] for score in scores], axis=0) + y_hat = pd.concat([pd.Series(score['y_hat']) for score in scores], axis=0) + y_hat.index = x.index + y = pd.concat([score['y'] for score in scores], axis=0) + self.hospital.index = x.index + self.days_until_outcome.index = self.days_until_outcome.index + df = pd.concat([x, y, y_hat, + self.hospital, + self.days_until_outcome], axis=1) + df.columns=list(x.columns)+['y', 'y_hat', 'hospital', 'days_until_death'] filename = self.save_path + '_prediction.pkl' df.to_pickle(filename) @@ -607,3 
+618,61 @@ def get_fields_per_type(data, data_struct, type): # ax.set_title('FALSE POSITIVE RATE\n Mean mean and median over FPRs at different\nthresholds for all training iterations\nmean: {:.3f} \u00B1 {:.3f}\nmedian: {:.3f} \u00B1 {:.3f}' # .format(means_mets['mean'], means_mets['ci'], median_mets['mean'], median_mets['ci'])) # fig.savefig('FPR_results.png') + + # def plot_model_weights(self, feature_labels, clf, + # show_n_features=10, normalize_coefs=False): + # if self.model_args['add_missing_indicator']: + # # ADD featuere labels for missing feature indicators + # feature_labels = feature_labels.to_list() \ + # + ['m_id_{}'.format(feature_labels[i])\ + # for i in clf.named_steps.imputer.indicator_.features_] + + # coefs = self.coefs + # intercepts = self.intercepts + # coefs = np.array(coefs).squeeze() + # intercepts = np.array(intercepts).squeeze() + + # if len(coefs.shape) <= 1: + # return + + # show_n_features = coefs.shape[1] if show_n_features is None else show_n_features + + # coefs = (coefs - coefs.mean(axis=0)) / coefs.std(axis=0) if normalize_coefs else coefs + + # avg_coefs = abs(coefs.mean(axis=0)) + # mask_not_nan = ~np.isnan(avg_coefs) # Remove non-existent weights + # avg_coefs = avg_coefs[mask_not_nan] + + # var_coefs = coefs.var(axis=0)[mask_not_nan] if not normalize_coefs else None + # idx_sorted = avg_coefs.argsort() + # n_bars = np.arange(avg_coefs.shape[0]) + + # labels = [self.var_dict.get(c) for c in feature_labels] + # has_no_label = labels + # for i, l in enumerate(labels): + # if l == None: + # labels[i] = feature_labels[i] + # elif 'chronic cardiac disease' in l.lower(): + # labels[i] = 'Chronic Cardiac Disease (Not hypertension)' + # elif 'home medication' in l.lower(): + # labels[i] = 'Home medication' + # labels = np.array(labels) + # fontsize = 5.75 if labels.size < 50 else 5 + # bar_width = .5 # bar width + + # fig, ax = plt.subplots() + # if normalize_coefs: + # ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, label='Weight') 
+ # ax.set_title('XGB - Average weight value') + # else: + # ax.set_title('XGB - Average weight value') + # ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, xerr=var_coefs[idx_sorted], + # label='Weight') + # ax.set_yticks(n_bars) + # ax.set_yticklabels(labels[idx_sorted], fontdict={ 'fontsize': fontsize }) + # ax.set_ylim(n_bars[0] - .5, n_bars[-1] + .5) + # ax.set_xlabel('Weight{}'.format(' (normalized)' if normalize_coefs else '')) + # fig.tight_layout() + # fig.savefig(self.save_path + 'XGB_Average_weight_variance.png', + # figsize=self.fig_size, dpi=self.fig_dpi) + # return fig, ax \ No newline at end of file diff --git a/Classifiers/gradboost.py b/Classifiers/gradboost.py deleted file mode 100644 index b3080e4..0000000 --- a/Classifiers/gradboost.py +++ /dev/null @@ -1,50 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Apr 17 16:27:52 2020 - -@author: rriccilopes -""" - -import matplotlib.pyplot as plt -import pandas as pd -import numpy as np - -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.metrics import roc_auc_score -from sklearn.model_selection import GridSearchCV - -import warnings -from sklearn.exceptions import ConvergenceWarning -warnings.filterwarnings(action='ignore', category=ConvergenceWarning) - - -def train_gradient_boosting(train_x, test_x, train_y, test_y, **kwargs): - """ - - kwargs - ---------- - plot_graph : bool - gridsearch : bool - """ - gridsearch = False if not kwargs.get('gridsearch') else kwargs.get('gridsearch') - - # Initialize Classifier - clf = GradientBoostingClassifier(random_state=0) - - if gridsearch: - grid = { - "n_estimators": [100, 500, 1000, 2000], - "min_samples_split": [2, 4], - "min_samples_leaf": [1, 2, 4], - "max_depth": [None, 2, 3], - "max_features": ['auto'] - } - clf = GridSearchCV(estimator=clf, param_grid=grid, cv=5, n_jobs=-2, scoring='roc_auc') - clf.fit(train_x, train_y) # Model - clf = clf.best_estimator_ - else: - clf.fit(train_x, train_y) - - test_y_hat = 
clf.predict_proba(test_x) # Predict - - return clf, train_x, test_x, train_y, test_y, test_y_hat \ No newline at end of file diff --git a/Classifiers/logreg.py b/Classifiers/logreg.py index 1a9e49d..a2b10d5 100644 --- a/Classifiers/logreg.py +++ b/Classifiers/logreg.py @@ -1,37 +1,38 @@ -''' +''' DO NOT OVERWRITE THIS FILE, MAKE A COPY TO EDIT! This file contains blueprint for the model that is used -in covid19_ICU_admission. +in covid19_ICU_admission. This class should take care of training a classifier/regressor, -scoring each iteration and evaluating post training. +scoring each iteration and evaluating post training. -Make sure the names, inputs and outputs of the predefined methods stay +Make sure the names, inputs and outputs of the predefined methods stay the same! Model parameters: Please store all parameters in the defined dicts in __init__(). -This way it is easy to change some parameters without changing -the main code and without the need for extra config files. +This way it is easy to change some parameters without changing +the main code and without the need for extra config files. Most important naming conventions and variables: Model: The whole class in this file. This means that "model" takes care of training, scoring and evaluating Clf: This is the actually classifier/regressor. It can be for example - the trained instance from the Sklearn package + the trained instance from the Sklearn package (e.g. 
LogisticRegression()) Datasets: dictionary containin all training and test sets: train_x, train_y, test_x, test_y, test_y_hat -..._args: Dictionary that holds the parameters that are used as +..._args: Dictionary that holds the parameters that are used as input for train, score or evaluate - + ''' +import os from math import sqrt - +import shap import matplotlib.pyplot as plt import pandas as pd import numpy as np @@ -39,11 +40,15 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import PolynomialFeatures +from sklearn.pipeline import Pipeline from sklearn.feature_selection import SelectFpr from sklearn.feature_selection import SelectKBest from sklearn.model_selection import cross_val_predict from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.model_selection import RandomizedSearchCV +from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression from sklearn.metrics import auc as auc_calc from sklearn.metrics import roc_curve @@ -51,46 +56,46 @@ from sklearn.metrics import confusion_matrix from sklearn.metrics import plot_confusion_matrix -import os -from sklearn.pipeline import Pipeline + from sklearn.model_selection import GridSearchCV -from sklearn.impute import SimpleImputer import warnings from sklearn.exceptions import ConvergenceWarning +# from sklearn.exceptions import UserWarning +from sklearn.experimental import enable_iterative_imputer # Required to enable experimental iterative imputer from sklearn.impute import IterativeImputer from missingpy import MissForest -# warnings.filterwarnings(action='ignore', category=ConvergenceWarning) +warnings.filterwarnings(action='ignore', category=ConvergenceWarning) +# warnings.filterwarnings(action='ignore', category=UserWarning) class LogReg: def __init__(self): ''' Initialize model. 
- Save all model parameters here. + Save all model parameters here. Please don't change the names of the preset parameters, as they might be called outside - this class. + this class. ''' self.goal = None self.data_struct = None self.model_args = { + 'imputer': 'iterative', # Simple, iterative, forest 'add_missing_indicator': False, - 'apply_polynomials': False, - 'apply_feature_selection': False, 'n_features': 10, - 'apply_pca': False, 'pca_n_components': 3, - 'grid_search': True } self.grid = { - 'PCA__n_components': list(range(1, 11)), - 'LR__l1_ratio': [i / 10 for i in range(0, 11, 1)] + 'LR__l1_ratio': [i / 10 for i in range(0, 11, 2)], + 'LR__C': np.logspace(-4, 4, 10), + 'LR__penalty': ['l2','elasticnet'], + 'LR__max_iter': [100, 300, 500] } self.score_args = { @@ -99,13 +104,15 @@ def __init__(self): self.evaluation_args = { 'show_n_features': None, - 'normalize_coefs': True, + 'normalize_coefs': False, 'plot_analyse_fpr': False} self.coefs = [] self.intercepts = [] self.n_best_features = [] + self.trained_classifiers = [] + self.learn_size = [] self.save_path = '' @@ -113,23 +120,24 @@ def __init__(self): self.fig_dpi = 600 self.random_state = 0 - self.save_prediction = False - self.hospital = None + self.save_prediction = True + self.hospital = pd.Series() + self.days_until_outcome = pd.Series() def train(self, datasets): ''' Initialize, train and predict a classifier. This includes: Feature engineering (i.e. PCA) and - selection, training clf, (hyper)parameter optimization, + selection, training clf, (hyper)parameter optimization, and a prediction on the test set. Make sure to save all variables you want to keep track of in the instance. - Input: + Input: datasets:: dict Contains train and test x, y - + Output: clf:: instance, dict, list, None - Trained classifier/regressor instance, such as + Trained classifier/regressor instance, such as sklearn logistic regression. 
Is not used outside this file, so can be left empty datasets:: dict @@ -150,6 +158,7 @@ def train(self, datasets): self.learn_size += [{'tr_x': train_x.shape, 'tr_y': train_y.shape, 'te_x': test_x.shape, 'te_y': test_y.shape}] + # Imputes non numeric train_x = self.impute_missing_values(train_x) test_x = self.impute_missing_values(test_x) @@ -160,10 +169,14 @@ def train(self, datasets): if self.model_args['grid_search']: # print("Train classfier using grid search for best parameters.") cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=self.random_state) - grid = GridSearchCV(self.pipeline, param_grid=self.grid, cv=cv, - scoring='roc_auc', n_jobs=-2) + grid = RandomizedSearchCV(self.pipeline, param_distributions=self.grid, cv=cv, + scoring='roc_auc', n_jobs=-2, + verbose=1, n_iter=50) + # grid = GridSearchCV(self.pipeline, param_grid=self.grid, cv=cv, + # scoring='roc_auc', n_jobs=-2) grid.fit(train_x, train_y) clf = grid.best_estimator_ + self.trained_classifiers += [clf] # print("Best estimator: ", clf) else: # Train classifier without optimization. 
@@ -174,21 +187,47 @@ def train(self, datasets): self.intercepts.append(clf.named_steps['LR'].intercept_) test_y_hat = clf.predict_proba(test_x) # Predict - + if 'feature_selection' in clf.named_steps: - columns = train_x.columns[np.argsort(clf.named_steps\ - .feature_selection\ - .pvalues_)][0:self.model_args['n_features']].to_list() + idx_sorted = np.argsort(clf['feature_selection']\ + .pvalues_) + f_values = clf['feature_selection'].scores_ + p_values = clf['feature_selection'].pvalues_ + columns = train_x.columns[idx_sorted[0:self.model_args['n_features']]].to_list() + self.n_best_features += [[columns, f_values, p_values]] + print(columns) else: columns = train_x.columns - train_x = pd.DataFrame(clf[:-1].transform(train_x), columns=columns) - train_x = pd.DataFrame(clf[:-1].transform(test_x), columns=columns) + + idx_train = train_x.index + idx_test = test_x.index + + if self.model_args['add_missing_indicator']: + missing_cols = columns.to_list()\ + + ['{}_nan'.format(c) + for c in train_x.loc[:, train_x.isna().any()]] + + train_x = pd.DataFrame(clf[:-1].transform(train_x)) + test_x = pd.DataFrame(clf[:-1].transform(test_x)) + + if self.model_args['add_missing_indicator']: + train_x.columns = missing_cols + test_x.columns = missing_cols + else: + train_x.columns = columns + test_x.columns = columns + + train_x.index = idx_train + test_x.index = idx_test datasets = {"train_x": train_x, "test_x": test_x, "train_y": train_y, "test_y": test_y} - return clf, datasets, test_y_hat + explainer = shap.LinearExplainer(clf['LR'], test_x) + shap_values = explainer.shap_values(test_x) + + return clf, datasets, test_y_hat, shap_values, test_x def score(self, clf, datasets, test_y_hat, rep): ''' Scores the individual prediction per outcome. 
@@ -234,10 +273,10 @@ def score(self, clf, datasets, test_y_hat, rep): 'y': datasets['test_y'], 'y_hat': test_y_hat } - + return score - - def evaluate(self, clf, datasets, scores): + + def evaluate(self, clf, datasets, scores, hospitals=None): ''' Evaluate the results of the modelling process, such as, feature importances. NOTE: No need to show plots here, plt.show is called right @@ -253,44 +292,53 @@ def evaluate(self, clf, datasets, scores): List of all scores generated per training iteration. ''' - self.var_dict = dict(zip(self.data_struct['Field Variable Name'], + self.var_dict = dict(zip(self.data_struct['Field Variable Name'], self.data_struct['Field Label'])) cms = [score['conf_mats'] for score in scores] thresholds = [score['thr'] for score in scores] + if self.model_args['apply_feature_selection']: + # self.save_path += 'k{}'.format(self.model_args['n_features']) + self.vote_best_featureset() + self.analyse_fpr(cms, thresholds) - fig, ax = self.plot_model_results([score['roc_auc'] for score in scores]) - fig2, ax2 = self.plot_model_weights(datasets['test_x'].columns, clf, - show_n_features=self.evaluation_args['show_n_features'], - normalize_coefs=self.evaluation_args['normalize_coefs']) + fig, ax = self.plot_model_results([score['roc_auc'] for score in scores], + hospitals=hospitals) + + if not self.model_args['apply_feature_selection']\ + and not self.model_args['add_missing_indicator']: + fig2, ax2 = self.plot_model_weights(datasets['test_x'].columns, clf, + show_n_features=self.evaluation_args['show_n_features'], + normalize_coefs=self.evaluation_args['normalize_coefs']) if self.save_prediction: self.save_prediction_to_file(scores) - - def define_imputer(self,impute_type): + + def define_imputer(self, impute_type): '''Initialize the imputer to be used for every iteration. 
- + Input: - impute_type: string, {'simple': SimpleImputer, + impute_type: string, {'simple': SimpleImputer, 'iterative': IterativeImputer and 'forest': RandomForest imputer} Output: - Imputer: imputer object to be used in the pipeline + Imputer: imputer object to be used in the pipeline ''' if impute_type=='simple': self.imputer = SimpleImputer(missing_values=np.nan, strategy='median', - add_indicator=self.model_args['add_missing_indicator']) - else: - if impute_type=='iterative': - self.imputer = IterativeImputer(missing_values=np.nan, initial_strategy='median', - add_indicator=self.model_args['add_missing_indicator']) - else: - if impute_type=='forest': - self.imputer = MissForest(random_state=self.random_state,n_jobs=-2) - + add_indicator=self.model_args['add_missing_indicator']) + elif impute_type=='iterative': + self.imputer = IterativeImputer(missing_values=np.nan, initial_strategy='median', + add_indicator=self.model_args['add_missing_indicator']) + elif impute_type=='forest': + self.imputer = MissForest(random_state=self.random_state,n_jobs=-2) + def get_pipeline(self): - self.define_imputer('iterative') - steps = [('imputer', self.imputer), - ('scaler', MinMaxScaler())] + self.define_imputer(self.model_args['imputer']) + + steps = [ + ('imputer', self.imputer), + ('scaler', RobustScaler()) + ] if self.model_args['apply_polynomials']: steps += [('polynomials', PolynomialFeatures(interaction_only=True))] @@ -309,30 +357,29 @@ def get_pipeline(self): for key in keys: del self.grid[key] - steps += [('LR', LogisticRegression(solver='saga', - penalty='elasticnet', #class_weight='balanced', + steps += [('LR', LogisticRegression(solver='saga', + penalty='elasticnet', #class_weight='balanced', l1_ratio=.5, max_iter=200, - random_state=self.random_state))] + random_state=self.random_state))] return Pipeline(steps) - def impute_missing_values(self, data, missing_class=-1): + def impute_missing_values(self, data, missing_class=0): + # Imputes all missing values from 
non-numeric columns data = data.copy() # Prevents copy warning + vars_binary = get_fields_per_type(data, self.data_struct, 'radio') + data.loc[:, vars_binary] = data[vars_binary].fillna(0, axis=0) + # Categorical vars_categorical = get_fields_per_type(data, self.data_struct, 'category') data.loc[:, vars_categorical] = data[vars_categorical].fillna(missing_class, axis=0) - - # # Numeric - # vars_numeric = get_fields_per_type(data, self.data_struct, 'numeric') - # data.loc[:, vars_numeric] = data.loc[:, vars_numeric] \ - # .fillna(data.loc[:, vars_numeric] \ - # .median()) - # data = data.fillna(0).astype(float) + + return data - def plot_model_results(self, - aucs): # , classifier='Logistic regression', outcome='ICU admission'): + def plot_model_results(self, aucs, hospitals=[]): + # , classifier='Logistic regression', outcome='ICU admission'): avg = sum(aucs) / max(len(aucs), 1) std = sqrt(sum([(auc - avg) ** 2 for auc in aucs]) / len(aucs)) @@ -340,14 +387,19 @@ def plot_model_results(self, fig, ax = plt.subplots(1, 1) ax.plot(aucs) - ax.set_title('{}\nROC AUC: {:.3f} \u00B1 {:.3f} (95% CI)' - .format('Logistic Regression', avg, sem * 1.96)) + # ax.set_title('{}\nROC AUC: {:.3f} \u00B1 {:.3f} (95% CI)' + # .format('Logistic Regression', avg, sem * 1.96)) + ax.set_title('{}\nAUC: {:.2f} ({:.2f} to {:.2f}) (95% CI)' + .format('Logistic Regression', avg, avg-(sem*1.96), avg+(sem*1.96))) ax.axhline(sum(aucs) / max(len(aucs), 1), color='g', linewidth=1) ax.axhline(.5, color='r', linewidth=1) ax.set_ylim(0, 1) - ax.set_xlabel('Iteration') + ax.set_xlabel('Test Fold') + if any(hospitals): + ax.set_xticks(list(range(hospitals.size))) + ax.set_xticklabels(hospitals) ax.set_ylabel('AUC') - ax.legend(['ROC AUC', 'Average', 'Chance level'], bbox_to_anchor=(1, 0.5)) + ax.legend(['AUC', 'Average', 'Chance level'], bbox_to_anchor=(1, 0.5)) fig.tight_layout() fig.savefig(self.save_path + '_Performance_roc_auc_{}_{}.png'.format(avg, sem * 1.96), figsize=self.fig_size, 
dpi=self.fig_dpi) @@ -417,11 +469,11 @@ def analyse_fpr(self, cms, thresholds): fig.savefig(self.save_path + '_average_roc.png', figsize=self.fig_size, dpi=self.fig_dpi) - def plot_model_weights(self, feature_labels, clf, + def plot_model_weights(self, feature_labels, clf, show_n_features=10, normalize_coefs=False): feature_labels = self.get_feature_labels(feature_labels, clf) - + # FIXME if self.model_args['apply_pca']: print('UNEVEN FEATURE LENGTH. CANT PLOT WEIGHTS') @@ -435,34 +487,32 @@ def plot_model_weights(self, feature_labels, clf, if len(coefs.shape) <= 1: return - show_n_features = coefs.shape[1] if show_n_features is None else show_n_features + # Save Coeffs + np.save('lr_coefs.npy', coefs) - coefs = (coefs - coefs.mean(axis=0)) / coefs.std(axis=0) if normalize_coefs else coefs + with open(self.save_path + '_coefs.txt', 'w') as f: + f.write('{}'.format(coefs)) - avg_coefs = abs(coefs.mean(axis=0)) - mask_not_nan = ~np.isnan(avg_coefs) # Remove non-existent weights - avg_coefs = avg_coefs[mask_not_nan] + show_n_features = coefs.shape[1] if show_n_features is None else show_n_features - var_coefs = coefs.var(axis=0)[mask_not_nan] if not normalize_coefs else None - idx_sorted = avg_coefs.argsort() - n_bars = np.arange(avg_coefs.shape[0]) + odds = np.exp(coefs) + odds_avg = odds.mean(axis=0)-1 + odds_var = odds.var(axis=0) - labels = np.array([self.var_dict.get(c, c) for c in feature_labels]) + idx_sorted = odds_avg.argsort() + n_bars = np.arange(odds_avg.size) + labels = np.array([self.var_dict.get(c, c) for c in feature_labels]) fontsize = 5.75 if labels.size < 50 else 5 bar_width = .5 # bar width - fig, ax = plt.subplots() - if normalize_coefs: - ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, label='Weight') - ax.set_title('Logistic regression - Average weight value') - else: - ax.set_title('Logistic regression - Average weight value') - ax.barh(n_bars, avg_coefs[idx_sorted], bar_width, xerr=var_coefs[idx_sorted], + fig, ax = plt.subplots() + 
ax.set_title('Logistic regression - Odds ratios') + ax.barh(n_bars, odds_avg[idx_sorted], bar_width, xerr=odds_var[idx_sorted], label='Weight') ax.set_yticks(n_bars) ax.set_yticklabels(labels[idx_sorted], fontdict={ 'fontsize': fontsize }) ax.set_ylim(n_bars[0] - .5, n_bars[-1] + .5) - ax.set_xlabel('Weight{}'.format(' (normalized)' if normalize_coefs else '')) + ax.set_xlabel('Odds ratio') fig.tight_layout() fig.savefig(self.save_path + '_Average_weight_variance.png', figsize=self.fig_size, dpi=self.fig_dpi) @@ -472,20 +522,20 @@ def get_feature_labels(self, labels, clf): steps = clf.named_steps.keys() labels = np.array(labels) - for i, l in enumerate(labels): - if l == None: - labels[i] = labels[i] - elif 'chronic cardiac disease' in l.lower(): - labels[i] = 'Chronic Cardiac Disease (Not hypertension)' - elif 'home medication' in l.lower(): - labels[i] = 'Home medication' - - # NOTE: use loop over steps to be able to switch order + # NOTE: use loop over steps to be able to switch order if self.model_args['add_missing_indicator']: - labels = labels.to_list() \ + labels = list(labels) \ + ['m_id_{}'.format(labels[i])\ for i in clf.named_steps.imputer.indicator_.features_] - + else: + for i, l in enumerate(labels): + if l == None: + labels[i] = labels[i] + elif 'chronic cardiac disease' in str(l).lower(): + labels[i] = 'Chronic Cardiac Disease (Not hypertension)' + elif 'home medication' in str(l).lower(): + labels[i] = 'Home medication' + if 'polynomials' in steps: # N_features = n_features (n) # + n_combinations_without_repeat (k) @@ -504,13 +554,31 @@ def get_feature_labels(self, labels, clf): return labels return labels + def vote_best_featureset(self): + nansum = lambda x: sum([i for i in x if str(i)!='nan']) + # Get list by voting. 
i.e sorted list with most occurences + columns_list = [sorted(fset[0]) for fset in self.n_best_features] + result = pd.DataFrame(self.n_best_features, columns=['columns', 'fvalues', 'pvalues']) + result['fsum'] = result['fvalues'].apply(nansum) + result['psum'] = result['pvalues'].apply(nansum) + result['columns'] = result['columns'].apply(sorted) + result.to_excel('_lr_feature_selection_results.xlsx') + + counts = pd.Series(columns_list).value_counts() + counts.to_excel(self.save_path + '_lr_feature_selection_votes.xlsx') + print('votes={} for {}'.format(counts.iloc[0], counts.index[0])) + def save_prediction_to_file(self, scores): x = pd.concat([score['x'] for score in scores], axis=0) y_hat = pd.concat([pd.Series(score['y_hat']) for score in scores], axis=0) y_hat.index = x.index y = pd.concat([score['y'] for score in scores], axis=0) - df = pd.concat([x, y, y_hat, self.hospital], axis=1) - df.columns=list(x.columns)+['y', 'y_hat', 'hospital'] + self.hospital.index = x.index + self.days_until_outcome.index = self.days_until_outcome.index + df = pd.concat([x, y, y_hat, + self.hospital, + self.days_until_outcome], axis=1) + df.columns=list(x.columns)+['y', 'y_hat', 'hospital', 'days_until_death'] filename = self.save_path + '_prediction.pkl' df.to_pickle(filename) @@ -532,8 +600,8 @@ def get_metrics(lst): 'sem': sem, 'ci': ci} -def get_fields_per_type(data, data_struct, type): - fields = data_struct.loc[data_struct['Field Type']=='category', +def get_fields_per_type(data, data_struct, type_): + fields = data_struct.loc[data_struct['Field Type']==type_, 'Field Variable Name'].to_list() return [field for field in fields if field in data.columns] @@ -580,7 +648,7 @@ def get_fields_per_type(data, data_struct, type): # baseline_score = roc_auc_score(y, y_hat[:, 1]) # importances = np.array([]) - + # for col in x.columns: # x_tmp = x.drop(col, axis=1) # y_hat = cross_val_predict(clf, x_tmp, y, cv=n_cv, method=method) diff --git a/covid19_ICU_admission.py 
b/covid19_ICU_admission.py index df5cf27..f7ec90a 100644 --- a/covid19_ICU_admission.py +++ b/covid19_ICU_admission.py @@ -11,6 +11,8 @@ import os import os.path import sys +import shap + path_to_classifiers = r'./Classifiers/' # path_to_settings = r'./' # TODO: add settings sys.path.insert(0, os.path.join(os.path.dirname(__file__), path_to_classifiers)) @@ -51,7 +53,7 @@ # classifiers from logreg import LogReg from XGB import XGB -from gradboost import train_gradient_boosting +#from survival import Survival # data from get_feature_set import get_1_premorbid @@ -64,7 +66,6 @@ is_in_columns = lambda var_list, data: [v for v in var_list if v in data.columns] - def load_data_api(path_credentials): # Try loading objects from disk file; delete saveddata.pkl to force reload @@ -135,6 +136,14 @@ def load_data(path_to_creds): def preprocess(data, data_struct): ''' Processed the data per datatype.''' + # Drop useless data + cols = data_struct.loc[data_struct.loc[:, 'Form Collection Name']\ + .isin(['!!! 
CARDIO (OLD DON’T USE)!!!', + 'Vascular medicine (OPTIONAL)', + 'CARDIOLOGY - CAPACITY (OPTIONAL)']), + 'Field Variable Name'].to_list() + data = data.drop(is_in_columns(cols, data), axis=1) + # Fix single errors data = fix_single_errors(data) @@ -154,7 +163,10 @@ def preprocess(data, data_struct): def prepare_for_learning(data, data_struct, variables_to_incl, variables_to_exclude, goal, group_by_record=True, use_outcome=None, - additional_fn=None, use_imputation=True): + additional_fn=None, + remove_records_threshold_above=1, + remove_features_threshold_above=0.5, + pcr_corona_confirmed_only=True): outcomes, used_columns = calculate_outcomes(data, data_struct) data = pd.concat([data, outcomes], axis=1) @@ -170,34 +182,98 @@ def prepare_for_learning(data, data_struct, variables_to_incl, x, y, all_outcomes = select_x_y(data, outcomes, used_columns, goal) + # Remove samples with missing y + if goal[0] != 'survival': + has_y = y.notna() + x = x.loc[has_y, :] + y = y.loc[has_y] + + # Include only patients with CONFIRMED covid infection (PCR+ or Coronavirus) + # This excludes patients based on CORADS > 4 + if pcr_corona_confirmed_only: + is_confirmed_patient = \ + (data['Coronavirus']==1) |\ + (data['pcr_pos']==1) |\ + (data['corads_admission_cat_4']==1) |\ + (data['corads_admission_cat_5']==1) + x = x.loc[is_confirmed_patient, :] + y = y.loc[is_confirmed_patient] + # Select variables to include in prediction variables_to_incl['Field Variable Name'] += ['hospital'] - x = select_variables(x, data_struct, variables_to_incl) - - # Select variables to exclude - x = x.drop(is_in_columns(variables_to_exclude, x), axis=1) + variables_to_exclude += ['Coronavirus', 'pcr_pos'] + x = select_variables(x, data_struct, + variables_to_incl, + variables_to_exclude) + + # Select time frame + days_until_death = outcomes.loc[x.index, 'Days until death'].copy() + days_until_discharge = outcomes.loc[x.index, 'Days until discharge'].copy() + has_death_week = (days_until_death>=0) & 
(days_until_death<=7) + has_disch_week = (days_until_discharge>=0) & (days_until_discharge<=7) + outcome_in_week = has_death_week | has_disch_week |\ + (outcomes['Levend dag 21 maar nog in het ziekenhuis - totaal']==1) + + # tmp = x.drop(['Record Id', 'hospital'], axis=1) + + # x = x.loc[outcome_in_week, :] + # y = y.loc[outcome_in_week] + days_until_death = days_until_death[outcome_in_week] + + + # Drop features with too many missing + if remove_features_threshold_above is not None: + threshold = remove_features_threshold_above + has_too_many_missing = (x.isna().sum(axis=0)/x.shape[0]) > threshold + x = x.loc[:, ~has_too_many_missing] + print('LOG: dropped features: {}, due to more than {}% missing' + .format(has_too_many_missing.loc[has_too_many_missing].index.to_list(), + threshold*100)) + + # Remove records with too many missing + if remove_records_threshold_above is not None: + threshold = remove_records_threshold_above + has_too_many_missing = ((x.isna().sum(axis=1))/(x.shape[1])) > threshold + print('LOG: Dropped {} records, due to more than {}% missing' + .format(has_too_many_missing.sum(), threshold*100)) + x = x.loc[~has_too_many_missing, :] + y = y.loc[~has_too_many_missing] + + # Combine and Rename hospitals + name_combined = 'Center com' + cutoff = 100 + hospital = x.loc[:, 'hospital'].copy() + counts = hospital.value_counts() + hospital.loc[hospital.isin(counts[counts 1] # Remove columns without information print('LOG: Using <{}:{}> as y.'.format(goal[0], goal[1])) - print('LOG: Class distribution: 1: {}, 0: {}, total: {}'\ - .format(y.value_counts()[1], y.value_counts()[0], y.size)) + # print('LOG: Class distribution: 1: {}, 0: {}, total: {}'\ + # .format(y[y.columns[0]].value_counts()[1], y[y.columns[0]].value_counts()[0], y[y.columns[0]].size)) print('LOG: Selected {} variables for predictive model' - .format(x.columns.size)) + .format(x.columns.size)) - # Remove samples with missing y - if goal[0] != 'survival': - has_y = y.notna() - x = 
x.loc[has_y, :] - y = y.loc[has_y] - - return x, y, data, hospital, records + explore_data(x, y) + + return x, y, data, hospital, records, days_until_death -def train_and_predict(x, y, model, rep, type='subsamp', type_col=None, test_size=0.2): +def train_and_predict(x, y, model, rep, splittype='loho', + hospitals=None, unique_hospitals=None, + days_until_death=None, test_size=0.2): ''' Splits data into train and test set using monte carlo subsampling method, i.e., n random train/test splits using equal class balance. @@ -225,14 +301,17 @@ class instance from on of the clf on test y. ''' - if type == 'loho': + if splittype == 'loho': # Leave-one-hospital-out cross-validation - test_hosp = type_col.unique()[rep] - is_test_hosp = type_col == test_hosp + test_hosp = unique_hospitals[rep] + is_test_hosp = hospitals == test_hosp train_x = x.loc[~is_test_hosp, :] train_y = y.loc[~is_test_hosp] test_x = x.loc[is_test_hosp, :] test_y = y.loc[is_test_hosp] + model.hospital = model.hospital.append(hospitals.loc[is_test_hosp]) + model.days_until_outcome = model.days_until_outcome\ + .append(days_until_death.loc[is_test_hosp]) else: # Default to random subsampling train_x, test_x, train_y, test_y = train_test_split(x, y, @@ -241,8 +320,8 @@ class instance from on of the datasets = {"train_x": train_x, "test_x": test_x, "train_y": train_y, "test_y": test_y} - clf, datasets, test_y_hat = model.train(datasets) - return clf, datasets, test_y_hat + clf, datasets, test_y_hat, shap_values, test_set = model.train(datasets) + return clf, datasets, test_y_hat,shap_values, test_set def score_prediction(model, clf, datasets, test_y_hat, rep): ''' Wrapper for scoring individual predictions made @@ -272,8 +351,10 @@ class instance from on of the score = model.score(clf, datasets, test_y_hat, rep) return score -def evaluate_model(model, clf, datasets, scores): - model.evaluate(clf, datasets, scores) +def evaluate_model(model, clf, datasets, scores, + hospitals=None): + model.evaluate(clf, 
datasets, scores, + hospitals=hospitals) def run(data, data_struct, goal, variables_to_include, variables_to_exclude, train_test_split_method, model_class, @@ -282,10 +363,12 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, model.goal = goal data, data_struct = preprocess(data, data_struct) - x, y, data, hospital, records = prepare_for_learning(data, data_struct, + x, y, data, hospital,\ + records, days_until_death = prepare_for_learning(data, data_struct, variables_to_include, variables_to_exclude, goal) + if goal[0] == 'survival': save_class_dist_per_hospital(save_path, y['event_mortality'], hospital[y.index]) else: @@ -297,32 +380,53 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, model.save_path = '{}_n{}_y{}'.format(save_path, y.size, y.sum()) model.save_prediction = save_prediction - if train_test_split_method == 'loho': # Leave-one-hospital-out - repetitions = hospital.unique().size + unique_hospitals = np.sort(hospital.unique()) + repetitions = unique_hospitals.size else: # Random Subsampling repetitions = 100 scores = [] + shap_list = list() + test_list = list() for rep in range(repetitions): - clf, datasets, test_y_hat = train_and_predict(x, y, model, rep, - type=train_test_split_method, - type_col=hospital) + clf, datasets, test_y_hat,shap_values,test_set = train_and_predict(x, y, model, rep, + splittype=train_test_split_method, + hospitals=hospital, + unique_hospitals=unique_hospitals, + days_until_death=days_until_death) + shap_list.append(shap_values) + test_list.append(test_set) + score = score_prediction(model, clf, datasets, test_y_hat, rep) scores.append(score) model.hospital = hospital[y.index] - evaluate_model(model, clf, datasets, scores) - + evaluate_model(model, clf, datasets, scores, + hospitals=unique_hospitals) + + shap_values = shap_list[0] + for val in shap_list: + shap_values = np.concatenate((shap_values,val),axis=0) + + test_set = test_list[0] + for val in shap_list: + 
test_set = np.concatenate((test_set,val),axis=0) + test_set = pd.DataFrame(test_set,columns=x.columns) + f = plt.figure() + shap.summary_plot(shap_values, test_set) + f.savefig(os.path.join( model.save_path ,"summary_plot1.png"), bbox_inches='tight', dpi=600) + f = plt.figure() + shap.summary_plot(shap_values, test_set, plot_type="bar") + f.savefig(os.path.join( model.save_path ,"summary_plot2.png"), bbox_inches='tight', dpi=600) if not save_figures: plt.show() print('\n', flush=True) - if __name__ == "__main__": ##### START PARAMETERS ##### @@ -355,10 +459,13 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, # NOTE: See get_feature_set.py for preset selections feature_opts = { 'pm': get_1_premorbid(), - 'cp': get_2_clinical_presentation(), - 'lab': get_3_laboratory_radiology_findings(), - 'pmcp': get_4_premorbid_clinical_representation(), - 'all': get_5_premorbid_clin_rep_lab_rad() + # 'cp': get_2_clinical_presentation(), + # 'lab': get_3_laboratory_radiology_findings(), + # 'pmcp': get_4_premorbid_clinical_representation(), + # 'all': get_5_premorbid_clin_rep_lab_rad(), + # 'k10': ['Blood_Urea_Nitrogen_value_1', 'LDH', 'PH_value_1', 'age_yrs', 'ccd', 'fio2_1', 'hypertension', 'irregular', 'rtr', 'uses_n_medicine'] # XGB + # 'k10': ['Blood_Urea_Nitrogen_value_1', 'LDH', 'PH_value_1', 'age_yrs', 'ccd', 'fio2_1', 'hypertension', 'irregular', 'rtr', 'uses_n_medicine'] # LOGREG + # 'paper': ['LDH', 'Lymphocyte_1_1', 'crp_1_1'] } # Options: @@ -368,11 +475,17 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, # Add all 'Field Variable Name' from data_struct to # EXCLUDE variables from analysis - variables_to_exclude = ['microbiology_worker'] + variables_to_exclude = [ + # 'Smoking', + # 'alcohol' + # 'oxygen_saturation_on', + # 'oxygen_saturation' + ] # Options: # see .\Classifiers - model = LogReg # NOTE: do not initialize model here, + model = XGB + # model = LogReg # NOTE: do not initialize model here, # but 
supply the class (i.e. omit # the parentheses) @@ -387,6 +500,7 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, for train_test_split_method in cv_opts: for feat_name, features in feature_opts.items(): + print(feat_name) vars_to_include = { 'Form Collection Name': [], # groups 'Form Name': [], # variable subgroups @@ -401,3 +515,7 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, save_prediction=save_prediction) plt.show() + + + + diff --git a/covid19_ICU_util.py b/covid19_ICU_util.py index 0fa6e93..5dcd978 100644 --- a/covid19_ICU_util.py +++ b/covid19_ICU_util.py @@ -1,3 +1,5 @@ +import time + import pandas as pd import numpy as np import matplotlib.pyplot as plt @@ -28,10 +30,29 @@ 'CRP', 'Albumin', 'CKin', 'LDH_daily', 'Chest_X_Ray', 'CTperf', 'd_dimer_yes_no'] +TIME_FNS = True + format_dt = lambda col: pd.to_datetime(col, format='%d-%m-%Y', errors='coerce').astype('datetime64[ns]') is_in_columns = lambda var_list, data: [v for v in var_list if v in data.columns] +# Function timer decorator +def timeit(method): + def timed(*args, **kw): + if not TIME_FNS: + return method(*args, **kw) + ts = time.time() + result = method(*args, **kw) + te = time.time() + if 'log_time' in kw: + name = kw.get('log_name', 'method.__name__'.upper()) + kw['log_time'][name] = int((te - ts)*1000) + else: + print('TIM: {:s}: {:2.2f}ms'.format(method.__name__, + (te-ts)*1000)) + return result + return timed +@timeit def get_all_field_information(path_to_creds): study_struct, reports_struct, \ optiongroups_struct = import_study_report_structure(path_to_creds) @@ -52,6 +73,7 @@ def get_all_field_information(path_to_creds): return data_struct +@timeit def count_occurrences(col, record_ids, reset_count=False, start_count_at=1): # Make counter of occurences in binary column # NOTE: Make sure to supply sorted array @@ -81,6 +103,7 @@ def count_occurrences(col, record_ids, reset_count=False, start_count_at=1): new_col = 
pd.concat(new_cols, axis=0) return new_col +@timeit def calculate_outcomes(data, data_struct): # CATEGORY EXPLANATION # Discharged alive: 1) Discharged to home @@ -199,6 +222,7 @@ def calculate_outcomes(data, data_struct): used_columns = [col for col in data.columns if 'Outcome' in col] # Keep track of var return df_outcomes, used_columns +@timeit def select_x_y(data, outcomes, used_columns, goal, remove_not_outcome=True): x = data.drop(used_columns, axis=1) @@ -211,12 +235,14 @@ def select_x_y(data, outcomes, used_columns, # TEMP select (Non-ICU patients) # ICU pts: - # was_icu = outcomes.iloc[:, [3, 6, 10, 12, 14]].any(axis=1) - # y[was_icu] = None # ONLY INCLUDE NON ICU PATIENTS + was_icu = outcomes.iloc[:, [3, 6, 10, 12, 14]].any(axis=1) + # was_icu.to_excel('was_icu.xlsx') + # y[was_icu] = None # ONLY INCLUDE NON-ICU PATIENTS (Inverted because set to None) # y[~was_icu] = None # ONLY INCLUDE ICU PATIENTS - + # x['was_icu'] = was_icu return x, y, outcomes_dict +@timeit def get_classification_outcomes(data, outcomes): y_dict = {} @@ -233,15 +259,17 @@ def get_classification_outcomes(data, outcomes): # Alive (=0): Discharged at t <= 21 | # Alive at t=21 y = pd.Series(None, index=data.index) - y.loc[outcomes.loc[:, 'Dood - totaal']==1] = 1 + y.loc[outcomes.loc[:, 'Dood - totaal']==1] = 1 # TODO: with days_until_death <=21d y.loc[outcomes.loc[:, ['Levend ontslagen en niet heropgenomen - totaal', 'Levend dag 21 maar nog in het ziekenhuis - totaal']].any(axis=1)] = 0 y_dict['mortality_with_outcome'] = y return y_dict +@timeit def get_survival_analysis_outcomes(data, outcomes): y_dict = {} + data = data.copy() # 1) Event (=1): ICU admission # Duration (event=1): days until first ICU admission since hospital admission @@ -265,11 +293,29 @@ def get_survival_analysis_outcomes(data, outcomes): # TODO: Think about same exclusion at classification[y2] outcome_name = 'Patient has died' + # start with everyone alive at t=0 event = pd.Series(0, index=data.index) - duration = 
outcomes.loc[:, 'Days until death'] + + # mark all patients with a time to death as death + duration = outcomes.loc[:, 'Days until death'].copy() event.loc[duration.notna()] = 1 + + max_duration = np.nanmax(duration) + + # mark patients that died, but do not have a death date as alive until their last followup duration.loc[duration.isna()] = data.loc[duration.isna(), 'days_since_admission_current_hosp'] + # # now add the people that are ALIVE DISCHARGED - set as no event until t=max_duration + duration.loc[data['Levend ontslagen en niet heropgenomen - totaal']] = max_duration # or 42 days # discharged alive - ALL 3 + event.loc[data['Levend ontslagen en niet heropgenomen - totaal']] = 0 # censor alive at t=max_duration days + + # # now add the people that are UNKNOWN (= not death and not discharged) + duration.loc[data['Onbekend (alle patiënten zonder outcome)']] = data['days_since_admission_current_hosp'][data['Onbekend (alle patiënten zonder outcome)']] # Onbekend (alle patiënten zonder outcome) == UNKNOWN outcome + event.loc[data['Onbekend (alle patiënten zonder outcome)']] = 0 # censor at time of event. + + # remaining data: does not have any outcome or follow-up: we only now these patients are alive at t=0. + duration[duration.isna()] = 0.0 # duration unknown because date is not filled in? set to 0.0 + y = pd.concat([event, duration], axis=1) y.columns = ['event_mortality', 'duration_mortality'] y_dict['mortality_all'] = y @@ -304,6 +350,7 @@ def get_survival_analysis_outcomes(data, outcomes): axis=1) return y_dict +@timeit def fix_single_errors(data): ''' Replaces values on the global dataframe, per column or per column and record Id. @@ -334,8 +381,23 @@ def fix_single_errors(data): data.loc[data['Record Id']==record_id, column] \ = data.loc[data['Record Id']==record_id, column] \ .replace(pair[0], pair[1]) + + # SaO2 and SaO2_1 are incidentally reported as fraction rather than as + # a percentage; fix this by multiplication. 
0 is impossible -> replace by nan + sao2_invalid = data['SaO2'].astype(float) < 1.0 + data.loc[sao2_invalid,'SaO2'] = (100. * data.loc[sao2_invalid,'SaO2'].astype(float)).astype(str) + + sao2_1_invalid = data['SaO2_1'].astype(float) < 1.0 + data.loc[sao2_1_invalid,'SaO2_1'] = (100. * data.loc[sao2_1_invalid,'SaO2_1'].astype(float)).astype(str) + + sao2_zero = data['SaO2'].astype(float) == 0. + data.loc[sao2_zero,'SaO2'] = np.nan + + sao2_1_zero = data['SaO2_1'].astype(float) == 0. + data.loc[sao2_1_zero,'SaO2_1'] = np.nan return data +@timeit def transform_binary_features(data, data_struct): ''' ''' @@ -343,7 +405,6 @@ def transform_binary_features(data, data_struct): dict_yes_no = {0:0, 1:1, 2:0, 3:value_na, 9:value_na, 9999: value_na} dict_yp = {0:0, 1:1, 2:.5, 3:0, 4:value_na} # [1, 2, 3, 4 ] --> [1, .5, 0, -1] dict_yu = {0:0, 1:1, 9999:value_na} - dict_smoke = {0:0, 1:1, 2:0, 3:.5, 4:value_na} # [Yes, no, stopped_smoking] --> [1, 0, .5] # Some fixed for erronuous field types data_struct.loc[data_struct['Field Variable Name']=='MH_HF', 'Field Type'] = 'radio' @@ -377,9 +438,9 @@ def transform_binary_features(data, data_struct): data.loc[:, 'Bacteria'] = data.loc[:, 'Bacteria'].fillna(3) \ .astype(int) \ .apply(lambda x: dict_yes_no.get(x)) - data.loc[:, 'Smoking'] = data.loc[:, 'Smoking'].fillna(4) \ - .astype(int) \ - .apply(lambda x: dict_smoke.get(x)) + # data.loc[:, 'Smoking'] = data.loc[:, 'Smoking'].fillna(4) \ + # .astype(int) \ + # .apply(lambda x: dict_smoke.get(x)) data.loc[:, 'CT_thorax_performed'] = data.loc[:, 'CT_thorax_performed'].fillna(3) \ .astype(int) \ .apply(lambda x: {0:0, 1:0, 2:1, 3:0}.get(x)) @@ -407,6 +468,7 @@ def transform_binary_features(data, data_struct): return data, data_struct +@timeit def transform_categorical_features(data, data_struct): ''' Create dummyvariables for category variables, removes empty variables and attaches column names @@ -418,35 +480,41 @@ def transform_categorical_features(data, data_struct): # Extract 
variables that can contain multiple answers OR need to be # dummified to be used in a later stage - vars_to_dummy = ['oxygen_saturation_on', 'dept', 'Outcome'] + vars_to_dummy = ['dept', 'Outcome'] # 'oxygen_saturation_on' - is_one_hot_encoded = data_struct['Field Type'].isin(['checkbox']) | \ - data_struct['Field Variable Name'].isin(vars_to_dummy) - data_struct.loc[is_one_hot_encoded, 'Field Type'] = 'category_one_not_encoded' + is_one_hot_encoded = data_struct['Field Type'].isin(['checkbox', 'category']) | \ + data_struct['Field Variable Name'].isin(vars_to_dummy) + data_struct.loc[is_one_hot_encoded, 'Field Type'] = 'category_one_hot_encoded' cat_struct_ohe = data_struct.loc[is_one_hot_encoded, - ['Field Variable Name', 'Option Name', - 'Option Value']] + ['Field Variable Name', 'Option Name', + 'Option Value']] category_columns_ohe = is_in_columns(cat_struct_ohe['Field Variable Name'], data) + # Exclude some vars + category_columns_ohe = [c for c in category_columns_ohe if 'PaO2_sample_type' not in c] + get_name = lambda c, v: '{:s}_cat_{:s}'.format(col, str(v)) # Dummify variables dummies_list = [] for col in category_columns_ohe: + # Get all unique categories in the column unique_categories = pd.unique([cat for value in data[col].values for cat in str(value).split(';')]) - unique_categories = [cat for cat in unique_categories - if cat.lower() not in ['nan', 'none']] + # unique_categories = [cat for cat in unique_categories + # if cat.lower() not in ['nan', 'none']] + if not any(unique_categories): continue # TODO: Make column names to actual name instead of numeric answer - dummy_column_names = [get_name(col, v) for v in unique_categories - if v.lower() not in ['nan', 'none']] + dummy_column_names = [get_name(col, v) for v in unique_categories] + # if v.lower() not in ['nan', 'none']] # Create new dataframe with the dummies dummies = pd.DataFrame(0, index=data.index, columns=dummy_column_names) + # Insert the data for cat in unique_categories: # TODO: Filter 
specific categories that are nan/na/none/unknown @@ -455,8 +523,15 @@ def transform_categorical_features(data, data_struct): data[col] = data[col].fillna('') regex_str = '(?:;|^){}(?:;|$)'.format(cat) - dummies.loc[data[col].str.contains(regex_str, regex=True), - get_name(col, cat)] = 1 + has_cat = data[col].str.contains(regex_str, regex=True) + if has_cat.sum() > 1: + dummies.loc[has_cat, get_name(col, cat)] = 1 + + nan_cols = [c for c in dummies.columns if '_nan' in c or '_None' in c] + missing_col = dummies.loc[:, nan_cols].max(axis=1) + dummies = dummies.drop(nan_cols, axis=1) + dummies = pd.concat([dummies, missing_col], axis=1) + dummies.columns = dummies.columns[:-1].to_list() + [col+'_cat_missing'] dummies_list += [dummies] @@ -471,9 +546,12 @@ def transform_categorical_features(data, data_struct): return data, data_struct +@timeit def transform_numeric_features(data, data_struct): # Calculates all variables to the same unit, # according to a handmade mapping in unit_lookup.py + data = data.copy() + unit_dict, var_numeric = get_unit_lookup_dict() numeric_columns = is_in_columns(var_numeric.keys(), data) @@ -500,8 +578,33 @@ def transform_numeric_features(data, data_struct): data = data.drop(is_in_columns(unit_dict.keys(), data), axis=1) + # PaO2_1 and PaO2 are measured in 3 ways + # as described in PaO2_sample\_type_1 and PaO2_sample_type + # divide these variables in four dummies. 
+ for p in ['', '_1']: + options = data_struct.loc[ + data_struct['Field Variable Name']==('PaO2_sample_type'+p), + ['Option Name', 'Option Value']].iloc[0, :] + + col_dict = dict(zip(options['Option Value'], options['Option Name'])) + # Non-Strings are missing values + colnames = ['PaO2'+p+'_'+v for v in col_dict.values() if type(v)==str] + df = pd.DataFrame(0, columns=colnames, index=data.index) + + for value, name in col_dict.items(): + if str(name) == 'nan': + # occurs only once, so better drop it + continue + colname = 'PaO2'+p+'_'+str(name) + is_measure_type = data['PaO2_sample_type'+p] == value + df.loc[is_measure_type, colname] = data.loc[is_measure_type, 'PaO2'+p] + df.loc[data['PaO2'+p].isna(), :] = None + + data = data.drop(columns=['PaO2'+p, 'PaO2_sample_type'+p]) + data = pd.concat([data, df], axis=1) return data, data_struct +@timeit def transform_time_features(data, data_struct): ''' TODO: Check difference hosp_admission and Outcome_dt @@ -594,6 +697,7 @@ def transform_time_features(data, data_struct): return data, data_struct +@timeit def transform_string_features(data, data_struct): # TODO: Why it med_specify not in data_struct? 
@@ -615,6 +719,7 @@ def transform_string_features(data, data_struct): index=data_struct.columns), ignore_index=True) return data, data_struct +@timeit def transform_calculated_features(data, data_struct): struct_calc = data_struct.loc[data_struct['Field Type']=='calculation', :] @@ -625,6 +730,7 @@ def transform_calculated_features(data, data_struct): data = data.drop(cols_to_drop, axis=1) return data, data_struct +@timeit def select_data(data, data_struct): cols_to_keep = [col for col in data.columns if col not in IS_MEASURED_COLUMNS] @@ -640,39 +746,46 @@ def select_data(data, data_struct): return data, data_struct -def select_variables(data, data_struct, variables_to_include_dict): +@timeit +def select_variables(data, data_struct, variables_to_include_dict, + variables_to_exclude): # Get all variables variables_to_include = [] for k, v in variables_to_include_dict.items(): + v = [i for i in v if i not in variables_to_exclude] if k == 'Field Variable Name': - variables_to_include += variables_to_include_dict[k] + variables_to_include += v else: variables_to_include += data_struct.loc[data_struct[k].isin(v), 'Field Variable Name'] \ .to_list() # Retrieve the corresponding categorical 1-hot encoded column names - category_vars = data_struct.loc[data_struct['Field Type'] == 'category', + category_vars = data_struct.loc[(data_struct['Field Type'] == 'category') | + (data_struct['Field Type'] == 'category_one_hot_encoded'), 'Field Variable Name'].to_list() variables_to_include += [c for var in variables_to_include - for c in data.columns - if (var in category_vars) and (var in c)] + for c in data.columns + if ((var in category_vars)\ + and (var not in variables_to_exclude))\ + and (var in c)] variables_to_include += ['Record Id'] variables_to_include = list(np.unique(is_in_columns(variables_to_include, data))) return data.loc[:, variables_to_include] +@timeit def impute_missing_values(data, data_struct): - ''' + ''' NOTE: DEPRECATED in upcoming versions NOTE: Only 
impute values that not leak data. - i.e. only use single field or + i.e. only use single field or record based imputations. ''' missing_class = -1 # Categorical --> Add a missing class vars_categorical = is_in_columns(data_struct.loc[data_struct['Field Type']=='category', 'Field Variable Name'].to_list(), data) - + data.loc[:, vars_categorical] = data.loc[:, vars_categorical].fillna(missing_class) ## MODE should be moved to Pipeline in Classifier to prevent data leakage @@ -692,6 +805,7 @@ def impute_missing_values(data, data_struct): return data +@timeit def plot_feature_importance(importances, features, show_n_features=5): show_n_features = features.shape[0] if not show_n_features else show_n_features @@ -703,7 +817,8 @@ def plot_feature_importance(importances, features, show_n_features=5): return fig, ax -def save_class_dist_per_hospital(path, y, hospital): +@timeit +def save_class_dist_per_hospital(path, y, hospital): with open(path + '_class_dist_per_hosp.txt', 'w') as f: f.write('{}\t{}\t{}\n'.format('hospital', '0', '1')) for h in hospital.unique(): @@ -712,31 +827,28 @@ def save_class_dist_per_hospital(path, y, hospital): f.write(line) print('LOG: Written class distribution per hospital to file.') - def explore_data(x, y): - data = pd.concat([x, y], axis=1) - corr = data.corr(method='spearman') - plt.matshow(corr) + # VIF + from statsmodels.stats.outliers_influence import variance_inflation_factor + from statsmodels.tools.tools import add_constant + x = x.fillna(0).astype(float) + # x = add_constant(x) + vif = pd.Series([variance_inflation_factor(x.values, i) + for i in range(x.shape[1])], index=x.columns)\ + .sort_values(ascending=True) -# beademd geweest op IC -# outcome_ventilation_any = data['patient_interventions_cat_1'] == 1.0 \ -# | data['patient_interventions_cat_2'] == 1.0 \ -# | data['Invasive_ventilation_1'] == 1.0 -# outcome_ventilation_daily = data['patient_interventions_cat_1'] == 1.0 | \ -# data['patient_interventions_cat_2'] == 1.0 + fig, ax 
= plt.subplots() + ax.set_title('Variance inflation factor') + vif.nlargest(25).plot(kind='barh') + fig.tight_layout() -# Orgaanfalen lever, nier -# outcome_organfailure_any = data['patient_interventions_cat_3'] == 1.0 | \ -# data['patient_interventions_cat_5'] == 1.0 | \ -# data['Extracorporeal_support_1'] == 1.0 | \ -# data['Liver_dysfunction_1_1'] == 1.0 | \ -# data['INR_1_1'].astype('float') > 1.5 | \ -# data['Acute_renal_injury_Acute_renal_failure_1_1'] == 1.0 + # CORRELATION + # xc = x.corr('spearman') # 'spearmon' + # fig, ax = plt.subplots() + # ax.set_title('Correlation matrix') + # ax.matshow(xc) + return -# has_not_died = outcomes.loc[:, ['Levend ontslagen en niet heropgenomen - totaal', -# 'Levend dag 21 maar nog in het ziekenhuis - totaal']].any(axis=1) -# y_duration.loc[has_not_died] = data.loc[has_not_died, 'days_since_admission_current_hosp'] -# y_event.loc[has_not_died] = 0 \ No newline at end of file diff --git a/data_query_scripts/ALL_castor2excel_variables.py b/data_query_scripts/ALL_castor2excel_variables.py index 1b52c09..2377387 100755 --- a/data_query_scripts/ALL_castor2excel_variables.py +++ b/data_query_scripts/ALL_castor2excel_variables.py @@ -9,59 +9,117 @@ @author: wouterpotters """ import os -import site import configparser import pandas as pd -site.addsitedir('./../') # add directory to path to enable local imports -from covid19_import import import_study_report_structure # noqa: E402 - -config = configparser.ConfigParser() -config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) - -# the excel file with all variables and answer options is stored here -target_excel = config['exportresults']['excel_file_variables'] - -# # Get all data from Castor database (without any selection criterium) -# Note that you need export rights for every individual center. 
-study_struct, reports_struct, optiongroups_struct = \ - import_study_report_structure( - config['CastorCredentials']['local_private_path']) - -# ## Export all variables to excel -writer = pd.ExcelWriter(target_excel, engine='xlsxwriter') - -readme = 'This excel sheet contains an overview of the variables that are ' +\ - 'used in the Castor EDC database for the COVID 19 project. \nThere are' +\ - ' three tabs; \n(1) Admission variables; to be entered once and ' +\ - 'updated incidentally. \n(2) Daily reports are created once per day.' -readme = pd.DataFrame([x for x in readme.split('\n')]) - -# Write each dataframe to a different worksheet. -readme.to_excel(writer, sheet_name='README', index=False) -optiongroups_struct.to_excel(writer, sheet_name='OptionGroups', index=False) - -# added answeroptions to the excel file -answeroptions = pd.pivot_table(optiongroups_struct, - index='Option Group Id', - values=['Option Name', 'Option Value'], - aggfunc=lambda x: list(x)) -answeroptions.rename(columns={'Option Name': 'Field Options Names', - 'Option Value': 'Field Options Values'}) -study_struct_withoptions = pd.merge(study_struct, - answeroptions, how='left', - left_on='Field Option Group', - right_on='Option Group Id') -study_struct_withoptions.to_excel(writer, sheet_name='AdmissionVariables', - index=False) - -reports_struct_withoptions = pd.merge(reports_struct, - answeroptions, - how='left', - left_on='Field Option Group', - right_on='Option Group Id') -reports_struct_withoptions.to_excel(writer, - sheet_name='DailyUpdateVariables', - index=False) - -writer.save() # save excel file -print('excel file was saved to : ' + target_excel) +import castorapi as ca + + +def import_study_report_structure(c): + # STEP 0: collect answer options from optiongroups + # get answer option groups + optiongroups_struct = c.request_study_export_optiongroups() + + # STEP 1: collect data from study + # get the main study structure (i.e. 
questions) + structure = c.request_study_export_structure() + + # sort on form collection order and field order + # (this matches how data is filled in castor) + structure_filtered = structure \ + .sort_values(['Form Collection Name', 'Form Collection Order', + 'Form Order', 'Field Order']) + + # filter variables that have no Field Variable name; these field do not + # record data + structure_filtered[~structure_filtered['Field Variable Name'].isna()] + + # select only study variables + study_structure = structure_filtered[ + structure_filtered['Form Type'].isin(['Study'])] + + # select only report variables (may contain duplicates) + reports_structure = structure_filtered[ + structure_filtered['Form Type'].isin(['Report'])] + + return study_structure, reports_structure, optiongroups_struct + + +if __name__ == "__main__": + config = configparser.ConfigParser() + config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) + + path_to_api_creds = config['CastorCredentials']['local_private_path'] + + # input: private folder where client & secret files (no extension, + # 1 string only per file) from castor are saved by the user + # see also: + # https://helpdesk.castoredc.com/article/124-application-programming-interface-api + c = ca.CastorApi(path_to_api_creds) # e.g. in user dir outside of GIT repo + + # get study ID for COVID study + if False: + name = 'COVID-19 NL' + excel_postfix = '' + else: + name = 'Clinical features of COVID-19 positive patients in VieCuri' + excel_postfix = '_viecurie.xlsx' + + study_id = c.select_study_by_name(name) + + study_name = c.request_study(study_id=study_id)['name'] + + # # Get all data from Castor database (without any selection criterium) + # Note that you need export rights for every individual center. 
+ study_structure, reports_structure, optiongroups_struct = \ + import_study_report_structure(c) + + # STEP 2: convert to excel file + # the excel file with all variables and answer options is stored here + target_excel = config['exportresults']['excel_file_variables'] + excel_postfix + + # ## Export all variables to excel + writer = pd.ExcelWriter(target_excel, engine='xlsxwriter') + + readme = 'This excel sheet contains an overview of the variables that are ' +\ + 'used in the Castor EDC database for the study: ' + study_name +\ + 'project. \nThere are multiple tabs; \n(1) Study variables; ' +\ + 'to be entered once (and updated incidentally). ' +\ + '\n(2, 3 ..., n) Reports tabs could be created more often.' + readme = pd.DataFrame([x for x in readme.split('\n')]) + + # Write each dataframe to a different worksheet. + readme.to_excel(writer, sheet_name='README', index=False) + optiongroups_struct.to_excel(writer, sheet_name='OptionGroups', index=False) + + # added answeroptions to the excel file + answeroptions = pd.pivot_table(optiongroups_struct, + index='Option Group Id', + values=['Option Name', 'Option Value'], + aggfunc=lambda x: list(x)) + answeroptions.rename(columns={'Option Name': 'Field Options Names', + 'Option Value': 'Field Options Values'}) + + study_struct_withoptions = pd.merge(study_structure, + answeroptions, how='left', + left_on='Field Option Group', + right_on='Option Group Id') + study_struct_withoptions.to_excel(writer, sheet_name='Study', + index=False) + + reports_struct_withoptions = pd.merge(reports_structure, + answeroptions, + how='left', + left_on='Field Option Group', + right_on='Option Group Id') + + for report_name in reports_structure['Form Collection Name'].unique(): + selection = reports_struct_withoptions[ + reports_struct_withoptions['Form Collection Name'] == report_name] + report_name = 'Report' + report_name + for r in ':*?/[]\\': + report_name = report_name.replace(r, '_') + + selection.to_excel(writer, 
sheet_name=report_name[:31], index=False) + + writer.save() # save excel file + print('\n excel file for study ' + study_name + ' was saved to : ' + target_excel) diff --git a/data_query_scripts/ALL_completeness_category.py b/data_query_scripts/ALL_completeness_category.py new file mode 100644 index 0000000..923b7a4 --- /dev/null +++ b/data_query_scripts/ALL_completeness_category.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Apr 3 12:46:07 2020 + +@author: wouterpotters +""" +import os +import numpy as np +import pickle +import configparser +import castorapi as ca + +config = configparser.ConfigParser() +config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) + +c = ca.CastorApi(config['CastorCredentials']['local_private_path']) +study_id = c.select_study_by_name(config['CastorCredentials']['study_name']) + +# select AMC + VUmc +institutes = c.request_institutes() +study_info = c.request_study_export_structure() + +try: + path_save = os.path.join(config['CastorCredentials']['local_private_path'], 'saveddata.pkl') + with open(path_save, 'rb') as f: + df_study, df_structure, df_report, df_report_structure, \ + df_optiongroup_structure = pickle.load(f) +except Exception as ex: + print('error'+str(ex)) + +# %% per hospital % +variables = df_structure[df_structure['Form Collection Name'] == 'INTERNAL MED/ GERIATRICS']['Field Variable Name'].reset_index(drop=True) +varcount = len(variables) + +for i in institutes: + inst = i['name'] + sel = df_study['hospital'] == inst + if sum(sel) > 0: + print('{}:\t{} records, records with any INTERNAL MED/ GERIATRICS data {}, {}% complete' + .format(inst, sum(sel), + sum(np.sum(~df_study[sel][variables].isna(),axis=1) > 0), + round(100-100*np.sum(np.sum(df_study[sel][variables].isna()))/(sum(sel)*varcount),0))) + else: + if False: + print('{}:\t{} entries'.format(inst,sum(sel))) + +# %% per hospital +# cfs_yesno CFS evaluated during admission? 
+ +# %% per hospital % +variables = df_structure[df_structure['Form Collection Name'] == 'INTERNAL MED/ GERIATRICS']['Field Variable Name'].reset_index(drop=True) +varcount = len(variables) + +variables = ['cfs_yesno'] +varcount = 1 + +subgroep_70 = df_study['age_yrs'] >= 70 + +for i in institutes: + inst = i['name'] + sel = np.logical_and(df_study['hospital'] == inst, subgroep_70) + if sum(sel) > 0: + if False: + print('{}:\t{} records, records with any INTERNAL MED/ GERIATRICS data {}, {}% complete' + .format(inst, sum(sel), + sum(np.sum(df_study[sel][variables] == '1',axis=1) > 0), + round(100-100*np.sum(np.sum(df_study[sel][variables] == '1'))/(sum(sel)*varcount),0))) + else: + print('{}:\t{}/{} records' + .format(inst, + sum(np.sum(df_study[sel][variables] == '1',axis=1) > 0), + sum(sel))) + else: + if False: + print('{}:\t{} entries'.format(inst,sum(sel))) \ No newline at end of file diff --git a/data_query_scripts/ALL_convert_to_age_in_years.py b/data_query_scripts/ALL_convert_to_age_in_years.py index 541fcc1..f0e62df 100644 --- a/data_query_scripts/ALL_convert_to_age_in_years.py +++ b/data_query_scripts/ALL_convert_to_age_in_years.py @@ -13,7 +13,7 @@ import datetime import castorapi as ca -LIVE = True # be very carefull. This updates a huge amount of data!!! +LIVE = False # be very carefull. This updates a huge amount of data!!! 
config = configparser.ConfigParser() config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) diff --git a/data_query_scripts/survival_death.py b/data_query_scripts/survival_death.py index ed5d8aa..3c6e26d 100755 --- a/data_query_scripts/survival_death.py +++ b/data_query_scripts/survival_death.py @@ -20,6 +20,9 @@ from lifelines import * import pandas as pd import numpy as np +from sklearn.experimental import enable_iterative_imputer # Required to enable experimental iterative imputer +from sklearn.impute import SimpleImputer, IterativeImputer +from sklearn.preprocessing import StandardScaler, RobustScaler plt.rcParams["font.family"] = "Times New Roman" matplotlib.rcParams.update({'font.size': 14}) @@ -38,16 +41,52 @@ 'oxygen_saturation':'Saturatie (%)','Temperature':'Temperatuur (ºC)','Antiviral_agent_1':'Behandeling met antivirale middelen (ja)','Oxygen_therapy_1':'zuurstof therapie (ja)','Non_invasive_ventilation_1':'non-invasieve beademing (ja)','Invasive_ventilation_1':'Invasieve beademing (ja)','resucitate':'Niet reanimeren (ja)', 'intubate_yesno':'Niet intuberen (ja)','CT_thorax_performed':'CT thorax verricht op SEH (ja)','corads_admission':'CO-RAD score (1-5)','pcr_pos':'Corona virus PCR + op SEH (ja)','Coagulation_disorder_1_1':'Stollingsstoornis (ja)', 'Acute_renal_injury_Acute_renal_failure_1_1':'Nierfunctie stoornis waarvoor dialyse (ja)','diabetes_complications':'Diabetes met complicaties (ja)','cpd':'chronische longziekte (ja)','Smoking':'Actieve roker (ja)','obesity':'Obesitas (ja)','immune_sup':'immunosupressiva (ja)','ace_i_spec_1':'Gebruik ACE remmer (ja)', - 'sys_bp':'Systolische bloeddruk (mmHg)','dias_bp':'Diastolische bloeddruk (mmHg)','HtR':'hart frequentie (1/min)','capillary_refill':'Verlengde capillaire refill (ja)','rtr':'Ademhalingsfrequentie (1/min)','oxygen_saturation':'Zuurstof saturatie (%)','SaO2_1':'SaO2 (%)','PaO2_1':'PaO2 (kPa)','PCO2_1':'pCO2 (kPa)', - 'fio2_1':'FiO2 (%)','Temperature':'Temperatuur 
(ºC)','eye':'EMV','Antiviral_agent_1':'Anti-virale middelen (ja)','Antibiotic_1':'Antibiotica (ja)','Corticosteroid_1':'Corticosteroiden voor ARDS (ja)','Corticosteroid_type_1':'gebruik corticosteroiden (ja)','Oxygen_therapy_1':'Zuurstof behandeling (ja)', + 'sys_bp':'Systolische bloeddruk (mmHg)','dias_bp':'Diastolische bloeddruk (mmHg)','HtR':'hart frequentie (1/min)','capillary_refill':'Verlengde capillaire refill (ja)','rtr':'Ademhalingsfrequentie (1/min)','oxygen_saturation':'Zuurstof saturatie (%)','SaO2_1':'SaO2 (%)','PaO2_1_Arterial':'PaO2 arteriëel (kPa)', + 'PaO2_1_Venous':'PaO2 veneus (kPa)', 'PaO2_1_Capillary':'PaO2 capillair (kPa)', 'PaO2_1_nan':'PaO2 onbekend (kPa)', 'PCO2_1':'pCO2 (kPa)','fio2_1':'FiO2 (%)','Temperature':'Temperatuur (ºC)','eye':'EMV','Antiviral_agent_1':'Anti-virale middelen (ja)','Antibiotic_1':'Antibiotica (ja)', + 'Corticosteroid_1':'Corticosteroiden voor ARDS (ja)','Corticosteroid_type_1':'gebruik corticosteroiden (ja)','Oxygen_therapy_1':'Zuurstof behandeling (ja)', 'Non_invasive_ventilation_1':'Niet-invasieve beademing (ja)','Invasive_ventilation_1':'Invasieve beademing (ja)','resucitate':'Niet reanimeren (ja)','intubate_yesno':'Niet intuberen (ja)','auxiliary_breathing_muscles':'Gebruik van extra ademhalingsspieren (ja)','fever':'Koorts gehad sinds eerste klachten (ja)', 'Anosmia':'Anosmie (ja)','Rhinorrhoea':'Rhinorrhoea (ja)','Sore_throat':'Keelpijn (ja)','cough_sputum':'Hoest met sputum (ja)','cough_sputum_haemoptysis':'Hoest met bloederig slijm/ haemoptoë (ja)','Arthralgia':'Arthralgie (ja)','Myalgia':'Spierpijn (ja)','Fatigue_Malaise':'Vermoeidheid/ algehele malaise (ja)', 'Abdominal_pain':'Buikpijn (ja)','Vomiting_Nausea':'Misselijkheid/ overgeven (ja)','Diarrhoea':'diarree (ja)','Dyspnea':'Dyspneu (ja)','Wheezing':'Piepende ademhaling (ja)','Chest_pain':'Pijn op de borst (ja)','ear_pain':'oorpijn (ja)','Bleeding_Haemorrhage':'bloeding (ja)','Headache':'Hoofdpijn (ja)', 'confusion':'Veranderd bewustzijn 
(ja)','Seizures':'Insulten (ja)','infiltrates_2':'Infiltraat op X-thorax (ja)','corads_admission':'CO-RADS (1-5)','Glucose_unit_1_1':'glucose (mmol/L)','Sodium_1_1':'Natrium (mmol/L)','Potassium_1_1':'Kalium (mmol/L)','Blood_Urea_Nitrogen_value_1':'Ureum (mmol/L)', 'Creatinine_value_1':'Creatinine (µmol/L)','Calcium_adm':'Ca (totaal) (mmol/L)','ferritine_admin_spec':'ferritine (mg/L)','creatininekinase':'CK (U/L)','d_dimer':'D-Dimeer (nmol/L)','AST_SGOT_1_1':'ASAT (U/L)','ALT_SGPT_1_1':'ALAT (U/L)','Total_Bilirubin_2_1':'totaal Bilirubine (IE)','LDH':'LDH (U/L)', 'PH_value_1':'Zuurgraad (pH)','Lactate_2_1':'Lactaat (mmol/L)','crp_1_1':'CRP (mg/L)','Haemoglobin_value_1':'Hb (mmol/L)','Platelets_value_1':'trombocyten (x10^9/L)','WBC_2_1':'Leukocyten (x10^3/µL)','Lymphocyte_1_1':'Lymfocyten (x10^9/L)','Neutrophil_unit_1':'Neutrofielen (x10^9/L)','INR_1_1':'INR', - 'pt_spec':'PT (sec)','fibrinogen_admin':'Fibrinogeen (g/L)','pcr_pos':'Corona PCR + (ja)','Adenovirus':'Adenovirus (ja)','RSV_':'RS virus (ja)','Influenza':'Influenza virus (ja)','Bacteria':'Sputumkweek (ja)','days_since_onset':'Tijd sinds eerste klachten (dagen)','irregular':'Onregelmatige hartslag (ja)', - 'DNR_yesno':' (ja)','healthcare_worker':'Zorgmedewerker (ja)','infec_resp_diagnosis':'Andere infectieuze ademhalingsdiagnose','Blood_albumin_value_1':'Albumine (g/L)','culture':'Bloedkweek (positief)','APT_APTR_1_1':'APTT (sec)'} + 'pt_spec':'PT (sec)','fibrinogen_admin':'Fibrinogeen (g/L)','pcr_pos':'Corona PCR + (ja)','Adenovirus':'Adenovirus (ja)','RSV_':'RS virus (ja)','Influenza':'Influenza virus (ja)','Bacteria':'Sputumkweek (ja)','days_untreated':'Tijd sinds eerste klachten (dagen)','irregular':'Onregelmatige hartslag (ja)', + 'DNR_yesno':' (ja)','healthcare_worker':'Zorgmedewerker (ja)','infec_resp_diagnosis':'Andere infectieuze ademhalingsdiagnose','Blood_albumin_value_1':'Albumine (g/L)','culture':'Bloedkweek (positief)','APT_APTR_1_1':'APTT (sec)','uses_n_medicine':'Aantal thuismedicamenten'} 
+ + +class StandardScaler_min2cols(StandardScaler): + def __init__(self,columns,copy=True, with_mean=False, with_std=True): + self.scaler = StandardScaler(copy, with_mean=with_mean, with_std=with_std) + self.columns = columns + + def fit(self, X, y=None): + self.scaler.fit(X[self.columns], y) + return self + + def transform(self, X, y=None, copy=None): + init_col_order = X.columns + X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) + X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)] + return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order] + +class RobustScaler_min2cols(RobustScaler): + def __init__(self, columns, copy=True, with_centering=True, + with_scaling=True, quantile_range=(25.0, 75.0)): + self.scaler = RobustScaler(with_centering=with_centering, + with_scaling=with_scaling, + quantile_range=quantile_range, + copy=copy) + self.columns = columns + + def fit(self, X, y=None): + self.scaler.fit(X[self.columns], y) + return self + + def transform(self, X, y=None, copy=None): + init_col_order = X.columns + X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) + X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)] + return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order] def simple_estimates(features): @@ -55,7 +94,6 @@ def simple_estimates(features): E = features['Event death'] fig, axes = plt.subplots(1, 1) - kmf = KaplanMeierFitter().fit(T, E, label='KaplanMeierFitter') kmf.plot_survival_function() axes.set_xlim(0, 21) @@ -89,7 +127,7 @@ def KM_age_groups(features): print('ICU no: n=', np.nansum(was_not_on_icu)) icu = False - special = True + special = False fig, axes = plt.subplots(1, 1) if special: @@ -140,34 +178,76 @@ def KM_age_groups(features): plt.savefig('KM_survival_curve_DEATH.png', format='png', dpi=300, figsize=(20,20), pad_inches=0, bbox_inches='tight') plt.show() + return kmf1, kmf2, kmf3 + + +def cox_ph_loho(features, hazard_ratios=False, + 
imputation_method='median', + minimal_n=10, + pvalue=0.05): + cindex = [] + n = [] + for test_hospital in features['hospital'].unique(): + print(test_hospital) + train_set = features[features['hospital'] != test_hospital] + test_set = features[features['hospital'] == test_hospital] + + if len(test_set) >= minimal_n: + test_set = test_set.drop(columns=['hospital']) + + cph, p, imputer, scaler = cox_ph(train_set, hazard_ratios=False, + imputation_method=imputation_method, + return_scaler_imputer=True) + + test_set = pd.DataFrame(imputer.transform(test_set), columns=test_set.columns) + test_set = scaler.transform(test_set) + + hazard_ratios_sum = cph.summary + print(hazard_ratios_sum) + sel_corr = hazard_ratios_sum[hazard_ratios_sum['p'] <= pvalue] + c = cph.score(test_set, scoring_method="concordance_index") + print('test hospital:', test_hospital, 'test c-index', c) + print(sel_corr) + cindex += [c] + n += [len(test_set)] + else: + print('test hospital:', test_hospital, ' has <= ',minimal_n,' records. 
Cannot determine concordance.') + return cindex, n + +def cox_ph(features, hazard_ratios=False, + imputation_method='median', return_scaler_imputer=False, + show_progress=True, + penalizer=0., + l1_ratio=0.): + train_set = features.drop(columns=['hospital']) # NOTE aids_hiv is an outlier for plotting coefs -def cox_ph_concordance_test(features): - test_hospital = 'MUMC' # this can be MUMC, Zuyderland, or AUMC - AMC - train_set = features[features['hospital'] != test_hospital] - test_set = features[features['hospital'] == test_hospital] - - train_set = train_set.drop(columns=['hospital']) - test_set = test_set.drop(columns=['hospital']) - - cph = CoxPHFitter() - cph.fit(train_set, duration_col='Time to event', event_col='Event death', show_progress=True, step_size=0.1) - - print('with and without ICU') - print('test hospital:', test_hospital) - print('test c-index', cph.score(test_set, scoring_method="concordance_index")) + # impute nan with median; note that time to event and event death cannot contain nan + if imputation_method == 'iterative': + imputer = IterativeImputer(missing_values=np.nan, + initial_strategy='median') + else: + imputer = SimpleImputer(missing_values=np.nan, strategy=imputation_method) + train_set = pd.DataFrame(imputer.fit_transform(train_set), columns=train_set.columns) + # scale data using zscore + scaler = RobustScaler_min2cols(columns=train_set.columns) + train_set = scaler.fit_transform(train_set) -def cox_ph(features, hazard_ratios=False): - train_set = features.drop(columns=['hospital']) # NOTE aids_hiv is an outlier for plotting coefs + cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio) + cph.fit(train_set, duration_col='Time to event', event_col='Event death', show_progress=show_progress, step_size=0.1) - cph = CoxPHFitter() - cph.fit(train_set, duration_col='Time to event', event_col='Event death', show_progress=True, step_size=0.1) + if True: + cph.print_summary() - cph.print_summary() + fig, ax = plt.subplots(figsize=(5, 
25)) + p = cph.plot(hazard_ratios=hazard_ratios) + else: + p = None - fig, ax = plt.subplots(figsize=(5, 25)) - p = cph.plot(hazard_ratios=hazard_ratios) - return cph, p + if return_scaler_imputer: + return cph, p, imputer, scaler + else: + return cph, p def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): hazard_ratios_sum = cph.summary @@ -200,11 +280,14 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): variables_to_exclude += ['Coronavirus'] # duplicate of PCR information data, data_struct = load_data(path_creds) data, data_struct = preprocess(data, data_struct) - features, outcomes, data, hospital, record_id = prepare_for_learning(data, data_struct, - variables_to_include, - variables_to_exclude, - goal, - use_imputation=False) + # %% + features, outcomes, data, hospital, record_id, days_until_death = prepare_for_learning(data, data_struct, + variables_to_include, + variables_to_exclude, + goal, + remove_records_threshold_above=1.0, + remove_features_threshold_above=0.5, + pcr_corona_confirmed_only=False) del outcomes # not relevant in this form. Use features and hospital. # %% STEP 2: CALCULATE and CORRECT ALL TIMINGS. @@ -215,6 +298,9 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): times.loc[data['Levend ontslagen en niet heropgenomen - totaal']] = 21. # or 365 # discharged alive - ALL 3 events.loc[data['Levend ontslagen en niet heropgenomen - totaal']] = False # censor at t=21 days + times.loc[data['Levend dag 21 maar nog in het ziekenhuis - totaal']] = 21. + events.loc[data['Levend dag 21 maar nog in het ziekenhuis - totaal']] = False + # now add the people that are UNKNOWN at 21 days. times.loc[data['Onbekend (alle patiënten zonder outcome)']] = data['days_since_admission_current_hosp'][data['Onbekend (alle patiënten zonder outcome)']] # Onbekend (alle patiënten zonder outcome) == UNKNOWN outcome events.loc[data['Onbekend (alle patiënten zonder outcome)']] = False # censor at time of event. 
@@ -223,6 +309,19 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): times.loc[data['Days until death'] > 21.] = 21. events.loc[data['Days until death'] > 21.] = False # censor at time of event. + # fix remaining issues with events + sel = np.logical_and(times.isna(),data['Outcome_cat_4'] == 1) # transfer + events[sel] = np.nan + sel = np.logical_and(times.isna(),data['Outcome_cat_8'] == 1) # death + events[sel] = False # no information on timing present, hence state as alive at t=0 + times[sel] = 0. + sel = np.logical_and(times.isna(),data['Outcome_cat_10'] == 1) # discharged readmitted + events[sel] = np.nan + + # remaining data: does not have any outcome or follow-up: we only now these patients are alive at t=0. + times[times.isna()] = 0.0 + times[events.isna()] = np.nan + features = pd.concat([features, pd.Series(name='Time to event', data=times), @@ -232,11 +331,11 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): data=hospital)], axis=1) - pd.Series(name='Record Id',data=record_id)[[d not in ['VieCuri'] for d in data['hospital']]] + # pd.Series(name='Record Id',data=record_id)[[d not in ['VieCuri'] for d in data['hospital']]] # drop hospitals without enough data - data.drop(index=features.index[[d in ['VieCuri'] for d in features['hospital']]], inplace=True) - features.drop(index=features.index[[d in ['VieCuri'] for d in features['hospital']]], inplace=True) + # data.drop(index=features.index[[d in ['VieCuri'] for d in features['hospital']]], inplace=True) + # features.drop(index=features.index[[d in ['VieCuri'] for d in features['hospital']]], inplace=True) # now drop the people that have no information regarding durations (very new records). # now drop n/a's in time to event; these are incomplete records that cannot be used. 
@@ -246,25 +345,31 @@ def cox_plot_pvalue_only(cph, pvalue, hazard_ratios=False): # translate feature columns features.rename(columns=englishvar_to_dutchnames, inplace=True) - - # CORRECT SaO2 - features['SaO2 (%)'][features['SaO2 (%)']<= 1.0] = features['SaO2 (%)'][features['SaO2 (%)']<= 1.0] * 100. - - # fi_sa_ratio = (features['SaO2 (%)']/100) / features['FiO2 (%)'] - # fi_sa_ratio[fi_sa_ratio == np.inf] = np.nan - # fi_sa_ratio[fi_sa_ratio == 0.0] = np.nan + # do not model data with > 50% missing data + drop_vars = features.columns[np.sum(features.isnull())/len(features) > 0.5] + features = features.drop(columns=drop_vars) # %% STEP 3: SHOW KM CURVE # simple_estimates(features) # events: True = death, False = censor, None = ? - KM_age_groups(features) + kmf = KM_age_groups(features) + + # %% step 5: validate concordance with LOHO + imputation_method = 'iterative' + hazard_ratios = False + cindex, testsetn = cox_ph_loho(features, hazard_ratios=hazard_ratios, + imputation_method=imputation_method, + minimal_n=25) + print('concordances (mean: {}) = {}'.format(round(np.mean(cindex),3), [round(c,3) for c in cindex])) # %% STEP 4: COX REGRESSION. 
# USE HERE: features, hospital, times, events - cph, plot = cox_ph(features, hazard_ratios=False) + cph, plot = cox_ph(features, hazard_ratios=True, + imputation_method=imputation_method) plot.figure.savefig('Cox_coef_all.png', format='png', dpi=300, figsize=(3,23), pad_inches=0, bbox_inches='tight') # %% - cphplt = cox_plot_pvalue_only(cph, 0.05, hazard_ratios=False) + cphplt = cox_plot_pvalue_only(cph, 0.05, hazard_ratios=True) + # run on command line: cphplt.figure.set_figheight(6) - cphplt.figure.savefig('Cox_coef_pvalue_.05.png', format='png', dpi=300, figsize=(7,9), pad_inches=0, bbox_inches='tight') \ No newline at end of file + cphplt.figure.savefig('Cox_coef_pvalue_.05.png', format='png', dpi=300, figsize=(7,9), pad_inches=0, bbox_inches='tight') diff --git a/data_query_scripts/survival_death_km.py b/data_query_scripts/survival_death_km.py new file mode 100644 index 0000000..540bf2d --- /dev/null +++ b/data_query_scripts/survival_death_km.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Apr 29 20:19:18 2020 +Example to run survival model. 
+@author: wouterpotters +""" +import configparser +import matplotlib +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), './../')) +from covid19_ICU_admission import load_data, preprocess, prepare_for_learning +from get_feature_set import get_1_premorbid, \ + get_2_clinical_presentation, \ + get_3_laboratory_radiology_findings, \ + get_5_premorbid_clin_rep_lab_rad + +import matplotlib.pyplot as plt +from lifelines import KaplanMeierFitter +from lifelines.statistics import logrank_test, pairwise_logrank_test +import pandas as pd +import numpy as np +from sklearn.experimental import enable_iterative_imputer # Required to enable experimental iterative imputer +from sklearn.impute import SimpleImputer, IterativeImputer +from sklearn.preprocessing import StandardScaler, RobustScaler + +plt.rcParams["font.family"] = "Times New Roman" +matplotlib.rcParams.update({'font.size': 14}) + +c1 = ( 57/255, 106/255, 177/255) # blue +c2 = (218/255, 124/255, 48/255) # orange +c3 = ( 83/255, 81/255, 84/255) # black +c4 = ( 62/255, 150/255, 81/255) # green +c5 = (204/255, 37/255, 41/255) # red + +englishvar_to_dutchnames = {'age_yrs':'Leeftijd (jaren)','gender_male':'Geslacht (man)','gender':'Geslacht (man)','hypertension':'Hypertensie (ja)','Coagulation_disorder_1_1':'Stollingsstoornis (ja)','autoimm_disorder_1':'Auto-immuunziekte (ja)','rheuma_disorder':'Reumatologische ziekte (ja)','organ_transplant_1':'Orgaan donatie ondergaan (ja)', + 'diabetes_without_complications':'DM (zonder complicaties) (ja)','diabetes_complications':'DM (met complicaties) (ja)','ccd':'Chronische hartziekte (ja)','ckd':'Chronische nierziekte (ja)','cnd':'Chronische neurologische aandoening (ja)','cpd':'chronische longziekte (ja)','asthma':'Asthma (ja)','Dementia':'dementie (ja)', + 'mld':'Milde leveraandoening (ja)','live_disease':'Matig-ernstige leveraandoening (ja)','mneoplasm':'Maligniteit (ja)','chd':'Chronische hematologische ziekte (ja)','alcohol':'Alcohol abusus 
(ja)','Smoking in History or Active':'Roken actief of in verleden (ja)','obesity':'Obesitas (BMI > 30) (ja)', + 'home_medication':'Thuismedicatie (ja)','immune_sup':'Immunosupressiva (ja)','ace_i_spec_1':'Gebruik ACE inhibitor (ja)','Corticosteroid_type_1':'Corticosteroiden gebruik (ja)','sys_bp':'Systolische bloeddruk (mmHg)','dias_bp':'Diastolische bloeddruk (mmHg)','HtR':'hart frequentie (1/min)','rtr':'Ademhalingsfrequentie (1/min)', + 'oxygen_saturation':'Saturatie (%)','Temperature':'Temperatuur (ºC)','Antiviral_agent_1':'Behandeling met antivirale middelen (ja)','Oxygen_therapy_1':'zuurstof therapie (ja)','Non_invasive_ventilation_1':'non-invasieve beademing (ja)','Invasive_ventilation_1':'Invasieve beademing (ja)','resucitate':'Niet reanimeren (ja)', + 'intubate_yesno':'Niet intuberen (ja)','CT_thorax_performed':'CT thorax verricht op SEH (ja)','corads_admission':'CO-RAD score (1-5)','pcr_pos':'Corona virus PCR + op SEH (ja)','Coagulation_disorder_1_1':'Stollingsstoornis (ja)', + 'Acute_renal_injury_Acute_renal_failure_1_1':'Nierfunctie stoornis waarvoor dialyse (ja)','diabetes_complications':'Diabetes met complicaties (ja)','cpd':'chronische longziekte (ja)','Smoking':'Actieve roker (ja)','obesity':'Obesitas (ja)','immune_sup':'immunosupressiva (ja)','ace_i_spec_1':'Gebruik ACE remmer (ja)', + 'sys_bp':'Systolische bloeddruk (mmHg)','dias_bp':'Diastolische bloeddruk (mmHg)','HtR':'hart frequentie (1/min)','capillary_refill':'Verlengde capillaire refill (ja)','rtr':'Ademhalingsfrequentie (1/min)','oxygen_saturation':'Zuurstof saturatie (%)','SaO2_1':'SaO2 (%)','PaO2_1_Arterial':'PaO2 arteriëel (kPa)', + 'PaO2_1_Venous':'PaO2 veneus (kPa)', 'PaO2_1_Capillary':'PaO2 capillair (kPa)', 'PaO2_1_nan':'PaO2 onbekend (kPa)', 'PCO2_1':'pCO2 (kPa)','fio2_1':'FiO2 (%)','Temperature':'Temperatuur (ºC)','eye':'EMV','Antiviral_agent_1':'Anti-virale middelen (ja)','Antibiotic_1':'Antibiotica (ja)', + 'Corticosteroid_1':'Corticosteroiden voor ARDS 
(ja)','Corticosteroid_type_1':'gebruik corticosteroiden (ja)','Oxygen_therapy_1':'Zuurstof behandeling (ja)', + 'Non_invasive_ventilation_1':'Niet-invasieve beademing (ja)','Invasive_ventilation_1':'Invasieve beademing (ja)','resucitate':'Niet reanimeren (ja)','intubate_yesno':'Niet intuberen (ja)','auxiliary_breathing_muscles':'Gebruik van extra ademhalingsspieren (ja)','fever':'Koorts gehad sinds eerste klachten (ja)', + 'Anosmia':'Anosmie (ja)','Rhinorrhoea':'Rhinorrhoea (ja)','Sore_throat':'Keelpijn (ja)','cough_sputum':'Hoest met sputum (ja)','cough_sputum_haemoptysis':'Hoest met bloederig slijm/ haemoptoë (ja)','Arthralgia':'Arthralgie (ja)','Myalgia':'Spierpijn (ja)','Fatigue_Malaise':'Vermoeidheid/ algehele malaise (ja)', + 'Abdominal_pain':'Buikpijn (ja)','Vomiting_Nausea':'Misselijkheid/ overgeven (ja)','Diarrhoea':'diarree (ja)','Dyspnea':'Dyspneu (ja)','Wheezing':'Piepende ademhaling (ja)','Chest_pain':'Pijn op de borst (ja)','ear_pain':'oorpijn (ja)','Bleeding_Haemorrhage':'bloeding (ja)','Headache':'Hoofdpijn (ja)', + 'confusion':'Veranderd bewustzijn (ja)','Seizures':'Insulten (ja)','infiltrates_2':'Infiltraat op X-thorax (ja)','corads_admission':'CO-RADS (1-5)','Glucose_unit_1_1':'glucose (mmol/L)','Sodium_1_1':'Natrium (mmol/L)','Potassium_1_1':'Kalium (mmol/L)','Blood_Urea_Nitrogen_value_1':'Ureum (mmol/L)', + 'Creatinine_value_1':'Creatinine (µmol/L)','Calcium_adm':'Ca (totaal) (mmol/L)','ferritine_admin_spec':'ferritine (mg/L)','creatininekinase':'CK (U/L)','d_dimer':'D-Dimeer (nmol/L)','AST_SGOT_1_1':'ASAT (U/L)','ALT_SGPT_1_1':'ALAT (U/L)','Total_Bilirubin_2_1':'totaal Bilirubine (IE)','LDH':'LDH (U/L)', + 'PH_value_1':'Zuurgraad (pH)','Lactate_2_1':'Lactaat (mmol/L)','crp_1_1':'CRP (mg/L)','Haemoglobin_value_1':'Hb (mmol/L)','Platelets_value_1':'trombocyten (x10^9/L)','WBC_2_1':'Leukocyten (x10^3/µL)','Lymphocyte_1_1':'Lymfocyten (x10^9/L)','Neutrophil_unit_1':'Neutrofielen (x10^9/L)','INR_1_1':'INR', + 'pt_spec':'PT 
(sec)','fibrinogen_admin':'Fibrinogeen (g/L)','pcr_pos':'Corona PCR + (ja)','Adenovirus':'Adenovirus (ja)','RSV_':'RS virus (ja)','Influenza':'Influenza virus (ja)','Bacteria':'Sputumkweek (ja)','days_untreated':'Tijd sinds eerste klachten (dagen)','irregular':'Onregelmatige hartslag (ja)', + 'DNR_yesno':' (ja)','healthcare_worker':'Zorgmedewerker (ja)','infec_resp_diagnosis':'Andere infectieuze ademhalingsdiagnose','Blood_albumin_value_1':'Albumine (g/L)','culture':'Bloedkweek (positief)','APT_APTR_1_1':'APTT (sec)','uses_n_medicine':'Aantal thuismedicamenten'} + +import unicodedata +import string + +valid_filename_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) +char_limit = 255 + +def clean_filename(filename, whitelist=valid_filename_chars, replace=' '): + # replace spaces + for r in replace: + filename = filename.replace(r,'_') + + # keep only valid ascii chars + cleaned_filename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore').decode() + + # keep only whitelisted chars + cleaned_filename = ''.join(c for c in cleaned_filename if c in whitelist) + if len(cleaned_filename)>char_limit: + print("Warning, filename truncated because it was over {}. 
Filenames may no longer be unique".format(char_limit)) + return cleaned_filename[:char_limit] + + +class StandardScaler_min2cols(StandardScaler): + def __init__(self,columns,copy=True, with_mean=False, with_std=True): + self.scaler = StandardScaler(copy, with_mean=with_mean, with_std=with_std) + self.columns = columns + + def fit(self, X, y=None): + self.scaler.fit(X[self.columns], y) + return self + + def transform(self, X, y=None, copy=None): + init_col_order = X.columns + X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) + X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)] + return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order] + +class RobustScaler_min2cols(RobustScaler): + def __init__(self, columns, copy=True, with_centering=True, + with_scaling=True, quantile_range=(25.0, 75.0)): + self.scaler = RobustScaler(with_centering=with_centering, + with_scaling=with_scaling, + quantile_range=quantile_range, + copy=copy) + self.columns = columns + + def fit(self, X, y=None): + self.scaler.fit(X[self.columns], y) + return self + + def transform(self, X, y=None, copy=None): + init_col_order = X.columns + X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) + X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)] + return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order] + + +def KM_age_groups(features, outcomes): + T = outcomes['duration_mortality'] + E = outcomes['event_mortality'] + + features['Leeftijd (jaren)'] = features['Leeftijd (jaren)'].astype(float) + age_70_plus = features['Leeftijd (jaren)'] >= 70. 
+ print('Alle patiënten: n=', len(age_70_plus)) + print('Leeftijd ≥ 70: n=', sum(age_70_plus)) + print('Leeftijd < 70: n=', sum(~age_70_plus)) + + was_not_on_icu = data[['Levend ontslagen en niet heropgenomen - waarvan niet opgenomen geweest op IC', + 'Levend dag 21 maar nog in het ziekenhuis - niet op IC geweest', + 'Dood op dag 21 - niet op IC geweest']].any(axis=1) + was_on_icu = data[['Levend ontslagen en niet heropgenomen - waarvan opgenomen geweest op IC', + 'Levend dag 21 maar nog in het ziekenhuis - op IC geweest', + 'Levend dag 21 maar nog in het ziekenhuis - waarvan nu nog op IC', + 'Dood op dag 21 - op IC geweest']].any(axis=1) + print('Alle patiënten: n=', (np.nansum(was_on_icu)+np.nansum(was_not_on_icu))) + print('ICU yes: n=', np.nansum(was_on_icu)) + print('ICU no: n=', np.nansum(was_not_on_icu)) + + icu = False + fig, axes = plt.subplots(1, 1) + + if icu: + kmf1 = KaplanMeierFitter().fit(T[was_on_icu], E[was_on_icu], label='Wel op ICU geweest') + kmf2 = KaplanMeierFitter().fit(T[was_not_on_icu], E[was_not_on_icu], label='Niet op ICU geweest') + else: # age 70 + kmf1 = KaplanMeierFitter().fit(T[age_70_plus], E[age_70_plus], label='Leeftijd ≥ 70 jaar') + kmf2 = KaplanMeierFitter().fit(T[~age_70_plus], E[~age_70_plus], label='Leeftijd < 70 jaar') + + kmf3 = KaplanMeierFitter().fit(T, E, label='Alle patienten') + if icu: + kmf1.plot_survival_function(color=c4) + kmf2.plot_survival_function(color=c5) + kmf3.plot_survival_function(color=c3) + else: + kmf1.plot_survival_function(color=c1) + kmf2.plot_survival_function(color=c2) + kmf3.plot_survival_function(color=c3) + + + axes.set_xticks([1, 5, 9, 13, 17, 21]) + axes.set_xticklabels(['1','5','9','13','17','21']) + axes.set_xlabel('Aantal dagen sinds opnamedag') + axes.set_ylabel('Proportie overlevend') + + axes.set_xlim(0, 21) + axes.set_ylim(0, 1) + titledict = {'fontsize': 18, + 'fontweight': 'bold', + 'verticalalignment': 'baseline', + 'horizontalalignment': 'center'} + # plt.title('COVID-PREDICT 
survival functie tot t = 21 dagen (n='+str(len(T))+')',fontdict=titledict) + plt.tight_layout() + if icu: + plt.savefig('KM_survival_curve_ICU.png', format='png', dpi=300, figsize=(20,20), pad_inches=0, bbox_inches='tight') + else: + plt.savefig('KM_survival_curve_DEATH.png', format='png', dpi=300, figsize=(20,20), pad_inches=0, bbox_inches='tight') + plt.show() + + return kmf1, kmf2, kmf3 + + +def KM_all_vars(features, outcomes): + T = outcomes['duration_mortality'] + E = outcomes['event_mortality'] + + is_binary = features.columns[['ja' in f for f in features.columns]] + is_binary = is_binary.append(features.columns[['positief' in f for f in features.columns]]) + is_binary = is_binary.append(features.columns[['Andere infectieuze ademhalingsdiagnose' in f for f in features.columns]]) + is_binary = is_binary.append(features.columns[['oxygen_saturation_on_cat_' in f for f in features.columns]]) + is_float = ['ALAT (U/L)', 'ASAT (U/L)', 'Ureum (mmol/L)', 'Albumine (g/L)', + 'Ca (totaal) (mmol/L)', 'Creatinine (µmol/L)', 'glucose (mmol/L)', + 'Hb (mmol/L)', 'hart frequentie (1/min)', 'LDH (U/L)', 'Lactaat (mmol/L)', + 'Lymfocyten (x10^9/L)', 'Neutrofielen (x10^9/L)', 'pCO2 (kPa)', + 'Zuurgraad (pH)', 'PaO2 arteriëel (kPa)', 'trombocyten (x10^9/L)', + 'Kalium (mmol/L)', 'SaO2 (%)', 'Natrium (mmol/L)', 'Temperatuur (ºC)', + 'totaal Bilirubine (IE)', 'Leukocyten (x10^3/µL)', 'CK (U/L)', + 'CRP (mg/L)', 'Tijd sinds eerste klachten (dagen)', 'Diastolische bloeddruk (mmHg)', + 'ferritine (mg/L)', 'FiO2 (%)', 'Zuurstof saturatie (%)', + 'Ademhalingsfrequentie (1/min)', 'Systolische bloeddruk (mmHg)', + 'Aantal thuismedicamenten'] + is_categorical_corads = ['corads_admission_cat_1', 'corads_admission_cat_2', + 'corads_admission_cat_3', 'corads_admission_cat_4', + 'corads_admission_cat_5'] + is_categorical_male_female = ['gender_cat_1', 'gender_cat_2'] + ngroups = 3 + statres = [] + for f in features.columns: + print(f) + kmfs = None + fig, axes = plt.subplots(1, 1) + 
features[f] = features[f].astype(float) + if f in is_binary: + yes = features[f] == 1 + if sum(yes) > 0: + kmf_yes = KaplanMeierFitter().fit(T[yes], E[yes], label='Ja (n={})'.format(np.nansum(yes))) + + no = features[f] == 0 + if sum(no) > 0: + kmf_no = KaplanMeierFitter().fit(T[no], E[no], label='Nee (n={})'.format(np.nansum(no))) + + na = features[f].isna() + if sum(na) > 0: + kmf_na = KaplanMeierFitter().fit(T[na], E[na], label='Onbekend (n={})'.format(np.nansum(na))) + + kmfs = [kmf_yes, kmf_no, kmf_na] + # event_durations (iterable) – a (n,) list-like representing the (possibly partial) durations of all individuals + # groups (iterable) – a (n,) list-like of unique group labels for each individual. + # event_observed (iterable, optional) – a (n,) list-like of event_observed events: 1 if observed death, 0 if censored. Defaults to all observed. + # t_0 (float, optional (default=-1)) – the period under observation, -1 for all time. + groups = features[f].astype("category") + event_durations = T + event_observed = E + + elif f in is_float: + ff = pd.qcut(features[f], ngroups, labels=np.arange(ngroups)+1) + kmfs = [] + for q in ff.unique(): + sel = ff == q + if sum(sel) > 0: + kmf_q = KaplanMeierFitter().fit(T[sel], E[sel], label='{} (n={})'.format(q, np.nansum(sel))) + kmfs += [kmf_q] + sel = ff.isna() + if sum(sel) > 0: + q = 'Onbekend' + kmf_q = KaplanMeierFitter().fit(T[sel], E[sel], label='{} (n={})'.format(q, np.nansum(sel))) + kmfs += [kmf_q] + groups = ff + event_durations = T + event_observed = E + + elif f in is_categorical_corads: + kmfs = [] + for f in is_categorical_corads: + sel = features[f] + kmf_f = KaplanMeierFitter().fit(T[sel], E[sel], label='{} (n={})'.format(f, np.nansum(sel))) + kmfs += [kmf_f] + na = features[is_categorical_corads].any(axis=1) == False + if sum(na) > 0: + kmf_na = KaplanMeierFitter().fit(T[na], E[na], label='Onbekend (n={})'.format(np.nansum(na))) + groups = is_categorical_corads#.astype("category") + event_durations = T + 
event_observed = E + + statres += [None] # FIXME + continue # FIXME + + elif f in is_categorical_male_female: + kmfs = [] + for f in is_categorical_male_female: + sel = features[f] == 1.0 + kmf_f = KaplanMeierFitter().fit(T[sel], E[sel], label='{} (n={})'.format(f, np.nansum(sel))) + kmfs += [kmf_f] + na = features[is_categorical_male_female].any(axis=1) == False + if sum(na) > 0: + kmf_na = KaplanMeierFitter().fit(T[na], E[na], label='Onbekend (n={})'.format(np.nansum(na))) + groups = is_categorical_male_female#.astype("category") + event_durations = T + event_observed = E + + statres += [None] # FIXME + continue # FIXME + + else: + # split data in X groups + print('{} not implemented'.format(f)) + + groups = groups.cat.add_categories(-1).fillna(-1) # treat nan as separate group. + + if len(np.unique(groups)) < 5: + ss = [pairwise_logrank_test(event_durations, groups, event_observed, t_0=21.)] + p = np.min([s.p_value for s in ss]) + print(p) + if p * 289 < 0.05: + print('{} < 0.05 (p: {}, corrected p: {})'.format(f, p, p*289)) + statres += ss + else: + print('error in groups: {} - {}'.format(f, groups)) + + if kmfs: + [k.plot_survival_function() for k in kmfs] + + axes.set_xticks([1, 5, 9, 13, 17, 21]) + axes.set_xticklabels(['1', '5', '9', '13', '17', '21']) + axes.set_xlabel('Aantal dagen sinds opnamedag') + axes.set_ylabel('Proportie overlevend') + + axes.set_xlim(0, 21) + axes.set_ylim(0, 1) + plt.title('Kaplan-Meier survival - {}'.format(f)) + plt.tight_layout() + + filename = clean_filename('KM_{}.png'.format(f)) + plt.show() + fig.savefig(os.path.join('km_curves', filename), format='png', dpi=300, figsize=(20, 20), pad_inches=0, bbox_inches='tight') + + return statres + + +if __name__ == '__main__': + config = configparser.ConfigParser() + config.read('../user_settings.ini') + path_creds = config['CastorCredentials']['local_private_path'] + + # %% STEP 1: GET THE DATA USING MAARTEN'S FUNCTIONS + # For more info: please check covid19_ICU_util.py:select_x_y() + 
goal = ['survival', 'mortality_all'] + + # Add all 'Field Variable Name' from data_struct to + # INCLUDE variables from analysis + # NOTE: See get_feature_set.py for preset selections + variables_to_include = { + 'Form Collection Name': [], # groups + 'Form Name': [], # variable subgroups + 'Field Variable Name': get_5_premorbid_clin_rep_lab_rad() # single variables + } + + # Add all 'Field Variable Name' from data_struct to + # EXCLUDE variables from analysis, because <10 yes or no answers are present. + variables_to_exclude = ['Cardiac_consultation','Meningitis_Encephalitis_1_1','CAR_MH_AF','susp_pe','FH_Cor','CP_oedema','Seizure_1_1','MH_cardiacdevice','ear_pain','MH_SVT_AF_1','CAR_T0_PALP','Inhaled_Nitric_Oxide_1','MH_SVT_AFL','Outcome_7d_hosp','Endocarditis_Myocarditis_Pericarditis_1_1','MH_SVT_AF','Adenovirus','VAP','pregyn_rptestcd','carmed_ami','CAR_CM_heartfailure','CP_ortho','vasc_lipidlowering','discharge_AKI','MH_CABG','carmed_cipro','microbiology_worker','discharge_ards','post_partum','MH_HF','CAR_PM','cvrisk_famhist','dx_his_acq_LQT_1','CAR_MH_aLQT','MH_ACS','discharge_CT','vasc_creat','same_id','Med_New_QT_Prol','carmed_arb_chronic','CP_sync','Adm_CT','MH_SVT_AFL_1','CRF_QT','echo_FU','MH_PCI','CAR_T0_PE','ECG_adm','aids_hiv','discharge_swap','CP_PE','MH_ischemia_det','vasc_antifxa_measured','ECG_adm_ST','carmed_cita','corona_ieorres','Pregnancy'] + variables_to_exclude += ['Coronavirus'] # duplicate of PCR information + data, data_struct = load_data(path_creds) + data, data_struct = preprocess(data, data_struct) + + features, outcomes, data, hospital, record_id, days_until_death = prepare_for_learning(data, data_struct, + variables_to_include, + variables_to_exclude, + goal, + remove_records_threshold_above=None, + remove_features_threshold_above=0.75, + pcr_corona_confirmed_only=False) + # # translate feature columns + features.rename(columns=englishvar_to_dutchnames, inplace=True) + + # %% STEP 2: SHOW KM CURVE + # kmf = KM_age_groups(features, 
outcomes) + kmf_all = KM_all_vars(features, outcomes) diff --git a/error_replacements.py b/error_replacements.py index acdc1fd..3a52c5c 100644 --- a/error_replacements.py +++ b/error_replacements.py @@ -31,7 +31,9 @@ def get_column_fix_dict(): ''' return { 'oxygentherapy_1': [[-98, None]], - 'Smoking': [[-99, None]] + 'Smoking': [[-99, None]], + 'whole_admission_yes_no': [['1;1', None], ['1;2', None]], + 'whole_admission_yes_no_1': [['1;1', None], ['1;2', None]] } diff --git a/generate_figures.py b/generate_figures.py new file mode 100644 index 0000000..3f756d4 --- /dev/null +++ b/generate_figures.py @@ -0,0 +1,469 @@ +import os + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from sklearn.metrics import confusion_matrix +from sklearn.metrics import roc_auc_score + +import seaborn as sns + +DPI = 600 + +def get_distance_to_corner(y_hat, y): + # np.sqrt((1-sensitivity)**2 + (1-specificity)**2) + # sensitivity (TPR) = TP / (TP+FN) + # specificity (TNR) = TN / (TN+FP) + y_hat = get_cm_label(y, y_hat) + sens = (y_hat==1).sum() / ((y_hat==1).sum() + (y_hat==4).sum()) + spec = (y_hat==2).sum() / ((y_hat==2).sum() + (y_hat==3).sum()) + d = np.sqrt((1-sens)**2 + (1-spec)**2) + return d + +def get_fpr_thr(y, y_hat, fpr_goal=.1, steps=100): + # Finds the threshold corresponding with the + # goal FPR + best_error = 100 + best_thr = 100 + for thr in np.linspace(0, 1, steps+1): + yh = pd.Series(0, index=y.index) + yh.loc[y_hat>thr] = 1 + yh = get_cm_label(y, yh) + fpr = yh[yh==3].count() / (yh[yh==3].count() + yh[yh==2].count()) + error = abs(fpr_goal - fpr) + if error < best_error: + best_error = error + best_thr = thr + result = pd.Series(0, index=y.index, name='y_hat_{}'.format(fpr_goal)) + result.loc[y_hat>best_thr] = 1 + return result, best_thr + +def get_confusion_matrix(y_hat): + # Y_hat labeled by tp/tn/fp/fn + # [[tn, fp], [fn, tp]] + return [[(y_hat==2).sum(), (y_hat==3).sum()], + [(y_hat==4).sum(), (y_hat==1).sum()]] + +def get_cm_label(y, 
y_hat): + result = pd.Series(None, index=y.index) + result.loc[(y==1) & (y_hat==1)] = 1 # TP + result.loc[(y==0) & (y_hat==0)] = 2 # TN + result.loc[(y==0) & (y_hat==1)] = 3 # FP + result.loc[(y==1) & (y_hat==0)] = 4 # FN + return result + +def get_yhats(y, y_hat, fpr_goals): + yhs = [] + thresholds = [] + for goal in fpr_goals: + yh, threshold = get_fpr_thr(y, y_hat, fpr_goal=goal, steps=100) + yh = get_cm_label(y, yh) + yhs += [yh] + thresholds += [threshold] + + # Include optimal threshold + thresholds = np.linspace(0, 1, 101) + best_thr = thresholds[np.argmin([get_distance_to_corner(y_hat>thr, y) + for thr in thresholds])] + yhs += [get_cm_label(y, y_hat>best_thr)] + threshold += [best_thr] + return pd.concat(yhs, axis=1), thresholds + +def get_avg_ci(arr): + avg = arr.mean(axis=0) + std = arr.std(axis=0) + ci = 1.96*std/np.sqrt(arr.shape[0]) + return np.append(avg, ci) + +def get_results(y, y_hat): + auc = roc_auc_score(y, y_hat) + cm = get_confusion_matrix(get_cm_label(y, y_hat)) + return [auc, cm[0][0], cm[0][1], cm[1][0], cm[1][1]] + +def get_simple_model_performance(y, y_hat_lr, y_hat_xgb, hospital): + age = pd.read_excel('age.xlsx').iloc[:, 1] + y_hat_70 = age > 70 + y_hat_80 = age > 80 + sz = min(y_hat_lr.size, y_hat_xgb.size, y_hat_70.size, y_hat_80.size, y.size, hospital.size) + hospital = hospital[:sz].reset_index(drop=True) + y = y.iloc[:sz].reset_index(drop=True) + y_hat_lr = y_hat_lr.iloc[:sz].reset_index(drop=True) + y_hat_xgb = y_hat_xgb.iloc[:sz].reset_index(drop=True) + y_hat_70 = y_hat_70.iloc[:sz].reset_index(drop=True) + y_hat_80 = y_hat_80.iloc[:sz].reset_index(drop=True) + + + # TODO: AUC CHANGES BY TURNING BINARY WITH THRESHOLD + thresholds = np.linspace(0, 1, 101) + best_thr = thresholds[np.argmin([get_distance_to_corner(y_hat_xgb>thr, y) + for thr in thresholds])] + y_hat_xgb = y_hat_xgb>best_thr + best_thr = thresholds[np.argmin([get_distance_to_corner(y_hat_lr>thr, y) + for thr in thresholds])] + y_hat_lr = y_hat_lr>best_thr + + 
unique_hospitals = hospital.unique() + result_70 = [] + result_80 = [] + result_lr = [] + result_xgb = [] + for h in unique_hospitals: + mask = hospital==h + result_70 += [get_results(y[mask], y_hat_70[mask])] + result_80 += [get_results(y[mask], y_hat_80[mask])] + result_lr += [get_results(y[mask], y_hat_lr[mask])] + result_xgb += [get_results(y[mask], y_hat_xgb[mask])] + + result = pd.DataFrame(index=['result_70', 'result_80', 'result_lr', 'result_xgb'], + columns=['auc', 'tn', 'fp', 'fn', 'tp', + 'auc_ci', 'tn_ci', 'fp_ci', 'fn_ci', 'tp_ci']) + result_70 = get_avg_ci(np.asarray(result_70)) + result_80 = get_avg_ci(np.asarray(result_80)) + result_lr = get_avg_ci(np.asarray(result_lr)) + result_xgb = get_avg_ci(np.asarray(result_xgb)) + + +def plot_conf_mats(y, ys, hospitals, name, savepath): + fprs = [] + # Overal + + fig, ax = plt.subplots(1, ys.shape[1], sharey=True, figsize=(8, 4)) + for i, thr in enumerate(ys.columns): + y_hat = ys[thr] + tp = (y_hat==1).sum() + tn = (y_hat==2).sum() + fp = (y_hat==3).sum() + fn = (y_hat==4).sum() + df_cm = pd.DataFrame([[tn, fp], [fn, tp]], + index=[0, 1], columns=[0, 1]) + fpr = fp/(fp+tn) + fprs += [fpr] + sns.heatmap(df_cm, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=ax[i]) + # ax[i].set_xlabel('Threshold: {}'.format(thr.split('_')[2])) + ax[i].set_title('FPR: {:.2f}'.format(fpr)) + ax[i].set_aspect('equal', 'box') + if i==0: + ax[i].set_ylabel('True label'.format(y_hat.size)) + + plt.suptitle('Confusion matrix\n{} features'.format(name)) + fig.text(0.5, 0.2, 'Predicted label', ha='center', fontstyle='normal') + # fig.text(0.04, 0.5, 'True label\nn={}'.format(y_hat.size), + # va='center', rotation='vertical') + + fig.savefig(savepath + 'confusionmatrices_total.png', dpi=DPI) + + # Per hospital + hosps = hospitals.unique() + fig, ax = plt.subplots(hosps.size, ys.shape[1], + sharex=True, sharey=True, + figsize=(5, 9)) + for i, hosp in enumerate(hosps): + # y_h = y.loc[hospitals==hosp] + ys_h = 
ys.loc[hospitals==hosp, :] + for j, thr in enumerate(ys.columns): + y_hat = ys_h[thr] + tp = (y_hat==1).sum() + tn = (y_hat==2).sum() + fp = (y_hat==3).sum() + fn = (y_hat==4).sum() + df_cm = pd.DataFrame([[tn, fp], [fn, tp]], + index=[0, 1], columns=[0, 1]) + # fpr = fp/(fp + tn) # == fpr per hospital per threshold + sns.heatmap(df_cm, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=ax[i, j]) + + if i == 0: + ax[i, j].set_title('Overall FPR: {:.2f}'.format(fprs[j]), fontsize=8) + if j == 0: + ax[i, j].set_ylabel('{}\nn={}'.format(hosp, y_hat.size), fontsize=6) + ax[i, j].set_aspect('equal', 'box') + + plt.suptitle('Confusion matrix per hospital per threshold\nFeatures: {}'.format(name)) + fig.text(0.5, 0.04, 'Predicted label', ha='center') + fig.text(0.04, 0.5, 'True label', va='center', rotation='vertical') + fig.savefig(savepath + 'confusion_matrices_per_hosptal.png', dpi=DPI) + +def plot_dists(x, y, thresholds, savepath, + auc=None, histogram=False, kde=True): + + title = 'Class distribution per variable' + if auc != None: + title += ' - auc: {:.3f}'.format(auc) + + n_cols = 5 + n_rows = 2 + for i, column in enumerate(x.columns.to_list()): + row = (i//n_cols)%2 + col = i%12 + page = i%(n_rows*n_cols) + + if i%(n_rows*n_cols)==0: + fig, axes = plt.subplots(n_rows, n_cols, sharex=False, sharey=False) + fig.suptitle('{} (page {}/{})'.format(title, page, len(x.columns)//(n_cols*n_rows)+1)) + + sns.distplot(x.loc[y==0, column], hist=histogram, kde=kde, ax=axes[row, col], color='b', label='Alive') + sns.distplot(x.loc[y==1, column], hist=histogram, kde=kde, ax=axes[row, col], color='r', label='Deceased') + + for k, thr in enumerate(thresholds): + axes[row, col].axvline(thr, linewidth=1, label='threshold: {:.2f}'.format(thr)) + + axes[row, col].set_title(column, fontsize=7) + axes[row, col].set_xlabel('') + axes[row, col].set_ylabel('') + + # axes[row, col].get_legend().set_visible(False) + if row == n_rows-1 & col==0: + handles, labels = axes[row, 
col].get_legend_handles_labels() + fig.legend(handles, labels, loc='upper right') + fig.savefig(savepath + 'feature_distribution_p{}'.format(page), dpi=DPI) + + plt.show() + +def plot_correct_per_day(y, y_hat, dto, thresholds, name='', savepath='./'): + fig, ax = plt.subplots(1, 1, figsize=(16, 9)) + for i, col in enumerate(y_hat.columns): + if i < y_hat.columns.size-1: + continue + y = pd.Series(0, index=y_hat.index) + y[y_hat[col].isin([1, 2])] = 1 + can_use = dto.notna() & (dto <= 21) & (dto >= 0) + + pos = pd.Series(dto[(y==1) & can_use].value_counts().sort_index(), + index=list(range(0, 22))).fillna(0) + neg = pd.Series(dto[(y==0) & can_use].value_counts().sort_index(), + index=list(range(0, 22))).fillna(0) + rel = pos / (pos + neg) + + df = pd.concat([pos, neg, rel], axis=1) + df.columns = ['Correct', 'Incorrect', 'Relative'] + + df.iloc[:, 0:2].plot.bar(ax=ax, legend=False) + ax2 = ax.twinx() + df.iloc[:, -1].plot.bar(ax=ax2, color='g', alpha=0.2, label='Relative') + + ax.set_xlabel('Day') + ax.set_ylabel('Count') + ax2.set_ylabel('Relative correct') + ax.set_title('Prediction per day - {} features'.format(name)) + + bar, labels = ax.get_legend_handles_labels() + bar2, labels2 = ax2.get_legend_handles_labels() + ax2.legend(bar+bar2, labels+labels2) + fig.savefig(savepath+'prediction_per_day_{}'.format(name), dpi=DPI) + +def plot_cm_simple_baseline(y, y_hat, y_hat_xgb, feat, cutoff, name, savepath): + age = pd.read_excel('age.xlsx').iloc[:, 1] + y_hat_s = age > 70 + y_hat_s2 = age > 80 + sz = min(y_hat_s.size, y_hat_xgb.size, y_hat_s2.size, y_hat.size, y.size) + y = y.iloc[:sz] + y_hat = y_hat.iloc[:sz] + y_hat_xgb = y_hat_xgb.iloc[:sz] + y_hat_s = y_hat_s.iloc[:sz] + y_hat_s2 = y_hat_s2.iloc[:sz] + + auc_s = roc_auc_score(y, y_hat_s) + auc_s2 = roc_auc_score(y, y_hat_s2) + auc_p = roc_auc_score(y, y_hat) + auc_xgb = roc_auc_score(y, y_hat_xgb) + + # What happens here? 
+ cm_s = get_confusion_matrix(get_cm_label(y, y_hat_s)) + cm_s2 = get_confusion_matrix(get_cm_label(y, y_hat_s2)) + fpr_s = cm_s[0][1] / (cm_s[0][1] + cm_s[0][0]) + fpr_s2 = cm_s2[0][1] / (cm_s2[0][1] + cm_s2[0][0]) + + _, best_thr_s = get_fpr_thr(y, y_hat, fpr_goal=fpr_s) # this is unecessary? + + + thresholds = np.linspace(0, 1, 101) + + # LR + best_thr = thresholds[np.argmin([get_distance_to_corner(y_hat>thr, y) + for thr in thresholds])] + y_hat = y_hat>best_thr + cm_p = get_confusion_matrix(get_cm_label(y, y_hat)) + fpr = cm_p[0][1] / (cm_p[0][1] + cm_p[0][0]) + + # XGB + best_thr_xgb = thresholds[np.argmin([get_distance_to_corner(y_hat_xgb>thr, y) + for thr in thresholds])] + y_hat_xgb = y_hat_xgb>best_thr + cm_xgb = get_confusion_matrix(get_cm_label(y, y_hat_xgb)) + fpr_xgb = cm_xgb[0][1] / (cm_xgb[0][1] + cm_xgb[0][0]) + + fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(12, 12)) + + # LR + sns.heatmap(cm_p, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=axes[0, 0]) + # optimal threshold (shortest distance to upperleft corner) + axes[0, 0].set_title('LR \n AUC: {:.2f} - FPR:{:.2f}'.format(auc_p, fpr)) + axes[1, 1].set_ylabel('True label') + # axes[0, 0].set_xlabel('Predicted label') + axes[0, 0].set_aspect('equal', 'box') + + # XGB + sns.heatmap(cm_xgb, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=axes[0, 1]) + # optimal threshold (shortest distance to upperleft corner) + axes[0, 1].set_title('XGB \n AUC: {:.2f} - FPR:{:.2f}'.format(auc_xgb, fpr_xgb)) + # axes[0, 1].set_xlabel('Predicted label') + axes[0, 1].set_aspect('equal', 'box') + + # age>70 + sns.heatmap(cm_s, annot=True, fmt="d", + cmap=plt.get_cmap('Blues'), cbar=False, + ax=axes[1, 0]) + axes[1, 0].set_title('Age>70\nAUC: {:.2f} - FPR:{:.2f}'.format(auc_s, fpr_s)) + axes[1, 0].set_ylabel('True label') + axes[1, 0].set_xlabel('Predicted label') + axes[1, 0].set_aspect('equal', 'box') + + # age>80 + sns.heatmap(cm_s2, annot=True, fmt="d", 
+ cmap=plt.get_cmap('Blues'), cbar=False, + ax=axes[1, 1]) + axes[1, 1].set_title('Age>80\nAUC: {:.2f} - FPR:{:.2f}'.format(auc_s2, fpr_s2)) + axes[1, 1].set_xlabel('Predicted label') + axes[1, 1].set_aspect('equal', 'box') + + plt.suptitle('Featureset: {}'.format(name)) + fig.savefig(savepath+'compared_with_simple_baseline_{}'.format(name), dpi=DPI) + +feature_sets = { + 'pm': 'Premorbid', + 'cp': 'Clinical Presentation', + 'lab': 'Laboratory and Radiology', + 'pmcp': 'Premorbid + Clinical Presentation', + 'all': 'All', + 'k10': '10 best' + } + +path = r'C:\Users\p70066129\Projects\COVID-19 CDSS\results_output\logreg/' +path = r'C:\Users\p70066129\Projects\COVID-19 CDSS\results/' +path = r'C:\Users\p70066129\Projects\COVID-19 CDSS\results_final\results_lr_pipeline_random_50/' +path = r'C:\Users\p70066129\Projects\COVID-19 CDSS\FINAL\LR/' +path_xgb = r'C:\Users\p70066129\Projects\COVID-19 CDSS\FINAL\XGB/' +files = [file for file in os.listdir(path) if '.pkl' in file] +folders = [fol for fol in os.listdir(path) if os.path.isdir(path + fol)] +for file in files: + # if 'k10' not in file: + # continue + fullpath = path + file + fullpath_xgb = path_xgb + file + file = file.split('_') + # fset = file[1][-3:] if 'k10' in file else file[1] + fset = file[1] + n = file[2] + y = file[3] + name = feature_sets[fset] + + if name not in folders: + os.mkdir(path+name) + savepath = path + name + '/' + + df = pd.read_pickle(fullpath) + df_xgb = pd.read_pickle(fullpath_xgb) + + hospital = df['hospital'] + dto = df['days_until_death'] + y = df['y'] + y_hat = df['y_hat'] + y_hat_xgb = df_xgb['y_hat'] + x = df.drop(['hospital', 'days_until_death', 'y', 'y_hat'], axis=1) + + + get_simple_model_performance(y, y_hat, y_hat_xgb, hospital) + + # df = df.fillna(-99) + auc = roc_auc_score(y, y_hat) + + if 'age_yrs' in x.columns: + plot_cm_simple_baseline(y, y_hat, y_hat_xgb, x['age_yrs'], .65, name, savepath) # NOT COMPLETELY TRUE + + + + # GET FPRs + fpr_goals = [0, 0.05, 0.1] + y_hats, 
thresholds = get_yhats(y, y_hat, fpr_goals) + + # PLOT + plot_correct_per_day(y, y_hats, dto, thresholds, name=name, savepath=savepath) + plot_conf_mats(y, y_hats, hospital, name, savepath=savepath) + # plot_dists(x, y, thresholds, savepath, + # auc=auc, histogram=True, kde=False) + + + #### Decision boundaries + # plot_dbs(x, df.y, df.y_hat, ys) + + + # plt.show() + +### GENERAL PLOTS + +with open(path+'results.txt', 'r') as f: + lines = f.readlines() + +scores = [] +errors = [] +names = [] +for l in lines[2:]: + l = l.split(' ') + scores += [float(l[4])] + errors += [float(l[6][:-1])] + names += [l[2]] +print('done') + +fset_order = ['pm', 'cp', 'lab', 'pmcp', 'all']#, 'k10'] +idx = [names.index(i) for i in fset_order] +scores = [scores[i] for i in idx] +errors = [errors[i] for i in idx] +names = [feature_sets[names[i]] for i in idx] +n_bars = list(range(len(scores))) + +fig, ax = plt.subplots(figsize=(9,5)) +ax.bar(n_bars, scores, yerr=errors) +ax.set_ylim(0, 1) +ax.set_xticks(n_bars) +ax.set_xticklabels(names, fontsize=6) +ax.set_ylabel('ROC AUC') +ax.set_title('Performance\nError = 95% CI') +fig.savefig(path+'AUC_performance_per_featureset', dpi=DPI) + +plt.show() + +# def plot_correct_per_day(y, y_hat, dto, thresholds, name='', savepath='./'): +# fig, ax = plt.subplots(1, y_hat.shape[1], figsize=(16, 9)) +# for i, col in enumerate(y_hat.columns): +# y = pd.Series(0, index=y_hat.index) +# y[y_hat[col].isin([1, 2])] = 1 +# can_use = dto.notna() & (dto <= 21) & (dto >= 0) + +# pos = pd.Series(dto[(y==1) & can_use].value_counts().sort_index(), +# index=list(range(0, 22))).fillna(0) +# neg = pd.Series(dto[(y==0) & can_use].value_counts().sort_index(), +# index=list(range(0, 22))).fillna(0) +# rel = pos / (pos + neg) + +# df = pd.concat([pos, neg, rel], axis=1) +# df.columns = ['Correct', 'Wrong', 'Relative'] + +# df.iloc[:, 0:2].plot.bar(ax=ax[i]) +# ax2 = ax[i].twinx() +# df.iloc[:, -1].plot.bar(ax=ax2, color='g', alpha=0.2) + +# ax[i].set_xlabel('Day') +# if 
i==0: +# ax[i].set_ylabel('count') +# if i==(y_hat.columns.size-1): +# ax2.set_ylabel('Relative (P/(P+N))') +# ax[i].set_title('Thr: {:.3f}'.format(thresholds[i])) +# fig.suptitle('Prediction per day - {}'.format(name)) +# fig.savefig(savepath+'prediction_per_day_{}'.format(name), dpi=DPI) diff --git a/get_feature_set.py b/get_feature_set.py index 12d6801..a1c3b8f 100644 --- a/get_feature_set.py +++ b/get_feature_set.py @@ -21,7 +21,8 @@ def get_3_laboratory_radiology_findings(): 'Lactate_2_1','Creatinine_value_1','Sodium_1_1','Potassium_1_1', 'Calcium_adm','crp_1_1','Blood_albumin_value_1','creatininekinase', 'LDH','d_dimer','ferritine_admin_spec','fibrinogen_admin', - 'oxygentherapy_1','fio2_1','SaO2_1','PaO2_1','PCO2_1','PH_value_1', + 'oxygentherapy_1','fio2_1','SaO2_1','PaO2_1_Arterial', 'PaO2_1_Venous', + 'PaO2_1_Capillary','PCO2_1','PH_value_1', 'Influenza','Coronavirus','Adenovirus','Bacteria','culture', 'infec_resp_diagnosis','infiltrates_2','CT_thorax_performed', 'corads_admission'] @@ -52,7 +53,8 @@ def get_6_all(): 'Lactate_2_1','Creatinine_value_1','Sodium_1_1','Potassium_1_1', 'Calcium_adm','crp_1_1','Blood_albumin_value_1','creatininekinase', 'LDH','d_dimer','ferritine_admin_spec','fibrinogen_admin', - 'oxygentherapy_1','fio2_1','SaO2_1','PaO2_1','PCO2_1','PH_value_1', + 'oxygentherapy_1','fio2_1','SaO2_1','PaO2_1_Arterial', 'PaO2_1_Venous', + 'PaO2_1_Capillary','PCO2_1','PH_value_1', 'Influenza','Coronavirus','RSV_','Adenovirus','Bacteria','culture', 'infec_resp_diagnosis','infiltrates_2','CT_thorax_performed', 'corads_admission', diff --git a/lr_coefs.npy b/lr_coefs.npy new file mode 100644 index 0000000000000000000000000000000000000000..98273b8dda9a09eb9dc446b781f7bd064364e773 GIT binary patch literal 1928 zcmdT@c}&x17%l=stQ>-bIYf&bg;9pZ$dDtiBNNnEj)5GRq8xQ-gf<0)r)Z1_>Yuzv==m7xJ3b~dlumtz ztq+h04*P9LgapFhtu45}%4q1p1Kc({1rz zo5R2SsGnn0^FQwIpGb@3wJX3hrGk#XB!IVDeoM9Dzfk3TZc4vwYMh5Cn|r0Mntfq<-T*^paQmHqpbxv2^j1c56@B%RXsjFhG$!u-%wK~xE%$wT}P 
zJ$Z;45LKkb#=xIqo8ey1gotMvS-h6QC6NIjq@zLbBMU64$g{U|ev01s*V7}Q;X=x|`O?tW8WsTG5OJBS> zZk3MuXC21EMSkd~``ZG#4&H%|?i8Z>EEkG)eH1HT+=J^w7AiVx? zz)V9c3b5n~ho!Zsi|!gOcHskOh(4Vmxq)Uf^%=vS$&h1kXe5X}H=T8%0PdQY z*;N=~@WcLR!xt+F=+PvS8Oxr6OV-muMokMaO?t=nrp`nB#8JLjT8W~^46AmV-$A-> zvd*SfNl@Kh#c=2OHuPi18D=k`8C)>>(kQD4Qq)~$Lun-OE?4VTSYt~lZ%c23y=B+-u(E38nsl zG+P6?vI^S3^Fi>}uk^97mBNHj^D*!HPH?=gmM@$_=!M3i@_n>=l^@I5b&&N*;fDP8 e|Dx>5to6<+DOgbw+v9gKRejfmdi^}*5c~luJ0@EI literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt index b073469..f5d09ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ pandas==0.25.2 progressbar2==3.50.1 sklearn==0.0 requests==2.22.0 -castorapi==0.1.3 +castorapi==0.2 +xgboost==1.0.2 +missingpy=0.2.0 \ No newline at end of file diff --git a/results/loho_pm_class_dist_per_hosp.txt b/results/loho_pm_class_dist_per_hosp.txt new file mode 100644 index 0000000..2960dd5 --- /dev/null +++ b/results/loho_pm_class_dist_per_hosp.txt @@ -0,0 +1,10 @@ +hospital 0 1 +Center 1 145 24 +Center 3 229 98 +Center 0 211 50 +Center 7 322 118 +Center com 105 24 +Center 5 315 86 +Center 4 97 16 +Center 2 106 12 +Center 6 237 88 diff --git a/results/loho_pm_n2283_y516.0_coefs.txt b/results/loho_pm_n2283_y516.0_coefs.txt new file mode 100644 index 0000000..232eb50 --- /dev/null +++ b/results/loho_pm_n2283_y516.0_coefs.txt @@ -0,0 +1,45 @@ +[[0.02724774 0.07796375 0.04514499 0.037072 0.03693948 0.04072643 + 0.04699144 0.03451858 0.03676676 0.04358888 0.03437743 0.05223805 + 0.04450634 0.03076349 0.02880035 0.04174154 0.03355281 0.02878598 + 0.04131289 0.04650693 0.02872644 0.04124029 0.03936394 0.03553464 + 0.04558886] + [0.02852098 0.11933233 0.02049731 0.03196372 0.0402501 0.04791315 + 0.04758906 0.0272247 0.03244538 0.05959017 0.03316517 0.05397624 + 0.04702694 0.04040785 0.03965527 0.02883011 0.03357292 0.03211337 + 0.03795148 0.02385984 0.03098355 0.03889681 0.02707311 0.02900842 + 0.0481519 ] + [0.03824867 0.20201536 0.03722147 0.02238339 0.03902486 
0.04041912 + 0.04165535 0.02382601 0.03506192 0.03819693 0.03310945 0.04155134 + 0.04301092 0.03435294 0.03162336 0.03340763 0.02047763 0.03296867 + 0.03788981 0.02431517 0.02674964 0.01537701 0.02278797 0. + 0.08432537] + [0.02365478 0.15148643 0.01284037 0.02902209 0.03568465 0.03697684 + 0.05815557 0.0291574 0.03424063 0.05271753 0.03415936 0.04179135 + 0.0447507 0.02028611 0.02205772 0.04093557 0.0316455 0.02927355 + 0.03154784 0.02852412 0.03397431 0.03592318 0.03366717 0.03448628 + 0.07304095] + [0.03153612 0.16012011 0.02650238 0.02958032 0.03428465 0.03534453 + 0.07054944 0.01924068 0.03674266 0.04099085 0.03405198 0.03728574 + 0.03685509 0.03123184 0.03651385 0.03296662 0.03361399 0.04126919 + 0.03322199 0.02767796 0.02392729 0.02306266 0.02242733 0.02182049 + 0.07918217] + [0.02297538 0.2050213 0.02518855 0.02765787 0.03315873 0.03289452 + 0.09503685 0. 0.03322658 0.02815784 0.04470608 0.03382082 + 0.03129571 0.02981318 0.03155567 0.03594936 0.05403385 0.0245353 + 0.03034715 0.02750624 0. 
0.0184159 0.02812885 0.02183791 + 0.0847363 ] + [0.03794999 0.11937173 0.02292331 0.03781984 0.03594646 0.03787136 + 0.04503115 0.02828448 0.0322755 0.04052298 0.0344321 0.04110741 + 0.04858724 0.04217534 0.03850076 0.03208326 0.03291799 0.03559599 + 0.03548445 0.03230241 0.03144418 0.03115782 0.03739164 0.02933442 + 0.05948817] + [0.03765591 0.1244607 0.02361755 0.03341233 0.03704211 0.03986889 + 0.05130611 0.0245237 0.0476805 0.04277378 0.03831241 0.03791314 + 0.04802676 0.03886366 0.03721868 0.02699121 0.03517297 0.03607577 + 0.03673963 0.02632892 0.03438185 0.02605204 0.0285626 0.02555271 + 0.06146609] + [0.03061492 0.08841494 0.03246649 0.03778347 0.03862301 0.04902279 + 0.04338187 0.03391563 0.03703794 0.05063674 0.0386482 0.04710913 + 0.04242903 0.03407801 0.0321296 0.03596683 0.02974905 0.02953567 + 0.04011565 0.03769885 0.02688617 0.03670201 0.03521456 0.04155528 + 0.05028412]] \ No newline at end of file diff --git a/unit_lookup.py b/unit_lookup.py index f8c3d21..5c861be 100644 --- a/unit_lookup.py +++ b/unit_lookup.py @@ -4,14 +4,14 @@ def get_unit_lookup_dict(): 'units_neutro': {1: 0.001, 2: -999.0, 3: 1.0 }, 'Platelets_unit_1': {1: 1.0, 2: 1.0 }, 'Total_Bilirubin_1_1': {1: 1.0, 2: 17.1 }, - 'Haemoglobin_unit_1': {1: 1.6114, 2: 16.1140, 3: 1 }, + 'Haemoglobin_unit_1': {1: 1.6114, 2: 16.1140, 3: 1.0 }, 'Glucose_unit_2': {1: 1.0, 2: 0.0555 }, 'Blood_Urea_Nitrogen_unit_1': {1: 1.0, 2: 0.3571 }, 'Lactate_1_1': {1: 1.0, 2: 0.1110 }, 'Creatinine_unit_1': {1: 1.0, 2: 88.4956 }, 'sodium_units': {1: 1.0, 2: 1.0 }, 'pot_units_1': {1: 1.0, 2: 1.0 }, - 'd_dimer_units_adm': {1: 1000.0, 2: 0.18, 3: 1 }, + 'd_dimer_units_adm': {1: 5.5556, 2: 0.0055, 3: 1.0 }, 'WBC_1_1': {1: 1.0, 2: 1.0 }, 'units_lymph': {1: 0.001, 2: -999.0, 3: 1.0 }, 'PCO2_unit_1': {1: 1.0, 2: 0.1333 }, @@ -28,8 +28,8 @@ def get_unit_lookup_dict(): 'Creatinine_unit': {1: 1.0, 2: 88.4956 }, 'sodium_units_1': {1: 1.0, 2: 1.0 }, 'pot_units_2': {1: 1.0, 2: 1.0 }, - 'Haemoglobin_unit': {1: 1.6114, 2: 
16.1140, 3: 1 }, - 'units_d_dimer': {1: 1000.0, 2: 0.18, 3: 1 }, + 'Haemoglobin_unit': {1: 1.6114, 2: 16.1140, 3: 1.0 }, + 'units_d_dimer': {1: 5.5556, 2: 0.0055, 3: 1.0 }, 'WBC_1': {1: 1.0, 2: 1.0 }} diff --git a/update_slack/covid19_update_slack_all.py b/update_slack/covid19_update_slack_all.py index f29de19..b777c8d 100755 --- a/update_slack/covid19_update_slack_all.py +++ b/update_slack/covid19_update_slack_all.py @@ -7,10 +7,9 @@ @author: wouterpotters """ -import time, statistics, os, site, sys -site.addsitedir('./../') # add directory to path to enable import of castor_api -from castor_api import Castor_api - +import time +import statistics +import castorapi as ca import configparser config = configparser.ConfigParser() config.read('../user_settings.ini') @@ -18,10 +17,11 @@ # put both the secret, client and the tokens_slack file here location_castor_slack_api_data = config['SlackAPI']['local_private_path'] -c = Castor_api(location_castor_slack_api_data) # e.g. in user dir outside of GIT repo +c = ca.CastorApi(location_castor_slack_api_data) # e.g. 
in user dir outside of GIT repo # get study ID for COVID study -study_id = c.request_study_id('COVID')[0] +study_id = c.select_study_by_name('COVID-19 NL') + # Posting to a Slack channel def send_message_to_slack(text): @@ -38,65 +38,91 @@ def send_message_to_slack(text): for t_url in tokens: req = request.Request(t_url, - data=json_data.encode('ascii'), - headers={'Content-Type': 'application/json'}) + data=json_data.encode('ascii'), + headers={'Content-Type': 'application/json'}) request.urlopen(req) except Exception as em: print("EXCEPTION: " + str(em)) + records = c.request_study_records(study_id) -count = len([x['_embedded']['institute']['name'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) +count = len([x['_embedded']['institute']['name'] for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False]) first_update = True while True: try: # renew session with api to avoid timeout errors - c = Castor_api(location_castor_slack_api_data) # e.g. in user dir outside of GIT repo + c = ca.CastorApi(location_castor_slack_api_data) # e.g. 
in user dir outside of GIT repo records = c.request_study_records(study_id) institutes = [x['_embedded']['institute']['name'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False] institutesUnique = [] for inst in institutes: if inst not in institutesUnique: institutesUnique.append(inst) - + count_sub = {} completion_rate_sub = {} completion_rate_100_sub = {} completion_rate_90_sub = {} completion_rate_0_sub = {} - completion_rate_avg_sub = {} + completion_rate_avg_sub = {} sub_messages = {} for inst in institutesUnique: - count_sub[inst] = (len([x['_embedded']['institute']['name'] for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False])) - completion_rate_sub[inst] = [x['progress'] for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False] - completion_rate_100_sub[inst] = sum([x['progress']==100 for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) - completion_rate_90_sub[inst] = sum([x['progress']>90 for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) - completion_rate_0_sub[inst] = sum([x['progress']==0 for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) - completion_rate_avg_sub[inst] = statistics.mean([x['progress'] for x in records if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) - sub_messages[inst] = str(count_sub[inst]) + ' (avg completion rate: ' + str(round(completion_rate_avg_sub[inst],2)) + '%)' - - count_nw = (len([x['_embedded']['institute']['name'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False])) - completion_rate = [x['progress'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False] - completion_rate_100 = sum([x['progress']==100 for x in records if 
x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) - completion_rate_90 = sum([x['progress']>90 for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) - completion_rate_0 = sum([x['progress']==0 for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) - completion_rate_avg = statistics.mean([x['progress'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' and x['archived'] == False]) + count_sub[inst] = (len([x['_embedded']['institute']['name'] for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False])) + completion_rate_sub[inst] = [x['progress'] for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False] + completion_rate_100_sub[inst] = sum([x['progress']==100 for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) + completion_rate_90_sub[inst] = sum([x['progress']>90 for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) + completion_rate_0_sub[inst] = sum([x['progress']==0 for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) + completion_rate_avg_sub[inst] = statistics.mean([x['progress'] for x in records + if x['_embedded']['institute']['name'] == inst and x['archived'] == False]) + sub_messages[inst] = str(count_sub[inst]) + ' (Castor completion rate: ' + \ + str(int(completion_rate_avg_sub[inst])) + '%)' + + count_nw = (len([x['_embedded']['institute']['name'] for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] is False])) + completion_rate = [x['progress'] for x in records if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False] + completion_rate_100 = sum([x['progress']==100 for x in records + if x['_embedded']['institute']['name'] != 
'Test Institute' + and x['archived'] == False]) + completion_rate_90 = sum([x['progress']>90 for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False]) + completion_rate_0 = sum([x['progress']==0 for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False]) + completion_rate_avg = statistics.mean([x['progress'] for x in records + if x['_embedded']['institute']['name'] != 'Test Institute' + and x['archived'] == False]) if count_nw - count >= 5 or first_update: count = count_nw - message = 'Total number of inclusions is now: ' + str(count) +' records'+\ - '\nTotal number of institutions with data: '+ str(len(institutesUnique)) +\ - '\n - 100% completed: ' + str(completion_rate_100)+'/'+ str(count) +\ - '\n - >90% completed: ' + str(completion_rate_90)+'/'+ str(count) +\ - '\n - 0% completed: ' + str(completion_rate_0)+'/'+ str(count) +\ - '\n - average completion: ' + str(round(completion_rate_avg,2))+'% (n='+ str(count) +')\n' - for c in sub_messages: - message = message + '\n>' + c + ': ' + (max([len(x) for x in institutesUnique])-len(c))*' ' + sub_messages[c] - + message = '```Total # of inclusions : ' + str(count) + ' records' + \ + '\nTotal # of institutions: ' + str(len(institutesUnique)) +\ + '\n- > 90% completed: ' + str(completion_rate_90) + '/' + str(count) + \ + '\n- 0% completed: ' + str(completion_rate_0) + '/' + str(count) + \ + '\n- avg completion rate: ' + str(int(completion_rate_avg)) + '%\n' + sorted_subs = sorted(sub_messages, key=count_sub.get, reverse=True) + for c in sorted_subs: + n = 0 + if (count_sub[c]) < 10: + n += 1 + if (count_sub[c]) < 100: + n += 1 + message = message + '\n> ' + c + ': ' + (max([len(x) for x in institutesUnique])-len(c)+n)*' ' + sub_messages[c] + message += '\n\nNB: Castor completion rates can be unreliable as these are only updated after records are opened/edited.```' send_message_to_slack(message) print(message) - + 
first_update = False - + else: print('No new entries found; try again in 5 minutes') time.sleep(60*5) # 5 minute updates, only if count increases by 10 or more. @@ -106,4 +132,3 @@ def send_message_to_slack(text): time.sleep(60) # in case of errors; wait 1 minute and try again. - \ No newline at end of file diff --git a/xgb_coefs.npy b/xgb_coefs.npy new file mode 100644 index 0000000000000000000000000000000000000000..05cb0e6153998c55ee7b1e1489e79027835d83cf GIT binary patch literal 1028 zcmbV~{ZkVJ9L5=2Fkl?a#Woo*6mT$y<4qXc_Zbs>36D(yDZG=E8+h}jNgH(WQX0~M z<`toEMu8nsT7X`(1tca21e!09VQ{dxD9AMik<$uD!~TRmzdfJdp68y^noRZ1-3+H! zoJu6Qc}A^4k`O4N{V@_*pk)6MLxCYjzxRkCH}9$6nseBgx8jYuocz3%92zeRjM^9` z3w$l`|Kffjc9II4&*6bpf!m`3jH?Cc8CnNnu?l;%p169J57o&iWZmJQ`p-wUR1ew66c+~1*Of45*yxNc6h5>D{inFlUQWRGO;#$)bzsSQp0u;h zNU;Q?MztB;vq{KsnIz*h3*S9kfS6HNMDnwdc20#a%0lq&OKY(ve3p8SyFqh;31_1S z=E7c54F5yP>0;;yEyVrEPWFT?X!^_#{uYh$twJGkvfJszs2JsMhu~I&6nz_dNl@s`?mDTP87ffh3d$0``ezsH8D`#ee9U9me#g!dx`h`V;2ew;6%k=C!sX@!Bd z7PJiXP?A`rJg_gFzP@TFu5=aHg#uK@=utL4M>B~WJU{mWF8$^KUHEz&PmP0@{1ei) znaHa}f-f5w@VPTj8U+KoW-EOh>Vb%09%^RXP`b$zM{8@qk}**6C`ef@8l(L5KgdhZ zfxh1x38htNY^bNAqkKFwBY~jU1M?k1y!en0>(~=I^^Omm*{-;I(G62>VYtp?};oBy4xsO+6YT*7l{_ql<`@&skeHH3Wi;w>vF-x#%p-m^*tdQTbpO7 z(?1TT3U|!Qo$yV9gUVG5G$ry-m!6HJ$AfgH?l0=y_0xa*EKJl=v3`aQPPb9~$yOL% z7GZsn0~fAerp5cdI7>V*#3Ixh6qqpmN@>w799{}R3)>Tk`$BPuxec6d85Z08apeFL z%n=u?zQsjI78jK}Soq*uC$uV`(4u5Du4sOz$xn^&fiG0kk5OCHLwm>BDC!iTN*#nf z>(VhdJxCv>CZj4^j$*qXhPJPKr+qVy$;D_e2+`_ip-Qed(hszJmjmIU z5fUw1X*Oe+CVNW|baRoWSD8_>#KOI6zK})-Kz#E#$Ul!nfXy93vjDH^cHp!+0B<~0 w!TUxyv^5^ctYgBF#zTtf0o`yckY;BvL Date: Tue, 16 Jun 2020 12:35:27 +0200 Subject: [PATCH 2/2] fix path --- covid19_ICU_admission.py | 4 +- results/loho_pm_n2283_y516.0_coefs.txt | 90 ++++++++++++------------- xgb_coefs.npy | Bin 1028 -> 1028 bytes 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/covid19_ICU_admission.py b/covid19_ICU_admission.py index f7ec90a..3b36bd0 100644 --- a/covid19_ICU_admission.py +++ 
b/covid19_ICU_admission.py @@ -418,10 +418,10 @@ def run(data, data_struct, goal, variables_to_include, variables_to_exclude, test_set = pd.DataFrame(test_set,columns=x.columns) f = plt.figure() shap.summary_plot(shap_values, test_set) - f.savefig(os.path.join( model.save_path ,"summary_plot1.png"), bbox_inches='tight', dpi=600) + f.savefig((model.save + "_summary_plot1.png"), bbox_inches='tight', dpi=600) f = plt.figure() shap.summary_plot(shap_values, test_set, plot_type="bar") - f.savefig(os.path.join( model.save_path ,"summary_plot2.png"), bbox_inches='tight', dpi=600) + f.savefig((model.save_path + "_summary_plot2.png"), bbox_inches='tight', dpi=600) if not save_figures: plt.show() diff --git a/results/loho_pm_n2283_y516.0_coefs.txt b/results/loho_pm_n2283_y516.0_coefs.txt index 232eb50..6ac06c8 100644 --- a/results/loho_pm_n2283_y516.0_coefs.txt +++ b/results/loho_pm_n2283_y516.0_coefs.txt @@ -1,45 +1,45 @@ -[[0.02724774 0.07796375 0.04514499 0.037072 0.03693948 0.04072643 - 0.04699144 0.03451858 0.03676676 0.04358888 0.03437743 0.05223805 - 0.04450634 0.03076349 0.02880035 0.04174154 0.03355281 0.02878598 - 0.04131289 0.04650693 0.02872644 0.04124029 0.03936394 0.03553464 - 0.04558886] - [0.02852098 0.11933233 0.02049731 0.03196372 0.0402501 0.04791315 - 0.04758906 0.0272247 0.03244538 0.05959017 0.03316517 0.05397624 - 0.04702694 0.04040785 0.03965527 0.02883011 0.03357292 0.03211337 - 0.03795148 0.02385984 0.03098355 0.03889681 0.02707311 0.02900842 - 0.0481519 ] - [0.03824867 0.20201536 0.03722147 0.02238339 0.03902486 0.04041912 - 0.04165535 0.02382601 0.03506192 0.03819693 0.03310945 0.04155134 - 0.04301092 0.03435294 0.03162336 0.03340763 0.02047763 0.03296867 - 0.03788981 0.02431517 0.02674964 0.01537701 0.02278797 0. 
- 0.08432537] - [0.02365478 0.15148643 0.01284037 0.02902209 0.03568465 0.03697684 - 0.05815557 0.0291574 0.03424063 0.05271753 0.03415936 0.04179135 - 0.0447507 0.02028611 0.02205772 0.04093557 0.0316455 0.02927355 - 0.03154784 0.02852412 0.03397431 0.03592318 0.03366717 0.03448628 - 0.07304095] - [0.03153612 0.16012011 0.02650238 0.02958032 0.03428465 0.03534453 - 0.07054944 0.01924068 0.03674266 0.04099085 0.03405198 0.03728574 - 0.03685509 0.03123184 0.03651385 0.03296662 0.03361399 0.04126919 - 0.03322199 0.02767796 0.02392729 0.02306266 0.02242733 0.02182049 - 0.07918217] - [0.02297538 0.2050213 0.02518855 0.02765787 0.03315873 0.03289452 - 0.09503685 0. 0.03322658 0.02815784 0.04470608 0.03382082 - 0.03129571 0.02981318 0.03155567 0.03594936 0.05403385 0.0245353 - 0.03034715 0.02750624 0. 0.0184159 0.02812885 0.02183791 - 0.0847363 ] - [0.03794999 0.11937173 0.02292331 0.03781984 0.03594646 0.03787136 - 0.04503115 0.02828448 0.0322755 0.04052298 0.0344321 0.04110741 - 0.04858724 0.04217534 0.03850076 0.03208326 0.03291799 0.03559599 - 0.03548445 0.03230241 0.03144418 0.03115782 0.03739164 0.02933442 - 0.05948817] - [0.03765591 0.1244607 0.02361755 0.03341233 0.03704211 0.03986889 - 0.05130611 0.0245237 0.0476805 0.04277378 0.03831241 0.03791314 - 0.04802676 0.03886366 0.03721868 0.02699121 0.03517297 0.03607577 - 0.03673963 0.02632892 0.03438185 0.02605204 0.0285626 0.02555271 - 0.06146609] - [0.03061492 0.08841494 0.03246649 0.03778347 0.03862301 0.04902279 - 0.04338187 0.03391563 0.03703794 0.05063674 0.0386482 0.04710913 - 0.04242903 0.03407801 0.0321296 0.03596683 0.02974905 0.02953567 - 0.04011565 0.03769885 0.02688617 0.03670201 0.03521456 0.04155528 - 0.05028412]] \ No newline at end of file +[[0.02900513 0.10485817 0.0189172 0.03721691 0.03553807 0.04540571 + 0.04054026 0.03142751 0.03802205 0.04737291 0.03735742 0.04617571 + 0.04504537 0.03630833 0.03214922 0.04576809 0.03009612 0.02972055 + 0.04109957 0.03762121 0.02534612 0.03589006 0.03655426 
0.03809978 + 0.05446427] + [0.03371864 0.07376102 0.03255043 0.03968529 0.03761422 0.04802226 + 0.04134564 0.03310054 0.03776442 0.05359318 0.03574292 0.04927718 + 0.04543485 0.03519416 0.03937469 0.03208647 0.03053788 0.03389869 + 0.04050232 0.03536687 0.03212772 0.04184384 0.03644386 0.03788628 + 0.04312659] + [0.03507941 0.15744089 0.02616969 0.02849024 0.03828554 0.04440631 + 0.05145083 0.02374118 0.03698253 0.03523904 0.03241921 0.04719491 + 0.03995088 0.03506144 0.04007663 0.02934017 0.03156112 0.03145391 + 0.03467921 0.02535327 0.02256914 0.02781971 0.02548802 0.03285579 + 0.06689094] + [0.03372161 0.08439521 0.0289013 0.04076211 0.03650614 0.05089137 + 0.03374405 0.04437228 0.03895336 0.05374929 0.03740871 0.04490269 + 0.04559904 0.02854626 0.03117101 0.03326504 0.02982679 0.03199616 + 0.0395859 0.02939419 0.02762785 0.04320026 0.04013652 0.04495482 + 0.04638802] + [0.02926653 0.24708247 0. 0.02734006 0.03898028 0.03827675 + 0.08777084 0. 0.0423913 0.0383439 0.02220213 0.04523094 + 0.05577661 0.04846299 0. 0.02791958 0. 0.02541763 + 0.03712561 0.02293024 0.02168907 0.01740101 0.03307582 0. + 0.0933162 ] + [0.029925 0.20497677 0. 
0.02398588 0.03220888 0.03252695 + 0.08346679 0.01035944 0.03922718 0.02866674 0.03422621 0.03407473 + 0.03592969 0.03183847 0.02384912 0.02781185 0.04592736 0.04671792 + 0.03090115 0.0223245 0.01321638 0.01738103 0.02445605 0.027792 + 0.09820986] + [0.03334145 0.12808916 0.0167826 0.03656108 0.03405709 0.0379472 + 0.05089486 0.02370403 0.0318248 0.03878355 0.03259754 0.0418744 + 0.04873329 0.04059746 0.04309903 0.0293893 0.03169436 0.04251581 + 0.03647418 0.02938977 0.02605443 0.02558089 0.03959654 0.03090884 + 0.06950831] + [0.03093497 0.06281067 0.04603253 0.03931557 0.0433754 0.04624838 + 0.04940653 0.03764849 0.04165215 0.05599385 0.03610231 0.03991234 + 0.0440371 0.03005461 0.03021684 0.03320304 0.03827928 0.03343385 + 0.03905055 0.03715051 0.03372669 0.03598797 0.03964458 0.03446046 + 0.04132135] + [0.02725543 0.10402164 0.03387656 0.03820056 0.04408296 0.0424456 + 0.04272338 0.03363778 0.03688686 0.0500395 0.03885584 0.04751011 + 0.04281425 0.03273204 0.02841813 0.03399113 0.03231791 0.02896029 + 0.04179978 0.03131918 0.02726615 0.03499264 0.03622282 0.03890293 + 0.05072653]] \ No newline at end of file diff --git a/xgb_coefs.npy b/xgb_coefs.npy index 05cb0e6153998c55ee7b1e1489e79027835d83cf..170c305cff601b26af7af71531ab48c9bd272e88 100644 GIT binary patch delta 914 zcmW;CeM}St9L90r4mgf?@dA#Ei{s7nl9vK`@%KHDF_dB|k%=cSX`qplD3+L(2Mwl7 zM95y$s6Vh8I)Yi2Z%gEAh(rN7C}v53x-cjdB4+`K&!6AtQ>m&{)s?NG`)6&$~Bd^)}Fn&%$m4}WAO+K|cHA=7dIhy5&+d%_)-dg59PsQD0wDR&h&RlQN zQdPN$$G0tFXds;%{)t=)_abLi5ZWCI-d%cMHV&$>-3(>$mk0fbf>#Sn7}GR-8WF%- zpG2{v)sx8hAal>=0m&&ClZ-wcFO0^bz4#2jzG0Ku-M;ib=Y>O;lG3#+sXiXedkt1k z3d2MA&N3;IX3vlPc67OV&~&Ot#zxMH>DH*2yL+YUuoHoE`)Ssz+4@s6MYeG<&4f|5 zPs8h_>xhX{bGj{-+JkzQFUesz!Jm;o;_&Q9WN>gyES7m0Pgaq+VOokj6kH84py{5J zZfn+v*zQL0#MTgM4DsaJo3UK_OQIBSn(ZDe<=&87IiW|>spHA^FIZ^S#${MN9z}+~ z6isRZ%AExSRir)qj-0t63+ChaUUByJBl_nHlJ?mtdAp`oDt9|SH1)bRR)_vA#Wyp} z)&;EE`>pJpP;fudk>ismX*zF|mgdK(7`QE$qujVt?~2mogL{z!EAqdQ*8WjhA1FM% 
zDT(bNvvPg-E2*l^mJ?fQ#Vfm0+?q!v*>DB7dKEty73QWr)#B5xq15WizTyB3W$T!Z zu*o@#gO&1Yf!I|!Vy!fBs9_aV>0x}hKZ>jElQPw3k7sBE(}fy(Y?Iop7OUIvgRFqBW2{QRHAEHM8q7OM+47?QP&f#PuU1>v-;nGj2O-=MJ z9qa32D7m(bGjkDiTy~-55ntkFGRO$mGjlYB`P@iUi<}7l{&z`Fbfq(2NsxI$l9!t} au-cB)+#xA>)|1Vyd5dYy*7H%vTK)%*JW=KV delta 914 zcmW-f{ZkVJ9ETZOY-8hKF1E>lodSXr9Eic>zMrwcUhrVX(4spryY&NV?TnYZK&}z4 z1I;TU#~B57L}{_ft}%$5RzRBd{0Gl-kC9{Kv^E+}<#?#J z_XO_xmAE!3#&n4o&arin)T^;e7lrf3MNl8vg!1crwEejgMa4PjXRc7+e1JCoEr&(V zMcY__Y`N5pVECv2y0dw>)#In8S3+3!pCsK}IEviKFsZjgcnvmBvXLdKK(CVnv6Me9^yiCn2GReDeSc_QjY&ag<2W( zqizy>8l+&}Q|S3J9tm!Z>YZ8%Uash)!;>-?-%Y{QE;)uax=7p^4P8w9Ha@$w-ok>5vv*I5hoSMq^iAp zwDjHiAPMBFz^N6ZWwRbl)AQuZ=i~YL7jWh`9x5|7;81Zcq7|Q$&T6A*b1J^+Vj%YV z0%?>CRQCGmGea5=naM(!d|Wg<83jYT39NJmn(rs6jM6DOp#6iQ^?c}uW02R-itesX zsxyf2m@gILdL9<~CD?sW1pm}SI{IEL!Z~c*Jk7-{cRel%Ir!NzOLFEA-3dDA?SVe} zWzb56@@}|u93)*VRT;LFUn5uBEY*y&QR#@l>Fx{o&-OzJ2T%9h(U_=e5$7BNkM>F2#nr5Kdh>OG|g+ zaGZo-$fRhmQewvTE0t!m40v@p1!hhZ^7o`+Kl2&z2h*`+O~ARmOfV-Ru==V1DdhsR zY-i!)@55kGJ*1`7)i|g5ojhMuBNlN`dmo^^&Pgv#b5J)RMr%nDcCFK5-aAU46c?g3 zTY>suJjS*?`loLz8Wl2hRY_opH@K-q5Cg@K5Zf9=aGv>rR&Mek**`(j6+g|Djg!Z9 z5J^{-$h*o8(=rRUFT_Efoe0^LXQB8i3yFb9NbF*~QMnx-lqBM