-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathSurvivalAnalysisUtils.py
168 lines (118 loc) · 6.45 KB
/
SurvivalAnalysisUtils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter
class KaplanMeierUtils():
    """
    Reusable plotting utilities for Kaplan-Meier survival analysis.

    Each public method fits one lifelines ``KaplanMeierFitter`` per subgroup
    of a feature and draws the resulting survival curves, one subplot per
    feature.
    """

    # Subplots are laid out on a grid with this many columns; rows are added
    # as needed so that every requested feature gets its own axes.
    _GRID_COLUMNS = 3

    # Positional binning specs used when the caller supplies none. Entry i is
    # passed as ``bins`` to ``pd.cut`` for the i-th continuous column.
    # NOTE(review): the explicit edge lists look tuned for one particular
    # dataset and column order -- confirm against the calling notebook.
    _DEFAULT_SPLIT_POINTS = (
        [39.0, 60.0, 80.0, 100.0], 3, [0, 30.0, 45.0, 100.0], 3, 3, 3, 3,
    )

    # Number of equal-width bins used for a continuous column that has no
    # positional entry in the split-point list.
    _FALLBACK_BINS = 3

    def __init__(self):
        pass

    @staticmethod
    def _grid_rows(n_plots, n_columns=3):
        '''
        Purpose:
            Computes how many subplot rows are needed for n_plots plots.
        Parameters:
            1. n_plots: the number of plots to place on the grid.
            2. n_columns: the number of subplot columns per row.
        Return Value:
            The row count (ceiling of n_plots / n_columns), never below 1 so
            that an empty request still yields a valid grid.
        '''
        # -(-a // b) is integer ceiling division without importing math.
        return max(1, -(-n_plots // n_columns))

    @staticmethod
    def _require_columns(data, columns):
        '''
        Purpose:
            Fails fast, before any figure is created, when the dataset is
            missing or lacks a requested column.
        Parameters:
            1. data: the dataset.
            2. columns: the column names that must be present in data.
        Return Value:
            NONE.
        Raises:
            ValueError: if data is None.
            KeyError: if any name in columns is not a column of data.
        '''
        if data is None:
            raise ValueError("data must be a pandas DataFrame, not None")
        missing = [name for name in columns if name not in data.columns]
        if missing:
            raise KeyError(f"columns not found in data: {missing}")

    @staticmethod
    def _categorical_groups(data, feature, duration_col='time', event_col='DEATH_EVENT'):
        '''
        Purpose:
            Splits the dataset into one subgroup per distinct value of a
            categorical feature.
        Parameters:
            1. data: the dataset.
            2. feature: the categorical column to group by.
            3. duration_col: the column holding the observed durations.
            4. event_col: the column holding the event indicator.
        Return Value:
            A list of (label, durations, events) tuples, ordered by the
            sorted distinct feature values. Rows whose feature value is
            missing are skipped.
        '''
        groups = []
        # Missing values are dropped first: NaN never compares equal to
        # itself, so it would otherwise produce an empty subgroup.
        for value in np.unique(data[feature].dropna()):
            subset = data[data[feature] == value]
            groups.append((f"{feature}: {value!s}", subset[duration_col], subset[event_col]))
        return groups

    @staticmethod
    def _binned_groups(data, feature, split_points, duration_col='time', event_col='DEATH_EVENT'):
        '''
        Purpose:
            Splits the dataset into one subgroup per bin of a continuous
            feature, without modifying the dataset.
        Parameters:
            1. data: the dataset.
            2. feature: the continuous column to bin.
            3. split_points: the bins argument handed to pd.cut (either a
               number of bins or a sequence of bin edges).
            4. duration_col: the column holding the observed durations.
            5. event_col: the column holding the event indicator.
        Return Value:
            A list of (label, durations, events) tuples in bin order. Bins
            that contain no rows are omitted, and rows that fall outside
            every bin are skipped.
        '''
        bins = pd.cut(x=data[feature], bins=split_points)
        groups = []
        for interval in bins.cat.categories:
            members = bins == interval
            if not members.any():
                # An empty bin has nothing to fit a curve on.
                continue
            # "(a, b]" is rendered as "(a - b)" for the plot legend.
            label = str(interval).replace(',', ' -').replace(']', ')')
            groups.append((label, data.loc[members, duration_col], data.loc[members, event_col]))
        return groups

    @classmethod
    def _plot_grid(cls, grouped_features, suptitle, **suptitle_kwargs):
        '''
        Purpose:
            Fits and draws the Kaplan-Meier curves for every feature on a
            grid of subplots, then shows the figure.
        Parameters:
            1. grouped_features: a list of (feature, groups) pairs, where
               groups is a list of (label, durations, events) tuples.
            2. suptitle: the figure title.
            3. suptitle_kwargs: extra keyword arguments for fig.suptitle.
        Return Value:
            NONE.
        '''
        n_rows = cls._grid_rows(len(grouped_features), cls._GRID_COLUMNS)
        # squeeze=False keeps the axes two-dimensional even for one row.
        fig, grid = plt.subplots(nrows=n_rows, ncols=cls._GRID_COLUMNS,
                                 figsize=(18, 5 * n_rows), squeeze=False)
        plt.tight_layout(pad=5.0)
        axes = grid.flatten()

        for axis, (feature, groups) in zip(axes, grouped_features):
            for label, durations, events in groups:
                fit = KaplanMeierFitter().fit(durations=durations,
                                              event_observed=events,
                                              label=label)
                fit.plot(title=feature, ylabel="Survival Probability", xlabel="Days",
                         ylim=(0, 1.1), xlim=(0, 300),
                         ci_alpha=0.1, ax=axis)

        # Hide only the axes that received no feature, so that a completely
        # filled grid keeps every plot visible.
        for unused in axes[len(grouped_features):]:
            unused.set_visible(False)

        fig.suptitle(suptitle, fontsize=16.0, **suptitle_kwargs)
        plt.show()

    def PlotKaplanMeierEstimatesForCategoricalVariables(self, data=None, categorical_columns=None,
                                                        *, duration_col='time',
                                                        event_col='DEATH_EVENT'):
        '''
        Purpose:
            Plots the Kaplan Meier Estimates For Categorical Variables in the data,
            one subplot per feature and one curve per distinct feature value.

            **NOTE: The Kaplan-Meier estimator is used to estimate the survival function.
            The visual representation of this function is usually called the Kaplan-Meier
            curve, and it shows what the probability of an event (for example, survival)
            is at a certain time interval. If the sample size is large enough, the curve
            should approach the true survival function for the population under
            investigation.
        Parameters:
            1. data: the dataset.
            2. categorical_columns: all the categorical data features as a list
               (None is treated as an empty list).
            3. duration_col: the column holding the observed durations.
            4. event_col: the column holding the event indicator.
        Return Value:
            NONE.
        Raises:
            ValueError: if data is None.
            KeyError: if a requested column is not present in data.
        Reference: https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#lifelines.fitters.kaplan_meier_fitter.KaplanMeierFitter
        '''
        columns = [] if categorical_columns is None else list(categorical_columns)
        # The duration/event columns are only read when there is a feature.
        required = columns + [duration_col, event_col] if columns else []
        KaplanMeierUtils._require_columns(data, required)

        grouped = [
            (feature,
             KaplanMeierUtils._categorical_groups(data, feature, duration_col, event_col))
            for feature in columns
        ]
        KaplanMeierUtils._plot_grid(
            grouped, "Kaplan Meier Estimates for Categorical Variables ")
        return None

    def PlotKaplanMeierEstimatesForContinuousVariables(self, data=None, continuous_columns=None,
                                                       *, split_points=None,
                                                       duration_col='time',
                                                       event_col='DEATH_EVENT'):
        '''
        Purpose:
            Plots the Kaplan Meier Estimates For Continuous Variables in the data,
            one subplot per feature and one curve per bin of that feature.

            **NOTE: The Kaplan-Meier estimator is used to estimate the survival function.
            The visual representation of this function is usually called the Kaplan-Meier
            curve, and it shows what the probability of an event (for example, survival)
            is at a certain time interval. If the sample size is large enough, the curve
            should approach the true survival function for the population under
            investigation.
        Parameters:
            1. data: the dataset. It is not modified.
            2. continuous_columns: all the continuous data features as a list
               (None is treated as an empty list).
            3. split_points: optional list giving, per position in
               continuous_columns, the bins argument for pd.cut. When None the
               built-in list is used; columns beyond the end of the list are
               cut into 3 equal-width bins.
            4. duration_col: the column holding the observed durations.
            5. event_col: the column holding the event indicator.
        Return Value:
            NONE.
        Raises:
            ValueError: if data is None.
            KeyError: if a requested column is not present in data.
        Reference:
            https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#lifelines.fitters.kaplan_meier_fitter.KaplanMeierFitter
        '''
        columns = [] if continuous_columns is None else list(continuous_columns)
        # The duration/event columns are only read when there is a feature.
        required = columns + [duration_col, event_col] if columns else []
        KaplanMeierUtils._require_columns(data, required)

        if split_points is None:
            specs = list(KaplanMeierUtils._DEFAULT_SPLIT_POINTS)
        else:
            specs = list(split_points)

        grouped = []
        for idx, feature in enumerate(columns):
            # Fall back to equal-width bins instead of raising IndexError
            # when there are more columns than binning specs.
            spec = specs[idx] if idx < len(specs) else KaplanMeierUtils._FALLBACK_BINS
            grouped.append(
                (feature,
                 KaplanMeierUtils._binned_groups(data, feature, spec, duration_col, event_col)))

        KaplanMeierUtils._plot_grid(
            grouped, "Kaplan Meier Estimates for Continuous Variables ", y=1.0)