-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathConditionalSurvivalUtils.py
More file actions
275 lines (239 loc) · 7.84 KB
/
ConditionalSurvivalUtils.py
File metadata and controls
275 lines (239 loc) · 7.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 2 19:20:16 2018
@author: computer
"""
import pandas as pd
import numpy as np
from typing import Union
from lifelines import CoxPHFitter
from lifelines.utils import k_fold_cross_validation
import os
from os.path import join as opj
import sys
# Directory "<home>/Desktop/MiscSurvivalUtils". It is placed at the front of
# sys.path *before* the import below so that import can search it first.
# NOTE(review): presumably this is where DataCodingUtils.py lives -- confirm.
BASEPATH = opj(os.path.expanduser("~"), "Desktop", "MiscSurvivalUtils")
sys.path.insert(0, BASEPATH)
from DataCodingUtils import prep_data_for_conditional_survival
def conditional_cox(
    source_table,
    source_table_survival_column,
    source_table_event_column,
    time_passed,
    verbose=True,
):
    """
    Fit a Cox regression conditional on the patients having survived
    to a certain point in time.

    The table is first passed through ``prep_data_for_conditional_survival``
    and the Cox model is then fitted on whatever that helper returns.

    Arguments
    ---------
    source_table: pd.DataFrame
        pandas DF containing time to event, event, and variables as columns
        NOTE: all variables should be either continuous or dummy variables
    source_table_survival_column: str
        name of time to event column
    source_table_event_column: str
        name of "event" column -- 1 means dead, 0 means censored
    time_passed: Union[int,float]
        time that has passed since start of study period
    verbose: bool
        forwarded to ``CoxPHFitter.fit`` as ``show_progress``

    Returns
    -------
    CoxPHFitter
        the fitted model object (callers read e.g. its ``summary``)
    """
    # Prep for conditional survival (the helper is imported from DataCodingUtils)
    conditioned_table = prep_data_for_conditional_survival(
        source_table=source_table,
        source_table_survival_column=source_table_survival_column,
        time_passed=time_passed,
    )

    fit_options = {
        "duration_col": source_table_survival_column,
        "event_col": source_table_event_column,
        "show_progress": verbose,
    }

    # Fit the model on the prepared table and hand back the fitter itself
    model = CoxPHFitter()
    model.fit(conditioned_table, **fit_options)
    return model
def conditional_cox_cross_validation(
    source_table,
    source_table_survival_column,
    source_table_event_column,
    time_passed,
    number_of_folds,
):
    """
    Cross-validate a Cox regression conditional on the patients
    having survived to a certain point in time.

    Arguments
    ---------
    source_table: pd.DataFrame
        pandas DF containing time to event, event, and variables as columns
        NOTE: all variables should be either continuous or dummy variables
    source_table_survival_column: str
        name of time to event column
    source_table_event_column: str
        name of "event" column -- 1 means dead, 0 means censored
    time_passed: Union[int,float]
        time that has passed since start of study period
    number_of_folds: int
        passed to ``k_fold_cross_validation`` as ``k``

    Returns
    -------
    dict
        ``"c_index_folds"``: the scores returned by
        ``k_fold_cross_validation`` (scoring method "concordance_index");
        ``"N"``: number of rows in the prepared table
    """
    # Prep for conditional survival
    conditioned_table = prep_data_for_conditional_survival(
        source_table,
        source_table_survival_column=source_table_survival_column,
        time_passed=time_passed,
    )

    # Score an unfitted Cox model across the folds
    fold_scores = k_fold_cross_validation(
        fitters=CoxPHFitter(),
        df=conditioned_table,
        duration_col=source_table_survival_column,
        event_col=source_table_event_column,
        k=number_of_folds,
        scoring_method="concordance_index",
    )

    n_rows = conditioned_table.shape[0]
    return {"c_index_folds": fold_scores, "N": n_rows}
def get_conditional_survival(
    source_table, source_table_label, time_passed, time_to_survive
):
    """
    Conditional survival, expressed as a percentage.

    Computes::

        100 * table[time_passed + time_to_survive] / table[time_passed]

    where ``table[t]`` is ``source_table.loc[t, source_table_label]``.

    Arguments
    ---------
    source_table: pd.DataFrame
        table looked up by index label
        NOTE(review): presumably a survival-function table indexed by
        time -- confirm against callers
    source_table_label: str
        column to read the values from
    time_passed: Union[int,float]
        index label of the reference time (denominator)
    time_to_survive: Union[int,float]
        added to ``time_passed`` to get the index label of the numerator

    Returns
    -------
    the ratio of the two looked-up values multiplied by 100

    Raises
    ------
    KeyError
        if either index label or the column is not present in the table
    """
    # Numerator is looked up first, as before, so the reported missing key
    # is unchanged when both labels are absent.
    value_at_target = source_table.loc[
        time_passed + time_to_survive, source_table_label
    ]
    # No ``.copy()`` here: the looked-up value is only read, and a plain
    # Python scalar (e.g. from an object-dtype column) has no ``.copy``.
    value_at_reference = source_table.loc[time_passed, source_table_label]
    return (value_at_target / value_at_reference) * 100
def get_univariable_model(
    data_frame, outcome_str, time_passed, durationstr="SURVIVAL", K=5
):
    """
    Gets univariable cox regression models (one per variable), including
    model accuracy on testing set (cross-validation), training set, and
    final model coefficients.

    Arguments
    ---------
    data_frame: pd.DataFrame
        a pandas dataframe; contains the duration and outcome columns as
        well as the variables. Every column other than ``durationstr`` and
        ``outcome_str`` is modelled on its own.
    outcome_str: str
        eg 'OS' for overall survival -- column name of event indicator
    time_passed: int
        numerical, represents the number of years patients already lived
    durationstr: str
        duration column name
    K: int
        no of folds

    Returns
    --------
    pd.DataFrame
        one row per variable: ``N``, ``cindex_K-<fold>`` for each fold,
        the columns of the fitted model's ``summary``, the exponentiated
        confidence bounds and ``cindex_training``

    Raises
    ------
    ValueError
        if ``durationstr`` or ``outcome_str`` is not a column of
        ``data_frame``
    """
    print(
        "Getting Univariable model for",
        outcome_str,
        "after",
        time_passed,
        "year(s) have passed",
    )
    varnames = list(data_frame.columns)
    varnames.remove(durationstr)
    varnames.remove(outcome_str)

    model_df = pd.DataFrame(index=varnames)
    # Per-variable summaries are collected here and concatenated once at
    # the end instead of growing a DataFrame inside the loop.
    summaries = []

    for varname in varnames:
        print(" univariable model for", varname)
        source_table_slice = data_frame.loc[:, [varname, durationstr, outcome_str]]

        # testing accuracy (cross validation)
        cv_results = conditional_cox_cross_validation(
            source_table=source_table_slice,
            source_table_survival_column=durationstr,
            source_table_event_column=outcome_str,
            time_passed=time_passed,
            number_of_folds=K,
        )
        model_df.loc[varname, "N"] = cv_results["N"]
        for fold_number, fold_score in enumerate(
            cv_results["c_index_folds"], start=1
        ):
            model_df.loc[varname, "cindex_K-%d" % fold_number] = fold_score

        # model coefficients and training model fit
        cph = conditional_cox(
            source_table=source_table_slice,
            source_table_survival_column=durationstr,
            source_table_event_column=outcome_str,
            time_passed=time_passed,
            verbose=False,
        )
        cph_summary = cph.summary.copy()
        # NOTE(review): the "lower 0.95" / "upper 0.95" labels depend on the
        # installed lifelines version -- confirm they match cph.summary.
        cph_summary.loc[:, "exp(lower 0.95)"] = np.exp(cph_summary.loc[:, "lower 0.95"])
        cph_summary.loc[:, "exp(upper 0.95)"] = np.exp(cph_summary.loc[:, "upper 0.95"])

        # get_multivariable_model reads ``concordance_index_``; prefer the
        # same attribute here and fall back to ``score_`` (the attribute this
        # function used originally) when it is not available.
        training_cindex = getattr(cph, "concordance_index_", None)
        if training_cindex is None:
            training_cindex = cph.score_
        cph_summary.loc[:, "cindex_training"] = training_cindex

        summaries.append(cph_summary)

    # pd.concat rejects an empty list, so keep an empty frame when there
    # were no variables to model.
    coeffs_df = pd.concat(summaries, axis=0) if summaries else pd.DataFrame()

    # concat accuracy and coefficients for nice presentation
    model_df = pd.concat((model_df, coeffs_df), axis=1)
    print("done")
    return model_df
def get_multivariable_model(
    data_frame, outcome_str, time_passed, durationstr="SURVIVAL", K=5
):
    """
    Gets multivariable cox regression model, including model accuracy
    on testing set (cross-validation), training set, and final
    model coefficients.

    Arguments
    ---------
    data_frame: pd.DataFrame
        a pandas dataframe; contains the duration and outcome columns as
        well as the variables (a copy is used, the input is not modified
        by this function itself)
    outcome_str: str
        eg 'OS' for overall survival -- column name of event indicator
    time_passed: int
        represents the number of years patients already lived
    durationstr: str
        duration column name
    K: int
        no of folds; cross-validation is skipped when ``K`` is not positive

    Returns
    -------
    pd.DataFrame
        the fitted model's ``summary`` plus a ``cindex_training`` column
    pd.DataFrame or None
        the cross-validation results (``c_index_folds`` and ``N``
        columns), or ``None`` when ``K <= 0``
    """
    print(
        "Getting multivariable model for", outcome_str,
        "after", time_passed, "year(s) have passed",
    )

    working_table = data_frame.copy()

    # Arguments shared by the cross-validation call and the final fit
    shared_kwargs = {
        "source_table": working_table,
        "source_table_survival_column": durationstr,
        "source_table_event_column": outcome_str,
        "time_passed": time_passed,
    }

    # testing accuracy (cross validation) -- only when folds were requested
    cv_results = None
    if K > 0:
        fold_info = conditional_cox_cross_validation(
            number_of_folds=K, **shared_kwargs
        )
        cv_results = pd.DataFrame.from_dict(fold_info)

    # model coefficients and training model fit
    cph = conditional_cox(verbose=False, **shared_kwargs)
    training_summary = cph.summary.copy()
    training_summary.loc[:, "cindex_training"] = cph.concordance_index_

    print("done")
    return training_summary, cv_results