forked from Rayn2402/ThePetaleProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreplicate_study.py
More file actions
410 lines (310 loc) · 14.3 KB
/
replicate_study.py
File metadata and controls
410 lines (310 loc) · 14.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
"""
Filename: replicate_study.py
Author: Nicolas Raymond
Description: Script used to replicate the entire study
Date of last modification: 2022/08/08
"""
from argparse import ArgumentParser
from os import environ, listdir, makedirs, rename
from os.path import exists, join
from shutil import move
from subprocess import check_call
from time import time
from webbrowser import open_new_tab

from pandas import DataFrame, concat, read_csv

from settings.paths import Paths
from src.data.extraction.constants import DUMMY
from src.utils.metrics import AbsoluteError, Direction, ConcordanceIndex, Pearson,\
    RootMeanSquaredError, Sensitivity, Specificity
# Environment variable set as soon as the module is loaded; the scripts launched
# later with check_call inherit the environment of this process.
# NOTE(review): presumably forces MKL to use the GNU OpenMP threading layer to avoid
# a runtime conflict in the experiment scripts - confirm
environ['MKL_THREADING_LAYER'] = 'GNU'

# TASK CHOICES
VO2: str = 'vo2'
OB: str = 'obesity'

# DATA SOURCE CHOICES
ORIGINAL: str = 'original'
GENERATED: str = 'generated'

# GNN CHOICE
GAT: str = 'GAT'
GCN: str = 'GCN'

# MODEL CONVERSION DICTIONARY
# Maps the name of a model, as found in the index of the scores dataframe,
# to the command line flag that selects it in the evaluation scripts
MODEL_TO_ARGS: dict = {'RF': '-rf',
                       'XGBoost': '-xg',
                       'enet': '-enet',
                       'MLP': '-mlp',
                       'ggaeEnet': '-ggae'}
def add_delimiter(section: str) -> None:
    """
    Announces the beginning of a new section of the script on the standard output.
    The section name is surrounded by two groups of five stars and is
    preceded and followed by an empty line.

    Args:
        section: name of the section that is about to start

    Returns: None
    """
    banner = f"\n{'*'*5} {section} {'*'*5}\n"
    print(banner)
def argument_parser():
    """
    Creates a parser for the replication of the study and parses the command line

    Returns: namespace with the attributes 'data', 'task', 'genomics' and 'fast'
    """
    # Parser creation
    # The usage message shows the long option names; a single-dash '-task' or '-data'
    # would be read by argparse as '-t ask' / '-d ata' and rejected as an invalid choice
    parser = ArgumentParser(usage='python replicate_study.py --task ["vo2" or "obesity"]'
                                  ' --data ["original" or "generated"]')

    # Definition of arguments
    parser.add_argument('-d', '--data', type=str, default=ORIGINAL, choices=[ORIGINAL, GENERATED],
                        help=f'Choice of the source of data (default = {ORIGINAL})')

    parser.add_argument('-t', '--task', type=str, default=VO2, choices=[VO2, OB],
                        help=f'Choice of prediction task (default = {VO2})')

    parser.add_argument('-gen', '--genomics', default=False, action='store_true',
                        help='If true, includes the genomic variables (SNPs) in the experiments')

    parser.add_argument('-fast', '--fast', default=False, action='store_true',
                        help='If true, runs a fast version of the experiment with only 2'
                             ' data splits for the model evaluations and 2 inner data splits'
                             ' for the evaluation of each set of hyperparameter values (instead of'
                             ' 10)')

    # Argument parsing
    arguments = parser.parse_args()

    return arguments
def extract_best_graph_degree(df: DataFrame,
                              model: str) -> int:
    """
    Identifies the graph degree with which a given GNN obtained its best scores.
    Only the rows whose index label contains the model name are compared, and
    the degree is read from the characters that follow the first three
    characters of the winning label.

    Note: the comparison is delegated to find_best_model, which this file only
    defines inside its "if __name__ == '__main__'" section.

    Args:
        df: pandas dataframe with the scores of all models
        model: GCN or GAT

    Returns: degree (int)
    """
    # Any model other than the two supported GNNs is rejected
    if model not in [GAT, GCN]:
        raise ValueError(f'Model must be in {[GAT, GCN]}')

    # Rows related to the requested GNN
    model_rows = df.filter(like=model, axis=0)

    # Index label of the best configuration among these rows
    best_label = find_best_model(model_rows)

    # The label is the model name immediately followed by the degree
    degree = best_label[3:]

    return int(degree)
def summarize_experiments(result_directory: str,
                          task: str,
                          final_results: bool = False,
                          fast: bool = False) -> str:
    """
    Gathers the folders of the completed experiments into a new subfolder of the
    result directory, shortens their names, then calls the scripts that
    summarize the results into a csv file and an html file (opened in a browser tab)

    Args:
        result_directory: path of the directory created to store the results of the experiments
        task: name of the current task ('obesity' or 'vo2')
        final_results: if true, folders will be moved and renamed considering that they contain results
                       from the final tests on the holdout set
        fast: if true, files saved will mention that results were obtained with the fast version of the experiment

    Returns: path of the csv created
    """
    # Entries of the records directory that are not experiment folders
    ignored_entries = ['.gitkeep', OB, VO2, f'{OB}_fast', f'{VO2}_fast']
    experiment_folders = [entry for entry in listdir(Paths.EXPERIMENTS_RECORDS)
                          if entry not in ignored_entries]

    # The keywords identifying the experiment are read from the first folder name
    keywords = experiment_folders[0].split('_')[1:]
    if fast:
        keywords.insert(1, 'fast')

    # Name of the subfolder that will hold the experiment folders
    if final_results:
        subfolder_name = '_'.join(keywords[2:] + ['holdout'])
    else:
        subfolder_name = '_'.join(keywords[1:])
    new_directory = join(result_directory, subfolder_name)
    makedirs(new_directory)

    # Every experiment folder is moved into the new subfolder
    for folder in experiment_folders:
        move(join(Paths.EXPERIMENTS_RECORDS, folder), new_directory)

    # Folders are renamed with the first part of their name only,
    # completed by the third part in the case of the final results
    for folder in listdir(new_directory):
        name_parts = folder.split('_')
        if final_results:
            new_name = '_'.join([name_parts[0], name_parts[2]])
        else:
            new_name = name_parts[0]
        rename(join(new_directory, folder), join(new_directory, new_name))

    # Classification metrics are only computed for the obesity prediction task
    if task == OB:
        metrics_script = join(Paths.POST_ANALYSES_SCRIPTS, task, 'get_classification_metrics.py')
        check_call(args=['python', metrics_script, '-p', new_directory])

    # Name of the csv summarizing the scores of all models
    if final_results:
        csv_name = '_'.join([keywords[0]] + keywords[2:] + ['holdout'])
    else:
        csv_name = '_'.join(keywords)

    # Creation of the csv
    scores_script = join(Paths.UTILS_SCRIPTS, 'get_scores_csv.py')
    check_call(args=['python', scores_script, '-p', new_directory, '-fn', csv_name])

    # Creation of the interactive html recap, then display in the browser
    recap_script = join(Paths.UTILS_SCRIPTS, 'create_experiments_recap.py')
    check_call(args=['python', recap_script, '-p', new_directory, '-fn', 'recap'])
    open_new_tab(join(new_directory, 'recap.html'))

    return join(Paths.CSV_FILES, f'{csv_name}.csv')
def reformat_cell(cell_content: str, direction: str) -> float:
    """
    Converts a cell formatted as "mean +- std" into a single float.
    One percent of the standard deviation is added to the mean when the
    metric has to be minimized and subtracted from it otherwise.

    Args:
        cell_content: str contained in a dataframe cell
        direction: "minimize" or "maximize"

    Returns: float
    """
    pieces = cell_content.split(' +- ')
    mean, std = float(pieces[0]), float(pieces[1])
    adjustment = std * 0.01

    return mean + adjustment if direction == Direction.MINIMIZE else mean - adjustment
def reformat_cell_for_min(cell_content: str) -> float:
    """
    Shortcut to reformat_cell for the cells of a metric that has to be minimized

    Args:
        cell_content: str contained in a dataframe cell

    Returns: float
    """
    return reformat_cell(cell_content=cell_content, direction=Direction.MINIMIZE)
def reformat_cell_for_max(cell_content: str) -> float:
    """
    Shortcut to reformat_cell for the cells of a metric that has to be maximized

    Args:
        cell_content: str contained in a dataframe cell

    Returns: float
    """
    return reformat_cell(cell_content=cell_content, direction=Direction.MAXIMIZE)
if __name__ == '__main__':

    # We start a timer for the whole experiment
    experiment_start = time()
    args = argument_parser()

    """
    0. Setup
    """
    # We set main variables according to the prediction task
    metrics = [AbsoluteError(), ConcordanceIndex(), Pearson(), RootMeanSquaredError()]
    model_args = ['-rf', '-xg', '-enet', '-mlp']
    if args.task == OB:
        learning_set_path = Paths.OBESITY_LEARNING_SET_CSV
        holdout_set_path = Paths.OBESITY_HOLDOUT_SET_CSV
        eval_mask_path = Paths.OBESITY_MASK
        holdout_mask_path = Paths.OBESITY_HOLDOUT_MASK
        feature_args = ['-b', '-f']
        metrics += [Sensitivity(), Specificity()]
        if args.genomics:
            model_args.append('-ggae')
            feature_args += ['-gen']
    else:
        learning_set_path = Paths.VO2_LEARNING_SET_CSV
        holdout_set_path = Paths.VO2_HOLDOUT_SET_CSV
        eval_mask_path = Paths.VO2_MASK
        holdout_mask_path = Paths.VO2_HOLDOUT_MASK
        feature_args = ['-b', '-s', '-r_w']
        if args.genomics:
            model_args.append('-ggae')
            feature_args += ['-gen', '-f']

    # The fast version passes '-k 2 -l 2' to the evaluation scripts
    if args.fast:
        feature_args += ['-k', '2', '-l', '2']

    # Names of the metrics, grouped by optimization direction
    to_minimize = [metric.name for metric in metrics if metric.direction == Direction.MINIMIZE]
    to_maximize = [metric.name for metric in metrics if metric.direction == Direction.MAXIMIZE]

    # We create of a new directory specific to the task
    result_folder = join(Paths.EXPERIMENTS_RECORDS, args.task)
    result_folder = result_folder + "_fast" if args.fast else result_folder
    makedirs(result_folder, exist_ok=True)

    def reformat_df(dataframe: DataFrame) -> DataFrame:
        """
        Reformat all the cells of a dataframe for further analyses.
        The metric columns of the dataframe received are modified in place.

        Args:
            dataframe: dataframe with the scores of each model

        Returns: dataframe
        """
        for metric in to_minimize:
            dataframe[metric] = dataframe[metric].map(reformat_cell_for_min)
        for metric in to_maximize:
            dataframe[metric] = dataframe[metric].map(reformat_cell_for_max)

        return dataframe

    # Creation of a function specific to the metrics associated with the task
    def find_best_model(dataframe: DataFrame) -> str:
        """
        Retrieves the model that leads in the greatest number of metrics

        Args:
            dataframe: dataframe with the scores of each model

        Returns: name of the model
        """
        # For each row, we count the metrics for which it holds the best value of the column
        count_min = ((dataframe[to_minimize] == dataframe[to_minimize].min()).sum(axis=1))
        count_max = ((dataframe[to_maximize] == dataframe[to_maximize].max()).sum(axis=1))

        return (count_min + count_max).idxmax()

    """
    1. Data preparation
    """
    # If we want to use the generated data
    add_delimiter("1. Data preparation")
    if args.data == GENERATED:

        # We add an argument necessary for the experiments
        feature_args.append('-from_csv')

        # We create the learning set and the holdout set if they don't already exists
        if not exists(learning_set_path) or not exists(holdout_set_path):
            csv_path = join(Paths.DATA, f"{args.task}_dataset.csv")
            script_args = ['-csv', csv_path, '-tc', DUMMY, '-cat', '-nt', args.task.upper()]
            check_call(args=['python', join(Paths.UTILS_SCRIPTS, "generate_experiment_tables.py"), *script_args])

        # NOTE(review): the two mask creation steps below are kept inside the generated data
        # branch because they read the learning set csv and pass '-from_csv' - confirm nesting

        # We create the stratified random sampling masks for the evaluation of models
        if not exists(eval_mask_path):
            script_args = ['-csv', learning_set_path, '-tc', DUMMY, '-cat', '-fn', f"{args.task}_mask"]
            check_call(args=['python', join(Paths.UTILS_SCRIPTS, "generate_masks.py"), *script_args])

        # We create the stratified random sampling masks for the final tests
        if not exists(holdout_mask_path):
            check_call(args=['python', join(Paths.EXPERIMENTS_SCRIPTS, args.task, 'holdout_mask_creation.py'),
                             '-from_csv'])

    """
    2.1 Evaluation of models with manually selected hyperparameters
    """
    add_delimiter("2.1 Evaluation of models - Manual")
    manual_script_path = join(Paths.EXPERIMENTS_SCRIPTS, args.task, 'manual_evaluations.py')

    # Degrees 4, 6, 8 and 10 are evaluated for the GNNs
    graph_args = ['-deg'] + [str(2 * i) for i in range(2, 6)] + ['-cond_col']
    check_call(args=['python', manual_script_path, *feature_args, *model_args, '-gcn', '-gat', *graph_args])

    """
    2.2 Results compilation
    """
    add_delimiter("2.2 Results compilation - Manual")

    # We compile and summarize results
    manual_scores_csv = summarize_experiments(result_folder, args.task, fast=args.fast)

    # We extract the best observed degrees associated to each GNNs
    df = reformat_df(read_csv(manual_scores_csv, index_col=0))
    gat_k = extract_best_graph_degree(df, GAT)
    gcn_k = extract_best_graph_degree(df, GCN)

    """
    3.1 Evaluation of models with automated hyperparameter optimization
    """
    add_delimiter("3.1 Evaluation of models - Automated")
    automated_script_path = join(Paths.EXPERIMENTS_SCRIPTS, args.task, 'automated_evaluations.py')
    check_call(args=['python', automated_script_path, *feature_args, *model_args])
    check_call(args=['python', automated_script_path, *feature_args, '-gat', '-deg', str(gat_k), '-cond_col'])
    check_call(args=['python', automated_script_path, *feature_args, '-gcn', '-deg', str(gcn_k), '-cond_col'])

    """
    3.2 Results compilation - Automated
    """
    add_delimiter("3.2 Results compilation - Automated")

    # We compile and summarize results
    automated_scores_csv = summarize_experiments(result_folder, args.task, fast=args.fast)

    """
    4. Selection of the best model
    """
    add_delimiter("4. Selection of the best model")

    # We load the scores obtained during the manual and the automated evaluation
    # (concat is used because DataFrame.append is no longer available in recent pandas versions)
    df = concat([read_csv(manual_scores_csv, index_col=0),
                 read_csv(automated_scores_csv, index_col=0)])

    # We find the best model for the final test
    best_model = find_best_model(reformat_df(df))
    print(f"Best model : {best_model}")

    # We infer the correct arguments for the final test
    if GCN in best_model or GAT in best_model:

        # The first three characters of the name give the GNN flag ('-gcn' or '-gat')
        # and the remaining characters give the degree of the graph
        arguments = [f'-{best_model[0:3].lower()}', '-deg', best_model[3:], '-cond_col', '-holdout']
    else:
        arguments = [MODEL_TO_ARGS[best_model], '-holdout']

    """
    5. Final tests
    """
    add_delimiter("5.1 Final test - Manual")
    check_call(args=['python', manual_script_path, *feature_args, *arguments])

    add_delimiter("5.2 Final test - Automated")
    check_call(args=['python', automated_script_path, *feature_args, *arguments])

    """
    5.3 Final results compilation
    """
    add_delimiter("5.3 Final results compilation")

    # We compile and summarize results
    _ = summarize_experiments(result_folder, args.task, final_results=True, fast=args.fast)

    print(f"Total time of the experiment (min): {(time() - experiment_start) / 60:.2f}")