#!/usr/bin/env python
import imp
from toxicity_modul import *
import pandas as pd
'''
This script
- generates the SMILES features from the given dataset
- loads the preprocessed SMILES feature and label data
- predicts the toxicity of the chemical compounds from their SMILES strings
'''
def toxicity_prediction_weighted_targets_prebuilt(feat_data, target_index, unk_smiles):
    '''
    model: class weighted + toxicity of other targets
    input: smile feature data, target of interest, list of smiles
        feat_data    -- passed unchanged to data_prep_weighted_target
        target_index -- number used to build the model and output file names
        unk_smiles   -- list of smiles; becomes the row index of the output
    output: None. The predicted results are exported to the csv file
        'target<target_index>_model_wt_pred.csv' in the working directory.

    Best effort: any ordinary error is reported on stdout instead of being
    raised. KeyboardInterrupt and SystemExit are NOT swallowed, so the user
    can still abort a long run.
    '''
    try:
        print("toxcity prediction for target %d" % target_index)
        print('data preparation ...')
        # dataset for the corresponding target
        # NOTE(review): data_prep_weighted_target, max_row_size and load_model
        # are not defined in this file -- presumably they come from the
        # `from toxicity_modul import *` star import; confirm.
        X = data_prep_weighted_target(feat_data, target_index, max_row_size, unk_smiles)
        folder = 'pre-built_model/'
        my_model = load_model(folder + 'target' + str(target_index) + '_model_weighted_targets.h5')
        y = my_model.predict(X)
        # one prediction row per smiles string, one column named after the target
        predictions = pd.DataFrame(y)
        predictions.index = unk_smiles
        predictions.columns = ['target' + str(target_index)]
        file_out = 'target' + str(target_index) + '_model_wt_pred.csv'
        predictions.to_csv(file_out)
        print('Result is exported to file ' + file_out)
    except Exception as err:
        # Catch Exception rather than using a bare except, so Ctrl-C and
        # sys.exit() still propagate; also show what actually went wrong.
        print('please check your input')
        print('error detail: %s: %s' % (type(err).__name__, err))
def toxicity_prediction_weighted_prebuilt(feat_data, target_index, unk_smiles):
    '''
    model: class weighted
    input: smile feature data, target of interest, list of smiles
        feat_data    -- passed unchanged to data_prep_weighted
        target_index -- number used to build the model and output file names
        unk_smiles   -- list of smiles; becomes the row index of the output
    output: None. The predicted results are exported to the csv file
        'target<target_index>_model_w_pred.csv' in the working directory.

    Best effort: any ordinary error is reported on stdout instead of being
    raised. KeyboardInterrupt and SystemExit are NOT swallowed, so the user
    can still abort a long run.
    '''
    try:
        print("toxcity prediction for target %d" % target_index)
        print('data preparation ...')
        # dataset for the corresponding target
        # NOTE(review): data_prep_weighted, max_row_size and load_model are
        # not defined in this file -- presumably they come from the
        # `from toxicity_modul import *` star import; confirm.
        X = data_prep_weighted(feat_data, target_index, max_row_size, unk_smiles)
        folder = 'pre-built_model/'
        my_model = load_model(folder + 'target' + str(target_index) + '_model_weighted.h5')
        y = my_model.predict(X)
        # one prediction row per smiles string, one column named after the target
        predictions = pd.DataFrame(y)
        predictions.index = unk_smiles
        predictions.columns = ['target' + str(target_index)]
        file_out = 'target' + str(target_index) + '_model_w_pred.csv'
        predictions.to_csv(file_out)
        print('Result is exported to file ' + file_out)
    except Exception as err:
        # Catch Exception rather than using a bare except, so Ctrl-C and
        # sys.exit() still propagate; also show what actually went wrong.
        print('please check your input')
        print('error detail: %s: %s' % (type(err).__name__, err))
def get_unknown_compounds(processed_data, target_index, n_targets=12):
    '''
    This function returns the compound smiles whose toxicity for the target
    of interest is unknown (null in the label column).
    input: processed_data  smiles feature and label data; the row index holds
                           the smiles and the last n_targets columns are read
                           as the label columns of targets 1..n_targets
           target_index    1-based target number, 1 <= target_index <= n_targets
           n_targets       number of trailing label columns (default 12)
    output: The smiles strings, as a list of index labels.
    raises: IndexError if target_index is outside 1..n_targets. Without this
            check a value of 0 or below would silently select a feature
            column (or wrap around) instead of failing.
    '''
    if not 1 <= target_index <= n_targets:
        raise IndexError(
            'target_index must be between 1 and %d, got %r' % (n_targets, target_index)
        )
    # target 1 is the first of the trailing label columns, target n_targets the last
    label_position = processed_data.shape[1] - (n_targets + 1) + target_index
    labels = processed_data.iloc[:, label_position]
    # get the smiles which have no target activity values
    return labels.index[labels.isnull()].tolist()
def main():
    '''
    Load 'data/preprocessed_data.csv' and, for each target 1..12, run the
    pre-built class weighted model on the compounds whose label for that
    target is missing.
    '''
    data_dir = 'data/'

    # ------------------------ load processed data -----------------------------
    print('loading preprocessed data ...')
    csv_path = data_dir + 'preprocessed_data.csv'
    processed_data = pd.read_csv(csv_path, index_col=0)
    print('data loaded.')

    # ----------- predict the toxicity for the unknown compounds --------------
    print("model: class weighted + toxicity of other targets")
    # This model is currently disabled; only its banner is printed. To enable:
    # for i in range(1,13):
    #     unknown_smiles = get_unknown_compounds(processed_data,i)
    #     toxicity_prediction_weighted_targets_prebuilt(processed_data,i,unknown_smiles)

    # predict the toxicity for the unknown compounds
    print("model: class weighted")
    for target in range(1, 13):
        print('-----------target {}--------------'.format(target))
        toxicity_prediction_weighted_prebuilt(
            processed_data,
            target,
            get_unknown_compounds(processed_data, target),
        )
# Run the prediction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()