-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathmorgan_rf.py
executable file
·109 lines (83 loc) · 4.05 KB
/
morgan_rf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
Script for training a Random Forest model on fingerprint representations of molecules.
"""
import os
import warnings
import argparse
import pandas as pd
import numpy as np
from rdkit.Chem import MolFromSmiles, AllChem
from rdkit import DataStructs
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from helper import scaffold_split
from parse_data import parse_dataset
def generate_fingerprints(smile):
    """Compute a 1024-bit Morgan circular fingerprint for one molecule.

    Args:
        smile: SMILES string representation of a molecule.

    Returns:
        np.ndarray of dtype int8 holding the 1024 fingerprint bits.

    Raises:
        ValueError: if RDKit cannot parse the SMILES string.
    """
    mol = MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals parse failure by returning None rather than
        # raising; fail loudly here instead of letting the fingerprint call
        # below crash with an opaque RDKit ArgumentError.
        raise ValueError('Could not parse SMILES string: {}'.format(smile))
    # radius=3 circular fingerprint folded to 1024 bits.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024)
    # ConvertToNumpyArray resizes the target array to the fingerprint
    # length, so a zero-length array is a valid destination.
    array = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, array)
    return array
def fit_forest(X, y):
    """Fit and return a 100-tree Random Forest regressor on (X, y)."""
    # Hyperparameter search kept for reference; the fixed 100-tree forest
    # below is used instead:
    # params = {'n_estimators': [100, 1000, 10000], 'max_depth': [1, 2, 3], 'min_samples_split': [2, 4]}
    # search = GridSearchCV(RandomForestRegressor(), params, cv=5)
    # model = search.fit(X, y).best_estimator_
    regressor = RandomForestRegressor(n_estimators=100)
    regressor.fit(X, y)
    # sklearn's fit() returns the estimator itself, so this matches the
    # original `return model.fit(X, y)` exactly.
    return regressor
def main(args):
    """Train and evaluate an ECFP Random-Forest regressor.

    Runs `args.n_runs` repetitions of either K-fold random splitting or a
    single scaffold split, fits a forest on Morgan fingerprints of the
    training molecules each time, and prints per-split and aggregate
    R^2 / RMSE statistics.

    Args:
        args: argparse.Namespace with attributes task, subtask, split,
            n_runs and n_folds (see the argument parser at the bottom of
            this file).

    Raises:
        ValueError: if args.split is neither 'random' nor 'scaffold'.
    """
    warnings.filterwarnings('ignore')
    print('\nTraining ECFP-RF on ' + args.task + ' dataset')
    print('\nGenerating features...')
    if args.task=='IC50':
        print('Subtask: {}'.format(args.subtask))
    smiles_list, y = parse_dataset(args.task, subtask=args.subtask)
    X = np.arange(len(smiles_list)).reshape(-1,1) # array of data indices
    r2_list = []
    rmse_list = []
    print('\nBeginning training loop...')
    j = 0
    for i in range(args.n_runs):
        if args.split == 'random':
            kf = KFold(n_splits=args.n_folds, random_state=i, shuffle=True)
            split_list = kf.split(X)
        elif args.split == 'scaffold':
            train_ind, test_ind = scaffold_split(smiles_list, seed=i)
            # BUG FIX: the original `[train_ind, test_ind]` made the loop
            # below unpack each index array itself; wrap the pair in a
            # one-element list of tuples so it matches KFold.split() output.
            split_list = [(train_ind, test_ind)]
        else:
            # Previously an unknown split method left split_list undefined
            # and failed later with a NameError; fail fast instead.
            raise ValueError('Unknown split method: {}'.format(args.split))
        for train_ind, test_ind in split_list:
            y_train, y_test = y[train_ind], y[test_ind]
            smiles_df = pd.DataFrame(smiles_list, columns=['smiles'])
            train_smiles = smiles_df.iloc[train_ind]['smiles'].to_list()
            test_smiles = smiles_df.iloc[test_ind]['smiles'].to_list()
            # Fingerprints are generated per-fold from raw SMILES; only the
            # index array X is split above.
            X_train = np.asarray([generate_fingerprints(s) for s in train_smiles])
            X_test = np.asarray([generate_fingerprints(s) for s in test_smiles])
            m = fit_forest(X_train, y_train)
            y_pred = m.predict(X_test)
            score = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))
            r2_list.append(score)
            rmse_list.append(rmse)
            # np.savetxt('results/ecfp_'+task+'_split_'+split+'_run_'+str(j)+'_ypred.txt', y_pred)
            # np.savetxt('results/ecfp_'+task+'_split_'+split+'_run_'+str(j)+'_ytest.txt', y_test)
            # np.savetxt('results/ecfp_'+task+'_split_'+split+'_run_'+str(j)+'_ystd.txt', np.sqrt(y_var))
            j += 1
    # Standard error of the mean across all evaluated splits.
    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-task', type=str, default='IC50',
help='Dataset on which to train ECFP-RF')
parser.add_argument('-subtask', type=str, default='A2a',
help='Dataset on which to train ECFP-RF')
parser.add_argument('-split', type=str, default='random',
help='Train/Test splitting method. Possible choices: random/scaffold')
parser.add_argument('-n_runs', type=int, default=3,
help='number of runs for train/test split.')
parser.add_argument('-n_folds', type=int, default=5,
help='number of folds in K-fold cross-validation. Only for random splitting')
args = parser.parse_args()
main(args)