-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathPAM_repeat1000times-regression.py
212 lines (150 loc) · 7.32 KB
/
PAM_repeat1000times-regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import KFold, GridSearchCV
import time
import random
from utils import data_handler, plotter
#handle warnings
import warnings
warnings.filterwarnings("ignore")
np.random.seed(3)
def test(y_pred,y_true,end_ptr):
'''Evaluate the performance of PAM through 3 metrics;
And find the best predicted condition for the next PAM trial
Inputs:
X_test - for testing
y_true - ground true for X_test
end_ptr - range of [0,end_ptr] in y_pred and y_true are values observable to PAM
Uses:(global variables)
pred_result - of shape (total_samp,), storing all the results predicted by PAM
true_results- of shape (total_samp,), the same order as of pred_results, storing all the ground truth
Outputs:
'''
#All sample metrics
r2 = metrics.r2_score(y_true,y_pred)
mse = metrics.mean_squared_error(y_true,y_pred)
[pear,p_value] = pearsonr(y_true,y_pred)
#metrics based on observable samples
y_true_s = y_true[0:(end_ptr+1)]
y_pred_s = y_pred[0:(end_ptr+1)]
r2_s = metrics.r2_score(y_true_s,y_pred_s)
mse_s = metrics.mean_squared_error(y_true_s,y_pred_s)
[pear_s,p_value_s] = pearsonr(y_true_s,y_pred_s)
return [r2,pear,p_value,mse,r2_s,pear_s,p_value_s,mse_s] #, best_pos_ind, best_prob
# data handling
X_df,Y_df = data_handler.load_XY(1)
X = X_df.as_matrix()
Y = Y_df.as_matrix() / 100
# setup and construct initial training dataset
# cross validation settup
inner_nsplits = 10
init_train_size = 20
totalSamp = X.shape[0]
Y_global_max = np.max(Y)
all_ind = np.random.permutation(list(range(0,totalSamp)))
all_ind_wo_max = list(range(0,totalSamp))
all_ind_wo_max.remove(0)
# PAM guided synthesis
def PAM_regression(save_csv= False, verbose=False, to_break=True, title='cqd_PAM_',batch=1):
## start PAM guided synthesis...
init_time = time.time()
Nc = 0
#construct initial training set
results_mat = np.zeros(((totalSamp-init_train_size),12))
train_ind = random.sample(all_ind_wo_max, init_train_size)
test_ind = [x for x in all_ind if x not in train_ind]
if(verbose):
print('initial training set indexes',train_ind)
# set up result storage to compute eval metrics, in the order of PAM
# ignore the initial training set, as it is not determined by PAM
pred_results = np.zeros(totalSamp-init_train_size)
true_results = np.zeros(totalSamp-init_train_size)
# setup the hyperparameter range for tuning
tuned_parameters = dict(learning_rate=[0.01],
n_estimators=[300,500,700], #100,,300,400,500
colsample_bylevel = [0.5,0.7,0.9],
gamma=[0,0.2], #0,0.1,0.2,0.3,0.4
max_depth =[3,7,11], # [3,7,11]]
reg_lambda = [0.1,1,10], #[0.1,1,10]
# reg_alpha = [1],
subsample=[0.4,0.7,1])
j=0
loop_count = 0
mean_y_only_init = np.mean(Y[train_ind])
std_y_only_init = np.std(Y[train_ind])
while(j<totalSamp-init_train_size):
inner_cv = KFold(n_splits=inner_nsplits,shuffle=True, random_state=j)
X_train = X[train_ind]
Y_train = Y[train_ind]
X_test = X[test_ind]
Y_test = Y[test_ind]
last_max = np.max(Y_train)
# GradientBoost
reg = xgb.XGBRegressor(objective="reg:linear",min_child_weight=1,**{'tree_method':'exact'},
silent=True,n_jobs=4,random_state=3,seed=3);
gb_clf = GridSearchCV(reg,tuned_parameters, cv=inner_cv,scoring='r2',verbose=0,n_jobs=4)
gb_clf.fit(X_train, Y_train)
y_pred = gb_clf.predict(X_test)
# choose the batch of conditions with best predicted yield
best_pos_ind = np.argsort(-y_pred)[:batch]
best_prob = y_pred[best_pos_ind]
next_ind = np.array(test_ind)[best_pos_ind]
# update results storage
train_size = len(Y_train)
temp = list(range(0,len(y_pred)))
ind_notbest = [x for x in temp if x not in best_pos_ind]
start_ptr = j
end_ptr = np.min([start_ptr + batch, totalSamp-init_train_size])
pred_results[start_ptr:end_ptr] = best_prob
pred_results[end_ptr:totalSamp-init_train_size]= y_pred[ind_notbest]
true_results[start_ptr:end_ptr] = Y_test[best_pos_ind]
true_results[end_ptr:totalSamp-init_train_size] = Y_test[ind_notbest]
pred_metrics = test(pred_results,true_results,end_ptr-1)
# calculate results
next_best_true_ind = next_ind[np.argmax(Y_test[best_pos_ind])]
next_best_y_true = np.max(Y_test[best_pos_ind])
result_list = [train_size,next_best_true_ind,next_best_y_true,best_prob[0],] + pred_metrics
results_mat[loop_count,:] = np.array(result_list)
loop_count = loop_count +1
j = j + batch
if(verbose):
print(loop_count,'->',j,', best_next_ind=',next_best_true_ind, ' best_Y_true=',"{0:.6f}".format(next_best_y_true),' train_max=',"{0:.6f}".format(last_max),' r2=',pred_metrics[0])
train_ind = [*train_ind , *next_ind]
test_ind = [x for x in test_ind if x not in next_ind]
## critical point
if(next_best_y_true==Y_global_max and Nc == 0):
Nc = j+init_train_size
if(to_break):
break
saved_title = '-'
if(save_csv):
results = pd.DataFrame(data=results_mat[0:j,:],columns=['sample_size','pred_ind','best_pred_result','y_true','r2','pearson','p_value','mse','r2_s','pearson_s','p_value_s','mse_s'])
saved_title = data_handler.save_csv(results,title=title)
# compute stats
mean_y_wo_init = np.mean(true_results[0:j])
std_y_wo_init = np.std(true_results[0:j])
mean_y_w_init = np.mean(Y[train_ind])
std_y_w_init = np.std(Y[train_ind])
run_time = (time.time() - init_time)/60
return [saved_title, Nc,mean_y_wo_init,std_y_wo_init,mean_y_w_init,std_y_w_init,mean_y_only_init,std_y_only_init, run_time]
outer_loop = 10
inner_loop = 100
print('start PAM for ',str(outer_loop*inner_loop),' times...')
# save the results some repetitions for backup
for j in range(0,outer_loop):
#PAM_results = np.zeros((inner_loop,9))
init_time = time.time()
res_arr = []
for i in range(0,inner_loop):
loop_count = j*inner_loop + i
result = PAM_regression(save_csv= False, verbose=False, to_break = True, title='cqd_PAM_'+str(loop_count)+'th_loop_')
res_arr.append(result)
print(str(loop_count),' -> ',str(result[0]),' time=',result[len(result)-1])
PAM_df = pd.DataFrame(data=res_arr, columns=['file-name','num_experiments','mean_y_wo_init','std_y_wo_init','mean_y_w_init','std_y_w_init','mean_y_only_init','std_y_only_init', 'run_time'])
saved_path = data_handler.save_csv(PAM_df,title='cqd_PAM_'+str(inner_loop)+'times_')
print('total = ',str((time.time()-init_time)/3600),' hrs >>-------saved')