23
23
from vqa_dataset_attention import *
24
24
import torch .nn as nn
25
25
import random
26
+ import utils
26
27
27
28
def instance_bce_with_logits (logits , labels ):
28
29
assert logits .dim () == 2
@@ -38,6 +39,56 @@ def compute_score_with_logits(logits, labels):
38
39
scores = (one_hots * labels )
39
40
return scores
40
41
42
def evaluate_model(model, valid_dataloader, device):
    """Evaluate `model` on every batch of `valid_dataloader`.

    The function does not change the model's train/eval mode itself; the
    callers in this file wrap it in ``model.train(False)`` /
    ``model.train(True)``.

    Args:
        model: callable invoked as ``model(feat, quest, target)`` and
            returning the prediction logits for the batch.
        valid_dataloader: iterable yielding ``(feat, quest, label, target)``
            batches; must expose ``.dataset`` so that ``len()`` gives the
            number of samples used for averaging.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        A ``(score, upper_bound, V_loss)`` tuple, each divided by
        ``len(valid_dataloader.dataset)``:
        ``score`` is the summed output of ``compute_score_with_logits``,
        ``upper_bound`` is the summed per-row maximum of ``target``, and
        ``V_loss`` is the batch-size-weighted loss (a Python float).
    """
    score = 0
    upper_bound = 0
    V_loss = 0
    print('Validation started')

    # No gradients are needed during validation; disabling autograd avoids
    # building a graph for every forward pass.
    with torch.no_grad():
        for feat, quest, label, target in tqdm(valid_dataloader):
            feat = feat.to(device)
            quest = quest.to(device)
            target = target.to(device)  # true labels

            pred = model(feat, quest, target)
            loss = instance_bce_with_logits(pred, target)
            # Weight the batch loss by batch size so the final division by
            # the dataset length yields a per-sample average.
            V_loss += loss.item() * feat.size(0)
            score += compute_score_with_logits(pred, target.data).sum()
            upper_bound += (target.max(1)[0]).sum()

    num_samples = len(valid_dataloader.dataset)
    score = score / num_samples
    V_loss /= num_samples
    upper_bound = upper_bound / num_samples
    print(score, V_loss)
    return score, upper_bound, V_loss
70
+
71
def single_batch_run(model, train_dataloader, valid_dataloader, device, output_folder, optim):
    """Smoke-test the pipeline: one optimisation step, then a full validation.

    Takes the first batch from `train_dataloader`, performs a single
    forward/backward/optimizer step on it, evaluates the model on
    `valid_dataloader` and writes the validation metrics to
    ``log_single_batch.txt`` inside `output_folder`.

    Args:
        model: network invoked as ``model(feat, quest, target)``.
        train_dataloader: iterable yielding ``(feat, quest, label, target)``
            batches; only the first batch is consumed.
        valid_dataloader: loader passed on to ``evaluate_model``.
        device: device the training batch is moved to; also forwarded to
            ``evaluate_model``.
        output_folder: directory in which the log file is created.
        optim: optimizer whose ``step()`` / ``zero_grad()`` are called once.
    """
    feat_train, quest_train, label_train, target_train = next(iter(train_dataloader))
    # Use the `device` argument rather than a module-level name so the
    # function works regardless of what globals the caller has defined.
    feat_train = feat_train.to(device)
    quest_train = quest_train.to(device)
    target_train = target_train.to(device)  # true labels

    pred = model(feat_train, quest_train, target_train)
    loss = instance_bce_with_logits(pred, target_train)
    logger = utils.Logger(os.path.join(output_folder, 'log_single_batch.txt'))

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 0.25)
    optim.step()
    optim.zero_grad()

    # Switch to inference mode for validation, then restore training mode.
    model.train(False)
    eval_score, bound, V_loss = evaluate_model(model, valid_dataloader, device)
    model.train(True)
    logger.write('\t eval loss: %.3f, score: %.3f (%.3f)' % (V_loss, 100 * eval_score, 100 * bound))
91
+
41
92
def parse_args ():
42
93
parser = argparse .ArgumentParser ()
43
94
parser .add_argument ('--eval' , action = 'store_true' , help = 'set this to evaluate.' )
@@ -52,7 +103,7 @@ def parse_args():
52
103
parser .add_argument ('--norm' , type = str , default = 'weight' , help = 'weight, batch, layer, none' )
53
104
parser .add_argument ('--model' , type = str , default = 'A3x2' )
54
105
parser .add_argument ('--output' , type = str , default = 'saved_models/' )
55
- parser .add_argument ('--batch_size' , type = int , default = 128 )
106
+ parser .add_argument ('--batch_size' , type = int , default = 512 )
56
107
parser .add_argument ('--weight_decay' , type = float , default = 0 )
57
108
parser .add_argument ('--optimizer' , type = str , default = 'Adamax' , help = 'Adam, Adamax, Adadelta, RMSprop' )
58
109
parser .add_argument ('--initializer' , type = str , default = 'kaiming_normal' )
@@ -67,11 +118,14 @@ def parse_args():
67
118
feats_data_path = "/data/digbose92/VQA/COCO/train_hdf5_COCO/"
68
119
data_root = "/proj/digbose92/VQA/VisualQuestion_VQA/common_resources"
69
120
npy_file = "../../VisualQuestion_VQA/Visual_All/data/glove6b_init_300d.npy"
121
+ output_folder = "/proj/digbose92/VQA/VisualQuestion_VQA/Visual_Attention/results"
70
122
seed = 0
71
123
args = parse_args ()
72
124
#device_selection
73
- device = 1
74
- torch .cuda .set_device (device )
125
+ device_ids = [0 ,1 ]
126
+ #device_select=1
127
+ #torch.cuda.set_device(device_select)
128
+ device = torch .device ("cuda" if torch .cuda .is_available () else "cpu" )
75
129
76
130
if args .seed == 0 :
77
131
seed = random .randint (1 , 10000 )
@@ -84,16 +138,22 @@ def parse_args():
84
138
torch .cuda .manual_seed (args .seed )
85
139
86
140
#train dataset
87
- train_dataset = Dataset_VQA (img_root_dir = image_root_dir ,feats_data_path = feats_data_path ,dictionary = dictionary ,dataroot = data_root ,arch_choice = "resnet152" ,layer_option = "pool" )
88
- train_loader = DataLoader (train_dataset , batch_size = args .batch_size , shuffle = True , num_workers = 8 )
141
+ train_dataset = Dataset_VQA (img_root_dir = image_root_dir ,feats_data_path = feats_data_path ,dictionary = dictionary ,choice = 'train' ,dataroot = data_root ,arch_choice = "resnet152" ,layer_option = "pool" )
142
+ valid_dataset = Dataset_VQA (img_root_dir = image_root_dir ,feats_data_path = feats_data_path ,dictionary = dictionary ,choice = 'val' ,dataroot = data_root ,arch_choice = "resnet152" ,layer_option = "pool" )
143
+
144
+ train_loader = DataLoader (train_dataset , batch_size = args .batch_size , shuffle = True , num_workers = 10 )
145
+ val_loader = DataLoader (valid_dataset , batch_size = args .batch_size , shuffle = False , num_workers = 8 )
146
+ print (len (train_loader ))
147
+ print (len (val_loader ))
89
148
total_step = len (train_loader )
90
149
91
150
#model related issues
92
151
model = attention_baseline (train_dataset , num_hid = args .num_hid , dropout = args .dropout , norm = args .norm ,\
93
152
activation = args .activation , drop_L = args .dropout_L , drop_G = args .dropout_G ,\
94
153
drop_W = args .dropout_W , drop_C = args .dropout_C )
95
154
96
- model = model .to (device )
155
+ #model=model.to(device_select)
156
+
97
157
98
158
if args .initializer == 'xavier_normal' :
99
159
model .apply (weights_init_xn )
@@ -105,7 +165,9 @@ def parse_args():
105
165
model .apply (weights_init_ku )
106
166
107
167
model .w_emb .init_embedding (npy_file )
108
-
168
+ if torch .cuda .device_count () > 1 :
169
+ print ("Let's use" , torch .cuda .device_count (), "GPUs!" )
170
+ model = torch .nn .DataParallel (model , device_ids = device_ids ).to (device )
109
171
110
172
if args .optimizer == 'Adadelta' :
111
173
optim = torch .optim .Adadelta (model .parameters (), rho = 0.95 , eps = 1e-6 , weight_decay = args .weight_decay )
@@ -115,39 +177,70 @@ def parse_args():
115
177
optim = torch .optim .Adam (model .parameters (), lr = 0.001 , betas = (0.9 , 0.999 ), eps = 1e-08 , weight_decay = args .weight_decay )
116
178
else :
117
179
optim = torch .optim .Adamax (model .parameters (), weight_decay = args .weight_decay )
118
-
180
+
181
+ logger = utils .Logger (os .path .join (output_folder , 'log.txt' ))
182
+ best_eval_score = 0
119
183
print ('Starting training' )
184
+
185
+ # placeholder for checking whether training and testing work
186
+ #single_batch_run(model,train_loader,val_loader,device_select,output_folder,optim)
187
+
188
+ device_select = 0
189
+
120
190
for epoch in range (args .epochs ):
121
191
total_loss = 0
122
192
train_score = 0
123
193
t = time .time ()
124
194
correct = 0
125
195
step = 0
196
+ start_time = time .time ()
126
197
for i , (feat , quest , label , target ) in enumerate (train_loader ):
198
+
127
199
feat = feat .to (device )
128
200
quest = quest .to (device )
129
201
target = target .to (device ) # true labels
130
202
131
203
pred = model (feat , quest , target )
132
204
loss = instance_bce_with_logits (pred , target )
133
- print (loss )
205
+ # print(loss)
134
206
loss .backward ()
135
- nn .utils .clip_grad_norm (model .parameters (), 0.25 )
207
+ nn .utils .clip_grad_norm_ (model .parameters (), 0.25 )
136
208
optim .step ()
137
209
optim .zero_grad ()
138
210
139
211
batch_score = compute_score_with_logits (pred , target .data ).sum ()
140
212
total_loss += loss .item () * feat .size (0 )
141
213
train_score += batch_score
142
214
if (step % 10 == 0 ):
143
- #optimizer.zero_grad()
144
- print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
145
- .format (epoch , args .epochs , step , total_step , loss .item ()))
215
+ end_time = time .time ()
216
+ time_elapsed = end_time - start_time
217
+
218
+ print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Time elapsed: {:.4f}'
219
+ .format (epoch , args .epochs , step , total_step , loss .item (), time_elapsed ))
220
+ start_time = end_time
146
221
step = step + 1
147
222
148
223
total_loss /= len (train_loader .dataset )
149
224
train_score = 100 * train_score / len (train_loader .dataset )
150
225
226
+ print ('Epoch [{}/{}], Training Loss: {:.4f}, Training Accuracy {:.4f}'
227
+ .format (epoch , args .epochs , total_loss , train_score ))
228
+
229
+ model .train (False )
230
+ eval_score , bound , V_loss = evaluate_model (model , val_loader , device )
231
+ model .train (True )
232
+
233
+ logger .write ('epoch %d, time: %.2f' % (epoch , time .time ()- t ))
234
+ logger .write ('\t train_loss: %.3f, score: %.3f' % (total_loss , train_score ))
235
+ logger .write ('\t eval loss: %.3f, score: %.3f (%.3f)' % (V_loss , 100 * eval_score , 100 * bound ))
236
+
237
+ if eval_score > best_eval_score :
238
+ model_path = os .path .join (output_folder , 'model.pth' )
239
+ torch .save (model .state_dict (), model_path )
240
+ best_eval_score = eval_score
241
+
242
+
243
+
151
244
152
245
153
246
0 commit comments