@@ -12,7 +12,7 @@
 from torch.utils.tensorboard import SummaryWriter
 from apex import amp, parallel
 from tqdm import tqdm
-from transformers import BertTokenizerFast
+from transformers import DistilBertTokenizerFast
 
 from model.dst_no_history import DST
 from config import Config
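Side note on the tokenizer swap: distilbert-base-uncased reuses BERT's uncased WordPiece vocabulary, so previously produced token ids should stay valid, but DistilBERT has no segment embeddings, so the fast tokenizer stops emitting token_type_ids. A standalone sanity-check sketch (not part of this commit; the sample sentence is illustrative):

    from transformers import BertTokenizerFast, DistilBertTokenizerFast

    bert = BertTokenizerFast.from_pretrained("bert-base-uncased")
    distil = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

    enc_b = bert("i need a cheap hotel in the north")
    enc_d = distil("i need a cheap hotel in the north")
    # should hold, since DistilBERT reuses BERT's uncased vocab
    assert enc_b["input_ids"] == enc_d["input_ids"]
    # DistilBERT has no segment ids, so the key is absent by default
    assert "token_type_ids" not in enc_d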
@@ -69,79 +69,48 @@ def train(model, reader, optimizer, writer, hparams, tokenizer):
         # learning rate scheduling
         for param in optimizer.param_groups:
             param["lr"] = learning_rate_schedule(train.global_step, train.max_iter, hparams)
+
+        prev_belief = None # belief for next turn
+        for turn_idx in range(turns):
+            distributed_batch_size = math.ceil(batch_size / hparams.num_gpus)
+
+            # distribute batches to each gpu
+            for key, value in inputs[turn_idx].items():
+                inputs[turn_idx][key] = distribute_data(value, hparams.num_gpus)[hparams.local_rank]
+            contexts[turn_idx] = distribute_data(contexts[turn_idx], hparams.num_gpus)[hparams.local_rank]
+            spans[turn_idx] = distribute_data(spans[turn_idx], hparams.num_gpus)[hparams.local_rank]
 
-        try:
-            prev_belief = None # belief for next turn
-            for turn_idx in range(turns):
-                distributed_batch_size = math.ceil(batch_size / hparams.num_gpus)
-
-                # split batches for gpu memory
-                context_len = 0
-                for idx in range(distributed_batch_size):
-                    context_len_ = len(contexts[turn_idx][idx])
-                    if context_len < context_len_:
-                        context_len = context_len_
-                if context_len >= 40:
-                    small_batch_size = min(int(hparams.batch_size/hparams.num_gpus / 2), distributed_batch_size)
-                else:
-                    small_batch_size = distributed_batch_size
-
-                # distribute batches to each gpu
-                for key, value in inputs[turn_idx].items():
-                    inputs[turn_idx][key] = distribute_data(value, hparams.num_gpus)[hparams.local_rank]
-                contexts[turn_idx] = distribute_data(contexts[turn_idx], hparams.num_gpus)[hparams.local_rank]
-                spans[turn_idx] = distribute_data(spans[turn_idx], hparams.num_gpus)[hparams.local_rank]
+            first_turn = (turn_idx == 0)
 
-                first_turn = (turn_idx == 0)
+            if not first_turn:
+                inputs[turn_idx]["belief_gen"] = prev_belief
 
-                if not first_turn:
-                    inputs[turn_idx]["belief_gen"] = prev_belief
+            optimizer.zero_grad()
+            loss, acc = model.forward(inputs[turn_idx], contexts[turn_idx], spans[turn_idx], first_turn) # loss: [batch], acc: [batch, slot]
+
+            if turn_idx+1 < turns:
+                prev_belief = inputs[turn_idx]["belief_gen"]
 
-                prev_belief = []
-
-                for small_batch_idx in range(math.ceil(distributed_batch_size/small_batch_size)):
-                    small_inputs = {}
-                    for key, value in inputs[turn_idx].items():
-                        small_inputs[key] = value[small_batch_size*small_batch_idx:small_batch_size*(small_batch_idx+1)]
-                    small_contexts = contexts[turn_idx][small_batch_size*small_batch_idx:small_batch_size*(small_batch_idx+1)]
-                    small_spans = spans[turn_idx][small_batch_size*small_batch_idx:small_batch_size*(small_batch_idx+1)]
-
-                    optimizer.zero_grad()
-                    loss, acc = model.forward(small_inputs, small_contexts, small_spans, first_turn) # loss: [batch], acc: [batch, slot]
-
-                    prev_belief.append(small_inputs["belief_gen"])
-
-                    total_loss += loss.sum(dim=0).item()
-                    slot_acc += acc.sum(dim=1).sum(dim=0).item()
-                    joint_acc += (acc.mean(dim=1) == 1).sum(dim=0).item()
-                    batch_count += small_batch_size
-                    loss = loss.mean(dim=0)
-
-                    # distributed training
-                    with amp.scale_loss(loss, optimizer) as scaled_loss:
-                        scaled_loss.backward()
-
-                    optimizer.step()
-                    torch.cuda.empty_cache()
-
-                prev_belief_ = []
-                for belief in prev_belief:
-                    prev_belief_ += belief
-                prev_belief = prev_belief_
-
-            total_loss = total_loss / batch_count
-            slot_acc = slot_acc / batch_count / len(ontology.all_info_slots) * 100
-            joint_acc = joint_acc / batch_count * 100
-            train.global_step += 1
-            if hparams.local_rank == 0:
-                writer.add_scalar("Train/loss", total_loss, train.global_step)
-                t.set_description("iter: {}, loss: {:.4f}, joint accuracy: {:.4f}, slot accuracy: {:.4f}".format(batch_idx+1, total_loss, joint_acc, slot_acc))
-        except RuntimeError as e:
-            if hparams.local_rank == 0:
-                print("\n!!! Error: {}".format(e))
-                print("batch size: {}, context length: {}".format(small_batch_size, context_len))
+            total_loss += loss.sum(dim=0).item()
+            slot_acc += acc.sum(dim=1).sum(dim=0).item()
+            joint_acc += (acc.mean(dim=1) == 1).sum(dim=0).item()
+            batch_count += distributed_batch_size
+            loss = loss.mean(dim=0)
+
+            # distributed training
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+
+            optimizer.step()
             torch.cuda.empty_cache()
-            exit(0)
+
+        total_loss = total_loss / batch_count
+        slot_acc = slot_acc / batch_count / len(ontology.all_info_slots) * 100
+        joint_acc = joint_acc / batch_count * 100
+        train.global_step += 1
+        if hparams.local_rank == 0:
+            writer.add_scalar("Train/loss", total_loss, train.global_step)
+            t.set_description("iter: {}, loss: {:.4f}, joint accuracy: {:.4f}, slot accuracy: {:.4f}".format(batch_idx+1, total_loss, joint_acc, slot_acc))
 
 def validate(model, reader, hparams, tokenizer):
     model.eval()
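Note on the hunk above: distribute_data is a project-local helper that this change leans on but the diff does not show. From its call sites (distribute_data(value, hparams.num_gpus)[hparams.local_rank]) it presumably splits a full batch into one shard per GPU rank; a hypothetical sketch consistent with that usage:

    import math

    def distribute_data(data, num_gpus):
        # Hypothetical reconstruction, not the repo's actual code:
        # cut the batch into num_gpus contiguous shards; the last shard
        # may be smaller when the batch does not divide evenly.
        shard = math.ceil(len(data) / num_gpus)
        return [data[i * shard:(i + 1) * shard] for i in range(num_gpus)]

Also worth noting from the new accounting: joint_acc increments only when acc.mean(dim=1) == 1, i.e. a turn counts as correct only if every slot in it is predicted correctly, while slot_acc counts individual slot hits.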
@@ -193,7 +162,6 @@ def validate(model, reader, hparams, tokenizer):
        t.set_description("iter: {}".format(batch_idx+1))
 
    model.train()
-    model.module.slot_encoder.eval()
    model.module.value_encoder.eval() # fix value encoder
    val_loss = val_loss / batch_count
    slot_acc = slot_acc / batch_count / len(ontology.all_info_slots) * 100
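Because model.train() flips every submodule back to training mode, the surviving line re-pins the value encoder; dropping slot_encoder.eval() means the slot encoder now behaves normally in training again. Keep in mind that .eval() only changes runtime behaviour (dropout, batch norm); if the "fix value encoder" comment means its weights should be frozen as well, that additionally requires disabling gradients. A minimal sketch of the distinction:

    model.train()                        # all submodules -> training mode
    model.module.value_encoder.eval()    # value encoder stays in eval mode

    # .eval() alone does not stop weight updates; freezing would also need:
    for p in model.module.value_encoder.parameters():
        p.requires_grad_(False)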
@@ -252,7 +220,7 @@ def load(model, optimizer, save_path):
     end = time.time()
     logger.info("Loaded. {} secs".format(end-start))
 
-    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
 
     model = DST(hparams).cuda()
     optimizer = Adam(model.parameters(), hparams.lr)
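For completeness: the Apex wiring itself sits outside this diff, but the from apex import amp, parallel import, the amp.scale_loss call in train, and the model.module.* accesses in validate all point to the standard setup after the model and optimizer are built. A sketch under that assumption (the "O1" opt_level is a guess, not taken from the repo):

    # Standard Apex AMP + DDP wiring (sketch); "O1" is an assumed opt_level.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    model = parallel.DistributedDataParallel(model)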