Commit 622a9f6

updating
1 parent 2a5d422 commit 622a9f6

6 files changed (+64, -13 lines)

Diff for: config/config.yaml (+6)

@@ -22,6 +22,8 @@ dataset:
 
 training:
   model_type:
+    clip_enc: True
+    lmm_model: True
     encoder: "clip"
     decoder1: "roberta"
     decoder2: "llama2"
@@ -45,3 +47,7 @@ training:
   cnn_encoder:
     input_channels: 4
     hid_dim: 256
+
+  adaptor:
+    in_dim: 768
+    features: [512,256,128,64]

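run.py reads these keys as attribute-style config values (cfg.training.adaptor.in_dim, cfg.training.model_type.clip_enc). A minimal sketch of loading them, assuming an OmegaConf-style loader; the repo's actual config plumbing may differ:

from omegaconf import OmegaConf

# Load the YAML file extended by this commit (path as in the repo).
cfg = OmegaConf.load("config/config.yaml")

# New model_type flags; clip_enc gates the image-loading branch in src/training.py.
print(cfg.training.model_type.clip_enc)   # True
print(cfg.training.model_type.lmm_model)  # True

# New adaptor block, consumed by models.adaptor.Adaptor in run.py.
print(cfg.training.adaptor.in_dim)        # 768
print(cfg.training.adaptor.features)      # [512, 256, 128, 64]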
Diff for: models/adaptor.py (+1 -1)

@@ -24,4 +24,4 @@ def forward(self, x_clip, x_roberta):
         x = torch.cat((xc,xr), dim=0)
         x = torch.flatten(x, start_dim=-2, end_dim=-1)
 
-        return x # (B, features[-1])
+        return x  # (B, features[-1])

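The only functional lines visible in this hunk are the concatenation and the flatten. A toy shape walk-through of just those two calls (the tensor sizes and the names xc, xr are illustrative assumptions, not values from the repo):

import torch

B, L, F = 2, 4, 64          # toy batch size, sequence length, feature width
xc = torch.randn(B, L, F)   # stands in for the projected CLIP features
xr = torch.randn(B, L, F)   # stands in for the projected RoBERTa features

x = torch.cat((xc, xr), dim=0)                  # (2B, L, F)  -> torch.Size([4, 4, 64])
x = torch.flatten(x, start_dim=-2, end_dim=-1)  # (2B, L*F)   -> torch.Size([4, 256])
print(x.shape)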
Diff for: models/lmm_model.py (+34 -6)

@@ -1,7 +1,35 @@
+import torch
+import torch.nn as nn
 
-class Lmm_model(nn.Module):
-    def __init__(self,) -> None:
-        super(Lmm_model, self).__init__()
-
-    def forward(self,):
-        pass
+class LLM_model(nn.Module):
+
+    def __init__(self,
+                 encoder,
+                 decoder1,
+                 decoder2,
+                 adaptor,
+                 dim,
+                 image_length,
+                 max_len,
+                 num_classes,
+                 ):
+        super(LLM_model, self).__init__()
+        self.enc = encoder
+        self.adaptor = adaptor
+        self.dec1 = decoder1
+        self.dec2 = decoder2
+        self.proj1 = nn.Linear(dim, 768)
+        self.proj2 = nn.Linear(image_length, max_len)
+        self.clf1 = nn.Linear(64, num_classes)
+        self.clf2 = nn.Linear(max_len, num_classes)
+
+    def forward(self, imgs, ids, attns):
+        encoded_imgs = self.enc(imgs)  # (B, L=w*h, dim)
+        last_hidden_roberta = self.dec1(ids, attns)  # (B, max_len, 768)
+        output = self.adaptor(encoded_imgs,
+                              last_hidden_roberta)  # (B, features[-1])
+        # classifier
+        output = self.clf1(output)
+        output = self.clf2(output.permute(0,2,1)).permute(0,2,1)  # (B, num_classes, num_classes)
+
+        return output

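A construction-only sketch of the new class; the sub-modules are stubbed with nn.Identity() and every size below is an assumed example value, not taken from the repo's config:

import torch.nn as nn
from models.lmm_model import LLM_model

model = LLM_model(
    encoder=nn.Identity(),    # real code passes the CLIP encoder
    decoder1=nn.Identity(),   # RoBERTa wrapper
    decoder2=nn.Identity(),   # LLaMA-2 wrapper
    adaptor=nn.Identity(),    # models.adaptor.Adaptor instance
    dim=512,                  # assumed encoder width
    image_length=49,          # assumed number of image patches
    max_len=100,              # assumed max token length
    num_classes=11,           # assumed label count
)
print(sum(p.numel() for p in model.parameters()))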
Diff for: run.py (+7)

@@ -18,6 +18,7 @@
 from models.roberta import RobertaEncoder
 from models.llama2 import Llama2Decoder
 from models.model import ClevrMath_model
+from models.adaptor import Adaptor
 from src.training import train
 from src.testing import evaluate
 
@@ -86,9 +87,13 @@ def define_model(max_len):
     if decoder2 == "llama2":
         DEC2 = Llama2Decoder()
 
+    ADA = Adaptor(cfg.training.adaptor.in_dim,
+                  cfg.training.adaptor.features)
+
     model = ClevrMath_model(ENC,
                             DEC1,
                             DEC2,
+                            ADA,
                             dim,
                             image_length,
                             max_len,
@@ -191,6 +196,7 @@ def train_model(rank=None):
         criterion,
         cfg.training.general.clip,
         device,
+        clip_enc=cfg.training.model_type.clip_enc,
         ddp=cfg.general.ddp,
         rank=rank,
     )
@@ -201,6 +207,7 @@ def train_model(rank=None):
         val_dataloader,
         criterion,
         device,
+        clip_enc=cfg.training.model_type.clip_enc,
     )
 
     end_time = time.time()

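Note how the new arguments are threaded through: both clip_enc here and lmm_enc in src/testing.py are keyword arguments with False defaults, so call sites that predate this commit keep their old behaviour while run.py opts in from config. A toy illustration of that pattern (the function body below is illustrative, not the repo's code):

def train_step(batch, clip_enc=False):
    # Pre-existing callers never pass clip_enc and fall through to the
    # original tensor-loading behaviour; run.py now opts in explicitly.
    source = "raw PNG paths" if clip_enc else "pre-saved .pt tensors"
    return f"images read from {source}"

print(train_step(batch=None))                  # legacy call site
print(train_step(batch=None, clip_enc=True))   # new call site, as in run.py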
Diff for: src/testing.py (+1)

@@ -8,6 +8,7 @@ def evaluate(
     test_dataloader,
     criterion,
     device,
+    lmm_enc=False,
     is_test=False,
 ):
     model.eval()

Diff for: src/training.py (+15 -6)

@@ -10,6 +10,7 @@ def train(
     criterion,
     clip,
     device,
+    clip_enc=False,
     ddp=False,
     rank=None,
 ):
@@ -29,14 +30,22 @@ def train(
         ids = ids.to(device)
         attns = attns.to(device)
         labels = labels.to(device, dtype=torch.long)
-
-        _imgs = list()
-        for im in imgs:
-            tnsr = torch.load(f"{data_path}/image_tensors/{int(im.item())}.pt")
-            _imgs.append(tnsr)
+
+        if not clip_enc:
+            _imgs = list()
+            for im in imgs:
+                tnsr = torch.load(f"{data_path}/image_tensors/{int(im.item())}.pt")
+                _imgs.append(tnsr)
 
+            imgs = torch.stack(_imgs).to(device)
 
-        imgs = torch.stack(_imgs).to(device)
+        else:
+            _imgs = list()
+            for im in imgs:
+                _i = f"{data_path}/images/{int(im.item())}.png"
+                _imgs.append(_i)
+
+            imgs = torch.stack(_imgs).to(device)
 
         # setting gradients to zero
         optimizer.zero_grad()

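The new branch can be read as a small standalone helper: with clip_enc off, pre-computed image tensors are loaded and stacked as before; with it on, raw PNG paths are collected instead. How those paths are consumed downstream is not shown in this commit, so the helper below returns them unstacked; that detail is an assumption:

import torch

def gather_images(imgs, data_path, clip_enc=False, device="cpu"):
    """Sketch of the image-selection branch added to src/training.py.

    imgs is the batch of integer image ids coming out of the dataloader.
    """
    if not clip_enc:
        # Original behaviour: load pre-saved tensors and stack into a batch.
        tensors = [
            torch.load(f"{data_path}/image_tensors/{int(im.item())}.pt")
            for im in imgs
        ]
        return torch.stack(tensors).to(device)

    # clip_enc=True: hand back the PNG file paths; the CLIP preprocessor is
    # assumed to load them later, so nothing is stacked here.
    return [f"{data_path}/images/{int(im.item())}.png" for im in imgs]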