qninhdt
/

mini-agent

Model card Files Files and versions Community

qninhdt committed on Dec 3, 2024

Commit

3f792e3

1 Parent(s): 8db8077

cc

Browse files

Files changed (13) hide show

configs/experiment/miniagent-bert-attn-m8.yaml +4 -4
configs/experiment/{miniagent-bert-attn.yaml → miniagent-bert-attn-v1.yaml} +7 -7
configs/experiment/miniagent-bert-attn-v2.yaml +36 -0
configs/experiment/miniagent-bert-mlp-abs_diff-mult.yaml +2 -2
configs/experiment/miniagent-bert-mlp-abs_diff.yaml +2 -2
configs/experiment/miniagent-bert-mlp-mult.yaml +2 -2
configs/experiment/miniagent-bert-mlp.yaml +2 -2
configs/logger/wandb.yaml +1 -1
configs/trainer/default.yaml +3 -3
src/models/{attn_module.py → attn_v1_module.py} +34 -68
src/models/attn_v2_module.py +124 -0
src/models/miniagent_module.py +1 -0
src/train.py +2 -0

configs/experiment/miniagent-bert-attn-m8.yaml CHANGED Viewed

@@ -15,22 +15,22 @@ model:
   inst_proj_model:
     _target_: src.models.attn_module.AttnProjection
     input_dim: 768
-    n_heads: 1
     output_length: 8
   tool_proj_model:
     _target_: src.models.attn_module.AttnProjection
     input_dim: 768
-    n_heads: 1
     output_length: 8
   pred_model:
     _target_: src.models.attn_module.BiAttnPrediction
     input_dim: 768
-    n_heads: 1
 data:
   bert_model: bert-base-uncased
   seed: 42
-  batch_size: 128
   tool_capacity: 16

   inst_proj_model:
     _target_: src.models.attn_module.AttnProjection
     input_dim: 768
+    n_heads: 4
     output_length: 8
   tool_proj_model:
     _target_: src.models.attn_module.AttnProjection
     input_dim: 768
+    n_heads: 4
     output_length: 8
   pred_model:
     _target_: src.models.attn_module.BiAttnPrediction
     input_dim: 768
+    n_heads: 4
 data:
   bert_model: bert-base-uncased
   seed: 42
+  batch_size: 64
   tool_capacity: 16

configs/experiment/{miniagent-bert-attn.yaml → miniagent-bert-attn-v1.yaml} RENAMED Viewed

@@ -13,24 +13,24 @@ model:
   bert_model: bert-base-uncased
   inst_proj_model:
-    _target_: src.models.attn_module.AttnProjection
     input_dim: 768
-    n_heads: 1
     output_length: 16
   tool_proj_model:
-    _target_: src.models.attn_module.AttnProjection
     input_dim: 768
-    n_heads: 1
     output_length: 16
   pred_model:
-    _target_: src.models.attn_module.BiAttnPrediction
     input_dim: 768
-    n_heads: 1
 data:
   bert_model: bert-base-uncased
   seed: 42
-  batch_size: 128
   tool_capacity: 16

   bert_model: bert-base-uncased
   inst_proj_model:
+    _target_: src.models.attn_v1_module.AttnProjection
     input_dim: 768
+    n_heads: 4
     output_length: 16
   tool_proj_model:
+    _target_: src.models.attn_v1_module.AttnProjection
     input_dim: 768
+    n_heads: 4
     output_length: 16
   pred_model:
+    _target_: src.models.attn_v1_module.BiAttnPrediction
     input_dim: 768
+    n_heads: 4
 data:
   bert_model: bert-base-uncased
   seed: 42
+  batch_size: 64
   tool_capacity: 16

configs/experiment/miniagent-bert-attn-v2.yaml ADDED Viewed

	@@ -0,0 +1,36 @@

+# @package _global_
+defaults:
+  - override /data: mixed
+  - override /model: miniagent
+  - override /callbacks: default
+  - override /trainer: gpu
+seed: 42
+model:
+  lr: 1e-5
+  bert_model: bert-base-uncased
+  inst_proj_model:
+    _target_: src.models.attn_v2_module.AttnProjection
+    input_dim: 768
+    n_heads: 4
+    output_length: 16
+  tool_proj_model:
+    _target_: src.models.attn_v2_module.AttnProjection
+    input_dim: 768
+    n_heads: 4
+    output_length: 16
+  pred_model:
+    _target_: src.models.attn_v2_module.BiAttnPrediction
+    input_dim: 768
+    n_heads: 4
+data:
+  bert_model: bert-base-uncased
+  seed: 42
+  batch_size: 64
+  tool_capacity: 16

configs/experiment/miniagent-bert-mlp-abs_diff-mult.yaml CHANGED Viewed

@@ -9,7 +9,7 @@ defaults:
 seed: 42
 model:
-  lr: 0.001
   bert_model: bert-base-uncased
   inst_proj_model:
@@ -33,5 +33,5 @@ model:
 data:
   bert_model: bert-base-uncased
   seed: 42
-  batch_size: 128
   tool_capacity: 16

 seed: 42
 model:
+  lr: 0.0001
   bert_model: bert-base-uncased
   inst_proj_model:
 data:
   bert_model: bert-base-uncased
   seed: 42
+  batch_size: 64
   tool_capacity: 16

configs/experiment/miniagent-bert-mlp-abs_diff.yaml CHANGED Viewed

@@ -9,7 +9,7 @@ defaults:
 seed: 42
 model:
-  lr: 0.001
   bert_model: bert-base-uncased
   inst_proj_model:
@@ -33,5 +33,5 @@ model:
 data:
   bert_model: bert-base-uncased
   seed: 42
-  batch_size: 128
   tool_capacity: 16

 seed: 42
 model:
+  lr: 0.0001
   bert_model: bert-base-uncased
   inst_proj_model:
 data:
   bert_model: bert-base-uncased
   seed: 42
+  batch_size: 64
   tool_capacity: 16

configs/experiment/miniagent-bert-mlp-mult.yaml CHANGED Viewed

@@ -9,7 +9,7 @@ defaults:
 seed: 42
 model:
-  lr: 0.001
   bert_model: bert-base-uncased
   inst_proj_model:
@@ -33,5 +33,5 @@ model:
 data:
   bert_model: bert-base-uncased
   seed: 42
-  batch_size: 128
   tool_capacity: 16

 seed: 42
 model:
+  lr: 0.0001
   bert_model: bert-base-uncased
   inst_proj_model:
 data:
   bert_model: bert-base-uncased
   seed: 42
+  batch_size: 64
   tool_capacity: 16

configs/experiment/miniagent-bert-mlp.yaml CHANGED Viewed

@@ -9,7 +9,7 @@ defaults:
 seed: 42
 model:
-  lr: 0.001
   bert_model: bert-base-uncased
   inst_proj_model:
@@ -33,5 +33,5 @@ model:
 data:
   bert_model: bert-base-uncased
   seed: 42
-  batch_size: 128
   tool_capacity: 16

 seed: 42
 model:
+  lr: 0.0001
   bert_model: bert-base-uncased
   inst_proj_model:
 data:
   bert_model: bert-base-uncased
   seed: 42
+  batch_size: 64
   tool_capacity: 16

configs/logger/wandb.yaml CHANGED Viewed

@@ -13,4 +13,4 @@ wandb:
   # entity: "" # set to name of your wandb team
   group: ""
   tags: []
-  job_type: ""

   # entity: "" # set to name of your wandb team
   group: ""
   tags: []
+  job_type: ""

configs/trainer/default.yaml CHANGED Viewed

@@ -3,7 +3,7 @@ _target_: lightning.pytorch.trainer.Trainer
 default_root_dir: ${paths.output_dir}
 min_epochs: 1 # prevents early stopping
-max_epochs: 50
 accelerator: cpu
 devices: 1
@@ -11,11 +11,11 @@ devices: 1
 log_every_n_steps: 10
 # mixed precision for extra speed-up
-# precision: 16
 # perform a validation loop every N training epochs
 check_val_every_n_epoch: 1
 # set True to ensure deterministic results
 # makes training slower but gives more reproducibility than just setting seeds
-deterministic: True

 default_root_dir: ${paths.output_dir}
 min_epochs: 1 # prevents early stopping
+max_epochs: 201
 accelerator: cpu
 devices: 1
 log_every_n_steps: 10
 # mixed precision for extra speed-up
+precision: 16-mixed
 # perform a validation loop every N training epochs
 check_val_every_n_epoch: 1
 # set True to ensure deterministic results
 # makes training slower but gives more reproducibility than just setting seeds
+deterministic: False

src/models/{attn_module.py → attn_v1_module.py} RENAMED Viewed

@@ -10,41 +10,31 @@ class AttnProjection(nn.Module):
         self.query = nn.Parameter(torch.randn(output_length, input_dim))
-        self.attn = nn.MultiheadAttention(input_dim, n_heads, batch_first=True)
         self.norm1 = nn.LayerNorm(input_dim)
-        self.dropout1 = nn.Dropout(0.5)
-        # self.self_attn = nn.MultiheadAttention(input_dim, n_heads, batch_first=True)
-        # self.norm2 = nn.LayerNorm(input_dim)
-        # self.dropout2 = nn.Dropout(0.5)
-        # self.ff = nn.Sequential(
-        #     nn.Linear(input_dim, input_dim * 4),
-        #     nn.SiLU(),
-        #     nn.Dropout(0.5),
-        #     nn.Linear(input_dim * 4, input_dim),
-        # )
-        # self.norm3 = nn.LayerNorm(input_dim)
-        # self.dropout3 = nn.Dropout(0.5)
-        nn.init.xavier_uniform_(self.query)
     def forward(self, x):
         B = x.shape[0]
         query = self.query.unsqueeze(0).repeat(B, 1, 1)
-        z = self.attn(query, x, x)[0]
-        z = self.norm1(z)
-        z = self.dropout1(z)
-        # z = self.self_attn(z, z, z)[0] + z
-        # z = self.norm2(z)
-        # z = self.dropout2(z)
-        # z = self.ff(z) + z
-        # z = self.norm3(z)
-        # z = self.dropout3(z)
         z = z.contiguous().view(B, -1)
@@ -58,43 +48,28 @@ class BiAttnPrediction(nn.Module):
         self.input_dim = input_dim
-        self.attn1 = nn.MultiheadAttention(input_dim, n_heads, batch_first=True)
         self.norm1 = nn.LayerNorm(input_dim)
-        self.dropout1 = nn.Dropout(0.5)
-        self.attn2 = nn.MultiheadAttention(input_dim, n_heads, batch_first=True)
         self.norm2 = nn.LayerNorm(input_dim)
-        self.dropout2 = nn.Dropout(0.5)
-        # self.ff1 = nn.Sequential(
-        #     nn.Linear(input_dim, input_dim * 4),
-        #     nn.SiLU(),
-        #     nn.Dropout(0.5),
-        #     nn.Linear(input_dim * 4, input_dim),
-        # )
-        # self.norm_ff1 = nn.LayerNorm(input_dim)
-        # self.dropout_ff1 = nn.Dropout(0.5)
-        # self.ff2 = nn.Sequential(
-        #     nn.Linear(input_dim, input_dim * 4),
-        #     nn.SiLU(),
-        #     nn.Dropout(0.5),
-        #     nn.Linear(input_dim * 4, input_dim),
-        # )
-        # self.norm_ff2 = nn.LayerNorm(input_dim)
-        # self.dropout_ff2 = nn.Dropout(0.5)
         self.mlp = nn.Sequential(
-            nn.Linear(input_dim * 2, 1024),
             nn.SiLU(),
-            nn.Dropout(0.5),
             nn.Linear(1024, 512),
             nn.SiLU(),
-            nn.Dropout(0.5),
             nn.Linear(512, 256),
             nn.SiLU(),
-            nn.Dropout(0.5),
             nn.Linear(256, 1),
         )
@@ -103,28 +78,19 @@ class BiAttnPrediction(nn.Module):
         x1 = x1.view(B, -1, self.input_dim)  # [B, M x D] -> [B, M, D]
         x2 = x2.view(B, -1, self.input_dim)  # [B, M x D] -> [B, M, D]
-        z1 = self.attn1(x2, x1, x1)[0] + x1
-        z1 = self.norm1(z1)
-        z1 = self.dropout1(z1)
-        z2 = self.attn2(x1, x2, x2)[0] + x2
-        z2 = self.norm2(z2)
-        z2 = self.dropout2(z2)
-        # z1 = self.ff1(z1) + z1
-        # z1 = self.norm_ff1(z1)
-        # z1 = self.dropout_ff1(z1)
-        # z2 = self.ff2(z2) + z2
-        # z2 = self.norm_ff2(z2)
-        # z2 = self.dropout_ff2(z2)
-        # z1 = torch.cat([z1.mean(dim=1), z1.max(dim=1).values], dim=1)  # [B, D * 2]
-        # z2 = torch.cat([z2.mean(dim=1), z2.max(dim=1).values], dim=1)  # [B, D * 2]
         z1 = z1.mean(dim=1)
         z2 = z2.mean(dim=1)
-        z = torch.cat([z1, z2], dim=1)  # [B, D * 4]
         z = self.mlp(z)

         self.query = nn.Parameter(torch.randn(output_length, input_dim))
+        self.attn = nn.MultiheadAttention(
+            input_dim, n_heads, dropout=0.2, batch_first=True
+        )
         self.norm1 = nn.LayerNorm(input_dim)
+        self.self_attn = nn.MultiheadAttention(
+            input_dim, n_heads, dropout=0.2, batch_first=True
+        )
+        self.norm2 = nn.LayerNorm(input_dim)
+        self.dropout = nn.Dropout(0.2)
+        nn.init.xavier_normal_(self.query)
     def forward(self, x):
         B = x.shape[0]
         query = self.query.unsqueeze(0).repeat(B, 1, 1)
+        z = self.norm1(x)
+        z_attn = self.attn(query, z, z)[0]
+        z = z_attn
+        z = self.norm2(z)
+        z_attn = self.self_attn(z, z, z)[0]
+        z = z + self.dropout(z_attn)
         z = z.contiguous().view(B, -1)
         self.input_dim = input_dim
+        self.attn1 = nn.MultiheadAttention(
+            input_dim, n_heads, dropout=0.2, batch_first=True
+        )
         self.norm1 = nn.LayerNorm(input_dim)
+        self.dropout1 = nn.Dropout(0.2)
+        self.attn2 = nn.MultiheadAttention(
+            input_dim, n_heads, dropout=0.2, batch_first=True
+        )
         self.norm2 = nn.LayerNorm(input_dim)
+        self.dropout2 = nn.Dropout(0.2)
         self.mlp = nn.Sequential(
+            nn.Linear(input_dim * 3, 1024),
             nn.SiLU(),
+            nn.Dropout(0.2),
             nn.Linear(1024, 512),
             nn.SiLU(),
+            nn.Dropout(0.2),
             nn.Linear(512, 256),
             nn.SiLU(),
+            nn.Dropout(0.2),
             nn.Linear(256, 1),
         )
         x1 = x1.view(B, -1, self.input_dim)  # [B, M x D] -> [B, M, D]
         x2 = x2.view(B, -1, self.input_dim)  # [B, M x D] -> [B, M, D]
+        x1 = self.norm1(x1)
+        x2 = self.norm2(x2)
+        z1_attn = self.attn1(x2, x1, x1)[0]
+        z1 = x1 + self.dropout1(z1_attn)
+        z2_attn = self.attn2(x1, x2, x2)[0]
+        z2 = x2 + self.dropout2(z2_attn)
         z1 = z1.mean(dim=1)
         z2 = z2.mean(dim=1)
+        z = torch.cat([z1, z2, torch.abs(z1 - z2)], dim=1)  # [B, D * 3]
         z = self.mlp(z)

src/models/attn_v2_module.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class AttnProjection(nn.Module):
+    def __init__(self, input_dim, n_heads, output_length):
+        super().__init__()
+        self.query = nn.Parameter(torch.randn(output_length, input_dim))
+        self.attn = nn.MultiheadAttention(
+            input_dim, n_heads, dropout=0.2, batch_first=True
+        )
+        self.norm1 = nn.LayerNorm(input_dim)
+        self.dropout1 = nn.Dropout(0.2)
+        self.self_attn = nn.MultiheadAttention(
+            input_dim, n_heads, dropout=0.2, batch_first=True
+        )
+        self.norm2 = nn.LayerNorm(input_dim)
+        self.dropout2 = nn.Dropout(0.2)
+        self.cls_mlp = nn.Sequential(
+            nn.Linear(input_dim, input_dim), nn.SiLU(), nn.Dropout(0.2)
+        )
+        self.norm3 = nn.LayerNorm(input_dim)
+        nn.init.xavier_normal_(self.query)
+    def forward(self, x):
+        B = x.shape[0]
+        query = self.query.unsqueeze(0).repeat(B, 1, 1)
+        x_cls = x[:, 0, :]
+        x_other = x[:, 1:, :]
+        z_other = self.norm1(x_other)
+        z_attn = self.attn(query, z_other, z_other)[0]
+        z_other = self.dropout1(z_attn)
+        z_other = self.norm2(z_other)
+        z_attn = self.self_attn(z_other, z_other, z_other)[0]
+        z_other = z_other + self.dropout1(z_attn)
+        z_cls = x_cls + self.cls_mlp(self.norm3(x_cls))
+        z = torch.cat([z_cls.unsqueeze(1), z_other], dim=1)
+        z = z.contiguous().view(B, -1)
+        return z
+class BiAttnPrediction(nn.Module):
+    def __init__(self, input_dim, n_heads):
+        super().__init__()
+        self.input_dim = input_dim
+        self.attn1 = nn.MultiheadAttention(
+            input_dim, n_heads, dropout=0.2, batch_first=True
+        )
+        self.norm1 = nn.LayerNorm(input_dim)
+        self.dropout1 = nn.Dropout(0.2)
+        self.attn2 = nn.MultiheadAttention(
+            input_dim, n_heads, dropout=0.2, batch_first=True
+        )
+        self.norm2 = nn.LayerNorm(input_dim)
+        self.dropout2 = nn.Dropout(0.2)
+        self.mlp = nn.Sequential(
+            nn.Linear(input_dim * 6, 1024),
+            nn.SiLU(),
+            nn.Dropout(0.2),
+            nn.Linear(1024, 512),
+            nn.SiLU(),
+            nn.Dropout(0.2),
+            nn.Linear(512, 256),
+            nn.SiLU(),
+            nn.Dropout(0.2),
+            nn.Linear(256, 1),
+        )
+        self.norm3 = nn.LayerNorm(input_dim)
+    def forward(self, x1, x2):
+        B = x1.shape[0]
+        x1 = x1.view(B, -1, self.input_dim)  # [B, M x D] -> [B, M, D]
+        x2 = x2.view(B, -1, self.input_dim)  # [B, M x D] -> [B, M, D]
+        z1_cls = x1[:, 0, :]
+        z2_cls = x2[:, 0, :]
+        x1_other = self.norm1(x1[:, 1:, :])
+        x2_other = self.norm2(x2[:, 1:, :])
+        z1_attn = self.attn1(x2_other, x1_other, x1_other)[0]
+        z1_other = x1_other + self.dropout1(z1_attn)
+        z2_attn = self.attn2(x1_other, x2_other, x2_other)[0]
+        z2_other = x2_other + self.dropout2(z2_attn)
+        z1_other = z1_other.mean(dim=1)
+        z2_other = z2_other.mean(dim=1)
+        z = torch.cat(
+            [
+                z1_cls,
+                z1_other,
+                z2_cls,
+                z2_other,
+                torch.abs(z1_cls - z2_cls),
+                torch.abs(z1_other - z2_other),
+            ],
+            dim=1,
+        )  # [B, D * 6]
+        z = self.mlp(z)
+        return z

src/models/miniagent_module.py CHANGED Viewed

@@ -76,6 +76,7 @@ class MiniAgentModule(LightningModule):
         target = torch.eye(B, device=pred.device).float()
         pos_weight = torch.tensor([B - 1], device=pred.device)
         loss = F.binary_cross_entropy_with_logits(pred, target, pos_weight=pos_weight)
         self.log("train/loss", loss, on_step=True, sync_dist=True, prog_bar=True)

         target = torch.eye(B, device=pred.device).float()
         pos_weight = torch.tensor([B - 1], device=pred.device)
+        # pos_weight = torch.tensor([1], device=pred.device)
         loss = F.binary_cross_entropy_with_logits(pred, target, pos_weight=pos_weight)
         self.log("train/loss", loss, on_step=True, sync_dist=True, prog_bar=True)

src/train.py CHANGED Viewed

@@ -73,6 +73,8 @@ def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         cfg.trainer, callbacks=callbacks, logger=logger
     )
     object_dict = {
         "cfg": cfg,
         "datamodule": datamodule,

         cfg.trainer, callbacks=callbacks, logger=logger
     )
+    trainer.fit_loop.max_epochs = 150
     object_dict = {
         "cfg": cfg,
         "datamodule": datamodule,