TaehyunKimMotif committed
Commit 12eff43 · verified · Parent: c8c4f49

add attn_rms_norm_eps (#8)


- use specific rms_norm_eps for attn layer (d417ea779138edf602457609bc43ecb399d188ff)

Files changed (3)
  1. config.json +2 -1
  2. configuration_motif.py +6 -2
  3. modeling_motif.py +1 -1
config.json CHANGED
@@ -22,6 +22,7 @@
   "num_hidden_layers": 32,
   "num_key_value_heads": 16,
   "rms_norm_eps": 1e-06,
+  "attn_rms_norm_eps": 1e-05,
   "rope_scaling": null,
   "rope_theta": 500000.0,
   "sliding_window": null,
@@ -32,4 +33,4 @@
   "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 219520
-}
+}
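With the new key in place, a checkpoint carrying this config.json exposes both epsilons on the loaded configuration object. A minimal sketch of checking them through transformers; the checkpoint path is a placeholder, and trust_remote_code=True is required because MotifConfig is defined in this repository rather than in transformers itself:

    from transformers import AutoConfig

    # Placeholder path: point at a local clone or hub repo containing this commit.
    config = AutoConfig.from_pretrained("path/to/motif-checkpoint", trust_remote_code=True)

    print(config.rms_norm_eps)       # 1e-06, used by the other RMSNorm layers
    print(config.attn_rms_norm_eps)  # 1e-05, used by the attention sub-layer norm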
configuration_motif.py CHANGED
@@ -42,7 +42,9 @@ class MotifConfig(PretrainedConfig):
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
+            The epsilon used by the rms normalization layers, except for the rms normalization in the attention layer.
+        attn_rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization in the attention layer.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -120,6 +122,7 @@
         max_position_embeddings=32768,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
+        attn_rms_norm_eps=1e-5,
         use_cache=True,
         tie_word_embeddings=False,
         rope_theta=10000.0,
@@ -149,6 +152,7 @@
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
+        self.attn_rms_norm_eps = attn_rms_norm_eps
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
@@ -164,4 +168,4 @@
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
-        logger.info(f' kwargs : {kwargs}')
+        logger.info(f' kwargs : {kwargs}')
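For code that constructs the configuration programmatically, the new argument flows through __init__ exactly like rms_norm_eps. A rough usage sketch, with defaults taken from the diff above:

    from configuration_motif import MotifConfig  # module shipped in this repository

    cfg = MotifConfig()
    assert cfg.rms_norm_eps == 1e-6        # default unchanged
    assert cfg.attn_rms_norm_eps == 1e-5   # new default added by this commit

    # A config.json written before this commit simply lacks the key, so loading it
    # picks up the 1e-5 default for the attention norm; an explicit override also works.
    cfg = MotifConfig(attn_rms_norm_eps=1e-6)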
modeling_motif.py CHANGED
@@ -362,7 +362,7 @@ class MotifAttention(nn.Module):
             setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
             getattr(self, name).data.normal_(mean=0.0, std=0.1)
 
-        self.subln = MotifRMSNorm(2 * self.head_dim, eps=config.rms_norm_eps)
+        self.subln = MotifRMSNorm(2 * self.head_dim, eps=config.attn_rms_norm_eps)
         self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
 
         self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
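The only behavioural change is which epsilon feeds the attention sub-layer norm (self.subln). MotifRMSNorm itself is not part of this diff; assuming it follows the standard RMSNorm recipe, the epsilon sits inside the reciprocal square root, so the larger 1e-05 value gives a bit more headroom against near-zero variance in that norm. An illustrative stand-in, not the repository's actual implementation:

    import torch
    from torch import nn

    class RMSNormSketch(nn.Module):
        # Hypothetical stand-in for MotifRMSNorm, shown only to locate where eps enters.
        def __init__(self, hidden_size: int, eps: float = 1e-6):
            super().__init__()
            self.weight = nn.Parameter(torch.ones(hidden_size))
            self.variance_epsilon = eps  # this is where attn_rms_norm_eps lands

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            variance = x.pow(2).mean(-1, keepdim=True)
            x = x * torch.rsqrt(variance + self.variance_epsilon)
            return self.weight * x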