add attn_rms_norm_eps (#8)
- use specific rms_norm_eps for attn layer (d417ea779138edf602457609bc43ecb399d188ff)
- config.json +2 -1
- configuration_motif.py +6 -2
- modeling_motif.py +1 -1
config.json

```diff
@@ -22,6 +22,7 @@
     "num_hidden_layers": 32,
     "num_key_value_heads": 16,
     "rms_norm_eps": 1e-06,
+    "attn_rms_norm_eps": 1e-05,
     "rope_scaling": null,
     "rope_theta": 500000.0,
     "sliding_window": null,
@@ -32,4 +33,4 @@
     "use_cache": true,
     "use_sliding_window": false,
     "vocab_size": 219520
-}
+}
```
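With the updated files in place, a loaded config exposes both epsilon values. A minimal sketch, assuming the checkpoint directory contains this commit's `config.json` and `configuration_motif.py` (the path below is a placeholder):

```python
from transformers import AutoConfig

# Placeholder path; the directory is assumed to hold the updated config.json and
# the custom configuration_motif.py, so trust_remote_code is required.
config = AutoConfig.from_pretrained("path/to/motif-checkpoint", trust_remote_code=True)

print(config.rms_norm_eps)       # 1e-06, used by the RMS norms outside attention
print(config.attn_rms_norm_eps)  # 1e-05, used by the RMS norm inside the attention layer
```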
configuration_motif.py

```diff
@@ -42,7 +42,9 @@ class MotifConfig(PretrainedConfig):
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
+            The epsilon used by the rms normalization layers, except for the rms normalization in the attention layer.
+        attn_rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization in the attention layer.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -120,6 +122,7 @@ class MotifConfig(PretrainedConfig):
         max_position_embeddings=32768,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
+        attn_rms_norm_eps=1e-5,
         use_cache=True,
         tie_word_embeddings=False,
         rope_theta=10000.0,
@@ -149,6 +152,7 @@ class MotifConfig(PretrainedConfig):
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
+        self.attn_rms_norm_eps = attn_rms_norm_eps
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
@@ -164,4 +168,4 @@ class MotifConfig(PretrainedConfig):
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
-        logger.info(f' kwargs : {kwargs}')
+        logger.info(f' kwargs : {kwargs}')
```
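The new keyword argument can also be passed when constructing the config directly. A sketch, assuming `configuration_motif.py` is importable from the working directory and using illustrative override values:

```python
from configuration_motif import MotifConfig

# Illustrative overrides; the defaults after this commit are
# rms_norm_eps=1e-6 and attn_rms_norm_eps=1e-5.
config = MotifConfig(rms_norm_eps=1e-6, attn_rms_norm_eps=1e-5)

# Stored by the new self.attn_rms_norm_eps assignment in __init__.
assert config.attn_rms_norm_eps == 1e-5
```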
modeling_motif.py

```diff
@@ -362,7 +362,7 @@ class MotifAttention(nn.Module):
             setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
             getattr(self, name).data.normal_(mean=0.0, std=0.1)

-        self.subln = MotifRMSNorm(2 * self.head_dim, eps=config.rms_norm_eps)
+        self.subln = MotifRMSNorm(2 * self.head_dim, eps=config.attn_rms_norm_eps)
         self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))

         self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
```
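For context on where the epsilon enters: RMS normalization divides activations by the root-mean-square over the hidden dimension, with a small `eps` added inside the square root for numerical stability. A minimal sketch of a standard RMSNorm, assuming `MotifRMSNorm` follows the usual transformers-style formulation (the repo's implementation may differ in detail):

```python
import torch
import torch.nn as nn

class RMSNormSketch(nn.Module):
    """Standard RMSNorm; shows how `eps` keeps the denominator away from zero."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        variance = x.pow(2).mean(-1, keepdim=True)
        return self.weight * x * torch.rsqrt(variance + self.eps)

# With this commit, the attention sublayer norm (self.subln) is constructed with
# eps=config.attn_rms_norm_eps (1e-05) rather than config.rms_norm_eps (1e-06).
```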