Upload folder using huggingface_hub
- attention.py +25 -18
- blocks.py +11 -17
- config.json +6 -2
- configuration_mpt.py +21 -4
- fc.py +7 -0
- ffn.py +40 -0
- modeling_mpt.py +13 -10
- norm.py +2 -1
- param_init_fns.py +19 -1
- pytorch_model-00001-of-00013.bin +3 -0
- pytorch_model-00002-of-00013.bin +3 -0
- pytorch_model-00003-of-00013.bin +3 -0
- pytorch_model-00004-of-00013.bin +3 -0
- pytorch_model-00005-of-00013.bin +3 -0
- pytorch_model-00006-of-00013.bin +3 -0
- pytorch_model-00007-of-00013.bin +3 -0
- pytorch_model-00008-of-00013.bin +3 -0
- pytorch_model-00009-of-00013.bin +3 -0
- pytorch_model-00010-of-00013.bin +3 -0
- pytorch_model-00011-of-00013.bin +3 -0
- pytorch_model-00012-of-00013.bin +3 -0
- pytorch_model-00013-of-00013.bin +3 -0
- pytorch_model.bin.index.json +292 -291
attention.py
CHANGED
@@ -7,7 +7,8 @@ import torch.nn as nn
 from einops import rearrange
 from packaging import version
 from torch import nn
-from .
+from .fc import FC_CLASS_REGISTRY
+from .norm import NORM_CLASS_REGISTRY
 
 def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
     if original_is_causal and num_query_tokens != num_key_tokens:
@@ -46,7 +47,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_
         attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
     if is_causal and (not q.size(2) == 1):
         s = max(s_q, s_k)
-        causal_mask = attn_weight.new_ones(s, s, dtype=torch.
+        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)
         causal_mask = causal_mask.tril()
         causal_mask = causal_mask.to(torch.bool)
         causal_mask = ~causal_mask
@@ -141,8 +142,8 @@ def triton_flash_attn_fn(query, key, value, n_heads, past_key_value=None, softma
     key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
     value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
     if multiquery:
-        key = key.
-        value = value.
+        key = key.repeat(1, 1, n_heads, 1)
+        value = value.repeat(1, 1, n_heads, 1)
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
     attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
     output = attn_output.view(*attn_output.shape[:2], -1)
@@ -155,7 +156,7 @@ class MultiheadAttention(nn.Module):
     additive bias.
     """
 
-    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm:
+    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', verbose: int=0, device: Optional[str]=None):
         super().__init__()
         self.attn_impl = attn_impl
         self.clip_qkv = clip_qkv
@@ -166,13 +167,16 @@ class MultiheadAttention(nn.Module):
         if self.softmax_scale is None:
             self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
         self.attn_dropout_p = attn_pdrop
-
+        fc_kwargs = {}
+        if fc_type != 'te':
+            fc_kwargs['device'] = device
+        self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, 3 * self.d_model, **fc_kwargs)
         fuse_splits = (d_model, 2 * d_model)
         self.Wqkv._fused = (0, fuse_splits)
         if self.qk_ln:
-
-            self.q_ln =
-            self.k_ln =
+            norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+            self.q_ln = norm_class(self.d_model, device=device)
+            self.k_ln = norm_class(self.d_model, device=device)
         if self.attn_impl == 'flash':
             self.attn_fn = flash_attn_fn
         elif self.attn_impl == 'triton':
@@ -185,13 +189,13 @@ class MultiheadAttention(nn.Module):
             warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
         else:
             raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
-        self.out_proj =
+        self.out_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
         self.out_proj._is_residual = True
 
     def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
         qkv = self.Wqkv(x)
         if self.clip_qkv:
-            qkv.
+            qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
         (query, key, value) = qkv.chunk(3, dim=2)
         key_padding_mask = attention_mask
         if self.qk_ln:
@@ -208,7 +212,7 @@ class MultiQueryAttention(nn.Module):
     additive bias.
     """
 
-    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm:
+    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', verbose: int=0, device: Optional[str]=None):
         super().__init__()
         self.attn_impl = attn_impl
         self.clip_qkv = clip_qkv
@@ -220,13 +224,16 @@ class MultiQueryAttention(nn.Module):
         if self.softmax_scale is None:
             self.softmax_scale = 1 / math.sqrt(self.head_dim)
         self.attn_dropout_p = attn_pdrop
-
+        fc_kwargs = {}
+        if fc_type != 'te':
+            fc_kwargs['device'] = device
+        self.Wqkv = FC_CLASS_REGISTRY[fc_type](d_model, d_model + 2 * self.head_dim, **fc_kwargs)
         fuse_splits = (d_model, d_model + self.head_dim)
         self.Wqkv._fused = (0, fuse_splits)
         if self.qk_ln:
-
-            self.q_ln =
-            self.k_ln =
+            norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+            self.q_ln = norm_class(d_model, device=device)
+            self.k_ln = norm_class(self.head_dim, device=device)
         if self.attn_impl == 'flash':
             self.attn_fn = flash_attn_fn
         elif self.attn_impl == 'triton':
@@ -239,13 +246,13 @@ class MultiQueryAttention(nn.Module):
             warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
         else:
             raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
-        self.out_proj =
+        self.out_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
        self.out_proj._is_residual = True
 
     def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
         qkv = self.Wqkv(x)
         if self.clip_qkv:
-            qkv.
+            qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
         (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
         key_padding_mask = attention_mask
         if self.qk_ln:
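The new constructor surface can be exercised directly. The sketch below is illustrative only: it assumes the files above are importable as a local module named mpt_model (a placeholder name, not shown in the diff) and uses the arguments introduced in this change (norm_type, fc_type, device) together with the three-tuple the forward pass returns.

import torch
from mpt_model.attention import MultiheadAttention  # hypothetical local import path

attn = MultiheadAttention(
    d_model=256,
    n_heads=8,
    attn_impl='torch',                      # avoids the flash/triton dependencies
    qk_ln=True,                             # builds q_ln/k_ln via NORM_CLASS_REGISTRY[norm_type]
    norm_type='low_precision_layernorm',
    fc_type='torch',                        # Wqkv/out_proj come from FC_CLASS_REGISTRY[fc_type]
    device='cpu',
)
x = torch.randn(2, 16, 256)
mask = torch.ones(2, 16, dtype=torch.bool)  # key-padding mask; True means "keep"
out, attn_weights, past_kv = attn(x, attention_mask=mask, is_causal=True)
print(out.shape)                            # torch.Size([2, 16, 256])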
blocks.py
CHANGED
@@ -3,31 +3,23 @@ from typing import Dict, Optional, Tuple
 import torch
 import torch.nn as nn
 from .attention import ATTN_CLASS_REGISTRY
+from .fc import FC_CLASS_REGISTRY
+from .ffn import FFN_CLASS_REGISTRY, build_ffn
 from .norm import NORM_CLASS_REGISTRY
 
-class MPTMLP(nn.Module):
-
-    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
-        super().__init__()
-        self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
-        self.act = nn.GELU(approximate='none')
-        self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
-        self.down_proj._is_residual = True
-
-    def forward(self, x):
-        return self.down_proj(self.act(self.up_proj(x)))
-
 class MPTBlock(nn.Module):
 
-    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs):
+    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, ffn_config: Dict={'ffn_type': 'mptmlp'}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, fc_type: str='torch', device: Optional[str]=None, **kwargs):
         del kwargs
         super().__init__()
         norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
         attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
         self.norm_1 = norm_class(d_model, device=device)
-        self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'],
-        self.norm_2 =
-
+        self.attn = attn_class(d_model=d_model, n_heads=n_heads, attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], norm_type=norm_type, fc_type=fc_type, verbose=verbose, device=device)
+        self.norm_2 = None
+        if not getattr(FFN_CLASS_REGISTRY[ffn_config['ffn_type']], '_has_norm', False):
+            self.norm_2 = norm_class(d_model, device=device)
+        self.ffn = build_ffn(d_model=d_model, expansion_ratio=expansion_ratio, device=device, **ffn_config)
         self.resid_attn_dropout = nn.Dropout(resid_pdrop)
         self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
 
@@ -35,7 +27,9 @@ class MPTBlock(nn.Module):
         a = self.norm_1(x)
         (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
         x = x + self.resid_attn_dropout(b)
-        m =
+        m = x
+        if self.norm_2 is not None:
+            m = self.norm_2(x)
         n = self.ffn(m)
         x = x + self.resid_ffn_dropout(n)
         return (x, attn_weights, past_key_value)
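A rough sketch of how the reworked block wiring behaves, under the same hypothetical mpt_model import path as above: with the default 'mptmlp' FFN (which has no built-in norm) the block still creates norm_2, whereas an FFN class flagged with _has_norm (te_ln_mlp) would skip it.

import torch
from mpt_model.blocks import MPTBlock  # hypothetical local import path

block = MPTBlock(
    d_model=256,
    n_heads=8,
    expansion_ratio=4,
    attn_config={'attn_type': 'multihead_attention', 'attn_impl': 'torch',
                 'attn_pdrop': 0.0, 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None},
    ffn_config={'ffn_type': 'mptmlp'},   # MPTMLP carries no _has_norm flag ...
    fc_type='torch',
    device='cpu',
)
assert block.norm_2 is not None          # ... so the pre-FFN norm is still built
x = torch.randn(2, 16, 256)
y, attn_weights, past_kv = block(x)
print(y.shape)                           # torch.Size([2, 16, 256])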
config.json
CHANGED
@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "./mpt-30b-orca-hf/",
   "architectures": [
     "MPTForCausalLM"
   ],
@@ -23,6 +22,11 @@
   "emb_pdrop": 0,
   "embedding_fraction": 1.0,
   "expansion_ratio": 4,
+  "fc_type": "torch",
+  "ffn_config": {
+    "fc_type": "torch",
+    "ffn_type": "mptmlp"
+  },
   "init_config": {
     "emb_init_std": null,
     "emb_init_uniform_lim": null,
@@ -45,7 +49,7 @@
   "norm_type": "low_precision_layernorm",
   "resid_pdrop": 0,
   "tokenizer_name": "sam-mosaic/gpt-neox-20b-chatml",
-  "torch_dtype": "
+  "torch_dtype": "float32",
   "transformers_version": "4.30.2",
   "use_cache": false,
   "verbose": 0,
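With these keys in the checkpoint config, the usual remote-code loading path picks them up. A minimal sketch, where '<org>/<repo>' is a placeholder for this repository's Hub id (not shown in the diff):

from transformers import AutoConfig

config = AutoConfig.from_pretrained('<org>/<repo>', trust_remote_code=True)
print(config.fc_type)        # 'torch'
print(config.ffn_config)     # {'fc_type': 'torch', 'ffn_type': 'mptmlp'}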
configuration_mpt.py
CHANGED
@@ -1,26 +1,28 @@
 """A HuggingFace-style model configuration."""
+import warnings
 from typing import Dict, Optional, Union
 from transformers import PretrainedConfig
 attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
+ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
 init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
 
 class MPTConfig(PretrainedConfig):
     model_type = 'mpt'
 
-    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
+    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, ffn_config: Dict=ffn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, fc_type: str='torch', **kwargs):
         """The MPT configuration class.
 
         Args:
             d_model (int): The size of the embedding dimension of the model.
             n_heads (int): The number of attention heads.
             n_layers (int): The number of layers in the model.
-            expansion_ratio (int): The ratio of the up/down scale in the
+            expansion_ratio (int): The ratio of the up/down scale in the ffn.
             max_seq_len (int): The maximum sequence length of the model.
             vocab_size (int): The size of the vocabulary.
             resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
             emb_pdrop (float): The dropout probability for the embedding layer.
             learned_pos_emb (bool): Whether to use learned positional embeddings
-            attn_config (Dict):
+            attn_config (Dict): A dictionary used to configure the model's attention module:
                 attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
                 attn_pdrop (float): The dropout probability for the attention layers.
                 attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
@@ -38,6 +40,8 @@ class MPTConfig(PretrainedConfig):
                     Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                 alibi (bool): Whether to use the alibi bias instead of position embeddings.
                 alibi_bias_max (int): The maximum value of the alibi bias.
+            ffn_config (Dict): A dictionary used to configure the model's ffn module:
+                ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
             init_device (str): The device to use for parameter initialization.
             logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
             no_bias (bool): Whether to use bias in all layers.
@@ -61,6 +65,7 @@ class MPTConfig(PretrainedConfig):
                 init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                 ---
                 See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+            fc_type (str): choose fc layer implementaion. Options: torch and te. te layers support fp8 when using H100 GPUs.
         """
         self.d_model = d_model
         self.n_heads = n_heads
@@ -72,6 +77,7 @@ class MPTConfig(PretrainedConfig):
         self.emb_pdrop = emb_pdrop
         self.learned_pos_emb = learned_pos_emb
         self.attn_config = attn_config
+        self.ffn_config = ffn_config
         self.init_device = init_device
         self.logit_scale = logit_scale
         self.no_bias = no_bias
@@ -80,6 +86,7 @@ class MPTConfig(PretrainedConfig):
         self.norm_type = norm_type
         self.use_cache = use_cache
         self.init_config = init_config
+        self.fc_type = fc_type
         if 'name' in kwargs:
             del kwargs['name']
         if 'loss_fn' in kwargs:
@@ -95,6 +102,7 @@ class MPTConfig(PretrainedConfig):
 
     def _validate_config(self):
         self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
+        self.ffn_config = self._set_config_defaults(self.ffn_config, ffn_config_defaults)
         self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
         if self.d_model % self.n_heads != 0:
             raise ValueError('d_model must be divisible by n_heads')
@@ -115,4 +123,13 @@ class MPTConfig(PretrainedConfig):
         if self.init_config.get('name', None) is None:
             raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
         if not self.learned_pos_emb and (not self.attn_config['alibi']):
-            raise
+            raise warnings.warn(f'Positional information not being provided to the model using either learned_pos_emb or alibi.')
+        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            try:
+                import transformer_engine.pytorch as te
+            except:
+                raise ImportError('TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed.The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\npip install flash-attn==1.0.6 --no-build-isolation \npip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156')
+        if self.ffn_config['ffn_type'] == 'mptmlp':
+            self.ffn_config['fc_type'] = self.fc_type
+        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            self.bias = not self.no_bias
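A small sketch of what the new validation step does with the FFN/fc settings. It assumes configuration_mpt.py is importable locally and that, as in upstream llm-foundry, __init__ finishes by calling _validate_config():

from configuration_mpt import MPTConfig  # hypothetical local import

cfg = MPTConfig(d_model=1024, n_heads=16, fc_type='torch')
# For the 'mptmlp' FFN, validation fills ffn_config from the defaults and copies
# fc_type into it so build_ffn sees a consistent setting.
assert cfg.ffn_config == {'ffn_type': 'mptmlp', 'fc_type': 'torch'}
# Requesting the TransformerEngine paths without the package installed would hit
# the new ImportError branch:
# MPTConfig(fc_type='te')  # -> ImportError unless transformer_engine is available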
fc.py
ADDED
@@ -0,0 +1,7 @@
+from torch import nn
+FC_CLASS_REGISTRY = {'torch': nn.Linear}
+try:
+    import transformer_engine.pytorch as te
+    FC_CLASS_REGISTRY['te'] = te.Linear
+except:
+    pass
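The registry added here is just a name-to-class map; the 'te' entry only appears when transformer_engine imports successfully. A minimal sketch of how the rest of the commit consumes it:

from torch import nn

FC_CLASS_REGISTRY = {'torch': nn.Linear}     # as in fc.py (plus 'te' when available)

fc_type = 'torch'
fc_kwargs = {}
if fc_type != 'te':
    fc_kwargs['device'] = 'cpu'              # device is only forwarded for the torch backend
proj = FC_CLASS_REGISTRY[fc_type](256, 3 * 256, **fc_kwargs)
print(type(proj).__name__)                   # Linear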
ffn.py
ADDED
@@ -0,0 +1,40 @@
+"""GPT Blocks used for the GPT Model."""
+from typing import Optional
+import torch
+import torch.nn as nn
+from .attention import ATTN_CLASS_REGISTRY
+from .fc import FC_CLASS_REGISTRY
+from .norm import NORM_CLASS_REGISTRY
+try:
+    import transformer_engine.pytorch as te
+except:
+    te = None
+
+class MPTMLP(nn.Module):
+
+    def __init__(self, d_model: int, expansion_ratio: int, fc_type: str='torch', device: Optional[str]=None):
+        super().__init__()
+        fc_kwargs = {}
+        if fc_type != 'te':
+            fc_kwargs['device'] = device
+        self.up_proj = FC_CLASS_REGISTRY[fc_type](d_model, expansion_ratio * d_model, **fc_kwargs)
+        self.act = nn.GELU(approximate='none')
+        self.down_proj = FC_CLASS_REGISTRY[fc_type](expansion_ratio * d_model, d_model, **fc_kwargs)
+        self.down_proj._is_residual = True
+
+    def forward(self, x):
+        return self.down_proj(self.act(self.up_proj(x)))
+FFN_CLASS_REGISTRY = {'mptmlp': MPTMLP}
+if te is not None:
+    te.LayerNormMLP._has_norm = True
+    FFN_CLASS_REGISTRY['te_ln_mlp'] = te.LayerNormMLP
+
+def build_ffn(d_model: int, expansion_ratio: int, fc_type: str='torch', device: Optional[str]=None, **kwargs):
+    ffn_type = kwargs.pop('ffn_type')
+    if ffn_type == 'mptmlp':
+        if kwargs is not None and len(kwargs) > 0:
+            raise ValueError(f'MPTMLP got an unexpected keyword argument: {kwargs}')
+        return MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, device=device)
+    elif ffn_type == 'te_ln_mlp':
+        return te.LayerNormMLP(hidden_size=d_model, ffn_hidden_size=d_model * expansion_ratio, **kwargs)
+    raise ValueError(f'ffn_type={ffn_type!r} not recognized.')
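A short sketch of build_ffn with the default registry entry, assuming ffn.py is importable locally: ffn_type is popped from the keyword arguments, any extra keys are rejected for 'mptmlp', and for 'te_ln_mlp' they would be forwarded to te.LayerNormMLP instead.

import torch
from ffn import build_ffn  # hypothetical local import

ffn = build_ffn(d_model=256, expansion_ratio=4, fc_type='torch', device='cpu',
                ffn_type='mptmlp')
x = torch.randn(2, 16, 256)
print(ffn(x).shape)        # torch.Size([2, 16, 256])

# build_ffn(d_model=256, expansion_ratio=4, ffn_type='mptmlp', foo=1)
#   -> ValueError: MPTMLP got an unexpected keyword argument: {'foo': 1}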
modeling_mpt.py
CHANGED
@@ -13,12 +13,14 @@ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutpu
 from .attention import attn_bias_shape, build_attn_bias
 from .blocks import MPTBlock
 from .custom_embedding import SharedEmbedding
+from .fc import FC_CLASS_REGISTRY
+from .ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
 from .norm import NORM_CLASS_REGISTRY
 from .configuration_mpt import MPTConfig
 from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
 from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
 from .meta_init_context import init_empty_weights
-from .param_init_fns import
+from .param_init_fns import generic_param_init_fn_, MODEL_INIT_REGISTRY
 try:
     from .flash_attn_triton import flash_attn_func
 except:
@@ -40,6 +42,7 @@ class MPTModel(MPTPreTrainedModel):
         self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
         self.alibi = config.attn_config['alibi']
         self.alibi_bias_max = config.attn_config['alibi_bias_max']
+        self.learned_pos_emb = config.learned_pos_emb
         if config.init_device == 'mixed':
             if dist.get_local_rank() == 0:
                 config.init_device = 'cpu'
@@ -51,7 +54,7 @@ class MPTModel(MPTPreTrainedModel):
         norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
         self.embedding_fraction = config.embedding_fraction
         self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device)
-        if
+        if self.learned_pos_emb:
             self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
         self.emb_drop = nn.Dropout(config.emb_pdrop)
         self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
@@ -80,7 +83,7 @@ class MPTModel(MPTPreTrainedModel):
     def get_input_embeddings(self):
         return self.wte
 
-    def set_input_embeddings(self, value):
+    def set_input_embeddings(self, value: nn.Embedding):
         self.wte = value
 
     @torch.no_grad()
@@ -166,9 +169,7 @@ class MPTModel(MPTPreTrainedModel):
         S = input_ids.size(1)
         assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
         tok_emb = self.wte(input_ids)
-        if self.
-            x = tok_emb
-        else:
+        if self.learned_pos_emb:
             past_position = 0
             if past_key_values is not None:
                 if len(past_key_values) != self.config.n_layers:
@@ -183,6 +184,8 @@ class MPTModel(MPTPreTrainedModel):
                 pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
             pos_emb = self.wpe(pos)
             x = tok_emb + pos_emb
+        else:
+            x = tok_emb
         if self.embedding_fraction == 1:
             x = self.emb_drop(x)
         else:
@@ -228,7 +231,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
         if not config.tie_word_embeddings:
             raise ValueError('MPTForCausalLM only supports tied word embeddings')
         print(f'Instantiating an MPTForCausalLM model from {__file__}')
-        self.transformer = MPTModel(config)
+        self.transformer: MPTModel = MPTModel(config)
         for child in self.transformer.children():
             if isinstance(child, torch.nn.ModuleList):
                 continue
@@ -275,9 +278,9 @@ class MPTForCausalLM(MPTPreTrainedModel):
             logits *= self.logit_scale
         loss = None
         if labels is not None:
-
-
-            loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
+            _labels = torch.roll(labels, shifts=-1)
+            _labels[:, -1] = -100
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.to(logits.device).view(-1))
         return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
 
     def param_init_fn(self, module):
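The loss change is the most behavioral edit in this file: labels are now shifted inside the model with torch.roll and the final position is masked with -100, which F.cross_entropy ignores by default. A standalone sketch of just that computation (shapes are arbitrary):

import torch
import torch.nn.functional as F

logits = torch.randn(2, 5, 32)           # (batch, seq_len, vocab)
labels = torch.randint(0, 32, (2, 5))

_labels = torch.roll(labels, shifts=-1)  # next-token targets
_labels[:, -1] = -100                    # ignore the wrapped-around last position
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.view(-1))
print(loss.item())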
norm.py
CHANGED
@@ -1,3 +1,4 @@
+from typing import Dict, Type
 import torch
 
 def _cast_if_autocast_enabled(tensor):
@@ -53,4 +54,4 @@ class LPRMSNorm(RMSNorm):
         downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
         with torch.autocast(enabled=False, device_type=x.device.type):
             return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
-NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
+NORM_CLASS_REGISTRY: Dict[str, Type[torch.nn.Module]] = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
param_init_fns.py
CHANGED
@@ -5,7 +5,12 @@ from functools import partial
 from typing import Optional, Tuple, Union
 import torch
 from torch import nn
+from .fc import FC_CLASS_REGISTRY
 from .norm import NORM_CLASS_REGISTRY
+try:
+    import transformer_engine.pytorch as te
+except:
+    te = None
 
 def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
     del kwargs
@@ -44,7 +49,7 @@ def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model:
     if init_div_is_residual is not False:
         if verbose > 1:
             warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
-    if isinstance(module,
+    if isinstance(module, tuple(set(FC_CLASS_REGISTRY.values()))):
         if hasattr(module, '_fused'):
             fused_init_helper_(module, init_fn_)
         else:
@@ -114,6 +119,19 @@ def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model:
             module.out_proj.weight.div_(div_is_residual)
         if module.out_proj.bias is not None:
             torch.nn.init.zeros_(module.out_proj.bias)
+    elif te is not None and isinstance(module, te.LayerNormMLP):
+        if module.layer_norm_weight is not None:
+            torch.nn.init.ones_(module.layer_norm_weight)
+        if module.layer_norm_bias is not None:
+            torch.nn.init.zeros_(module.layer_norm_bias)
+        init_fn_(module.fc1_weight)
+        if module.fc1_bias is not None:
+            torch.nn.init.zeros_(module.fc1_bias)
+        init_fn_(module.fc2_weight)
+        if module.fc2_bias is not None:
+            torch.nn.init.zeros_(module.fc2_bias)
+        with torch.no_grad():
+            module.fc2_weight.div_(div_is_residual)
     else:
         for _ in module.parameters(recurse=False):
             raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
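The init dispatch now keys off the fc registry rather than nn.Linear directly. A tiny sketch of the isinstance check it performs; with only the 'torch' entry present this is equivalent to isinstance(module, nn.Linear):

import torch.nn as nn

FC_CLASS_REGISTRY = {'torch': nn.Linear}     # 'te': te.Linear is added when available

module = nn.Linear(8, 8)
fc_classes = tuple(set(FC_CLASS_REGISTRY.values()))
print(isinstance(module, fc_classes))        # True -> handled by the fused/linear init path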
pytorch_model-00001-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c51240f27be83417d07e38ba6ab0541ed5560611291bf403c3064a5f5b830889
+size 9901940807
pytorch_model-00002-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15a418e63d400aa3e9ddd5c4fe64378eaeed49ca20f2d09a4507cfb6209de2d1
+size 9865240711
pytorch_model-00003-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ec366585fe0612110b1251ad7706e9d0d4841657c702b4393d3df6ca9b0f32a
+size 9865240711
pytorch_model-00004-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:853b9552f51a9c076cd9f043f958c6c78802d9512f01dfb9f852b456008ddfc2
+size 9865240711
pytorch_model-00005-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f394f487f012f860fa6e63504aa4ffdae670101b845b966bfb43951bcfb4798f
+size 9865240711
pytorch_model-00006-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d07e4cabbbeeb55b072dab24d8b152a36fbc0ec5021e4211d584f871fea4791
+size 9865240711
pytorch_model-00007-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c2c9d32c24b95b3d674e1d9a43688f4aed426dd1bd54b6038c63d3c0baf282d
+size 9865240711
pytorch_model-00008-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d80b8726b77a8c55a88ca09bf74f85028f6c0bfe93890942da37f44336fcac3
+size 9865240711
pytorch_model-00009-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f18fa796abea986040439eb79a6c8e40da7214b5bbb873cad047c09e30a954b5
+size 9865240711
pytorch_model-00010-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:032ac755ca09655de532d5b2df927e79e8143c102ccf2bec9da54d64b0e51708
+size 9865240711
pytorch_model-00011-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eaf5276708ad788f6f8e1a73f77b36f54a4413ff1277b1c72cdf8c999d286fea
+size 9865240711
pytorch_model-00012-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46ee5a0ab0d96c7b904be86d8f2f7afae664a3cfd9bfaa3cca08ea3419dd7273
+size 9865240711
pytorch_model-00013-of-00013.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a2f8577fb3fe785e377e032b886d5aa44d5a4786a128d478a66b7d6291369d6
+size 1644197388
pytorch_model.bin.index.json
CHANGED
@@ -1,297 +1,298 @@
 {
   "metadata": {
-    "total_size":
   },
   "weight_map": {
-    "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-
-    "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-
-    "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-
-    "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-
-    "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-
-    "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-
     ... (the same six entries for every remaining transformer block are likewise re-pointed at the new pytorch_model-000XX-of-00013.bin shards)
|
254 |
-
"transformer.blocks.46.ffn.down_proj.weight": "pytorch_model-
|
255 |
-
"transformer.blocks.46.ffn.up_proj.weight": "pytorch_model-
|
256 |
-
"transformer.blocks.46.norm_1.weight": "pytorch_model-
|
257 |
-
"transformer.blocks.46.norm_2.weight": "pytorch_model-
|
258 |
-
"transformer.blocks.47.attn.Wqkv.weight": "pytorch_model-
|
259 |
-
"transformer.blocks.47.attn.out_proj.weight": "pytorch_model-
|
260 |
-
"transformer.blocks.47.ffn.down_proj.weight": "pytorch_model-
|
261 |
-
"transformer.blocks.47.ffn.up_proj.weight": "pytorch_model-
|
262 |
-
"transformer.blocks.47.norm_1.weight": "pytorch_model-
|
263 |
-
"transformer.blocks.47.norm_2.weight": "pytorch_model-
|
264 |
-
"transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-
|
265 |
-
"transformer.blocks.5.attn.out_proj.weight": "pytorch_model-
|
266 |
-
"transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-
|
267 |
-
"transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-
|
268 |
-
"transformer.blocks.5.norm_1.weight": "pytorch_model-
|
269 |
-
"transformer.blocks.5.norm_2.weight": "pytorch_model-
|
270 |
-
"transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-
|
271 |
-
"transformer.blocks.6.attn.out_proj.weight": "pytorch_model-
|
272 |
-
"transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-
|
273 |
-
"transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-
|
274 |
-
"transformer.blocks.6.norm_1.weight": "pytorch_model-
|
275 |
-
"transformer.blocks.6.norm_2.weight": "pytorch_model-
|
276 |
-
"transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-
|
277 |
-
"transformer.blocks.7.attn.out_proj.weight": "pytorch_model-
|
278 |
-
"transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-
|
279 |
-
"transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-
|
280 |
-
"transformer.blocks.7.norm_1.weight": "pytorch_model-
|
281 |
-
"transformer.blocks.7.norm_2.weight": "pytorch_model-
|
282 |
-
"transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-
|
283 |
-
"transformer.blocks.8.attn.out_proj.weight": "pytorch_model-
|
284 |
-
"transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-
|
285 |
-
"transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-
|
286 |
-
"transformer.blocks.8.norm_1.weight": "pytorch_model-
|
287 |
-
"transformer.blocks.8.norm_2.weight": "pytorch_model-
|
288 |
-
"transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-
|
289 |
-
"transformer.blocks.9.attn.out_proj.weight": "pytorch_model-
|
290 |
-
"transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-
|
291 |
-
"transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-
|
292 |
-
"transformer.blocks.9.norm_1.weight": "pytorch_model-
|
293 |
-
"transformer.blocks.9.norm_2.weight": "pytorch_model-
|
294 |
-
"transformer.norm_f.weight": "pytorch_model-
|
295 |
-
"transformer.
|
|
|
296 |
}
|
297 |
}
|
|
|
{
  "metadata": {
+    "total_size": 120063684608
  },
  "weight_map": {
+    "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.1.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.1.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.1.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.1.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.1.norm_1.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.1.norm_2.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.10.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.10.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.10.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.10.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.10.norm_1.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.10.norm_2.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.11.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.11.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.11.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.11.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.11.norm_1.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.11.norm_2.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.12.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.12.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.12.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.12.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.12.norm_1.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.12.norm_2.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.13.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.13.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.13.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.13.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.13.norm_1.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.13.norm_2.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.14.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.14.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.14.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.14.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.14.norm_1.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.14.norm_2.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.15.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.15.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.15.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.15.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.15.norm_1.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.15.norm_2.weight": "pytorch_model-00004-of-00013.bin",
+    "transformer.blocks.16.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.16.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.16.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.16.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.16.norm_1.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.16.norm_2.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.17.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.17.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.17.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.17.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.17.norm_1.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.17.norm_2.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.18.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.18.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.18.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.18.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.18.norm_1.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.18.norm_2.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.19.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.19.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.19.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.19.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.19.norm_1.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.19.norm_2.weight": "pytorch_model-00005-of-00013.bin",
+    "transformer.blocks.2.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.2.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.2.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.2.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.2.norm_1.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.2.norm_2.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.20.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.20.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.20.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.20.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.20.norm_1.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.20.norm_2.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.21.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.21.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.21.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.21.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.21.norm_1.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.21.norm_2.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.22.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.22.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.22.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.22.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.22.norm_1.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.22.norm_2.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.23.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.23.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.23.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.23.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.23.norm_1.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.23.norm_2.weight": "pytorch_model-00006-of-00013.bin",
+    "transformer.blocks.24.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.24.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.24.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.24.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.24.norm_1.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.24.norm_2.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.25.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.25.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.25.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.25.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.25.norm_1.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.25.norm_2.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.26.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.26.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.26.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.26.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.26.norm_1.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.26.norm_2.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.27.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.27.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.27.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.27.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.27.norm_1.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.27.norm_2.weight": "pytorch_model-00007-of-00013.bin",
+    "transformer.blocks.28.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.28.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.28.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.28.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.28.norm_1.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.28.norm_2.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.29.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.29.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.29.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.29.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.29.norm_1.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.29.norm_2.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.3.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.3.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.3.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.3.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.3.norm_1.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.3.norm_2.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.blocks.30.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.30.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.30.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.30.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.30.norm_1.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.30.norm_2.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.31.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.31.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.31.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.31.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.31.norm_1.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.31.norm_2.weight": "pytorch_model-00008-of-00013.bin",
+    "transformer.blocks.32.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.32.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.32.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.32.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.32.norm_1.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.32.norm_2.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.33.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.33.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.33.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.33.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.33.norm_1.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.33.norm_2.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.34.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.34.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.34.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.34.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.34.norm_1.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.34.norm_2.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.35.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.35.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.35.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.35.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.35.norm_1.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.35.norm_2.weight": "pytorch_model-00009-of-00013.bin",
+    "transformer.blocks.36.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.36.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.36.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.36.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.36.norm_1.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.36.norm_2.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.37.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.37.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.37.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.37.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.37.norm_1.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.37.norm_2.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.38.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.38.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.38.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.38.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.38.norm_1.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.38.norm_2.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.39.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.39.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.39.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.39.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.39.norm_1.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.39.norm_2.weight": "pytorch_model-00010-of-00013.bin",
+    "transformer.blocks.4.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.4.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.4.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.4.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.4.norm_1.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.4.norm_2.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.40.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.40.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.40.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.40.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.40.norm_1.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.40.norm_2.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.41.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.41.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.41.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.41.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.41.norm_1.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.41.norm_2.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.42.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.42.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.42.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.42.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.42.norm_1.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.42.norm_2.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.43.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.43.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.43.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.43.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.43.norm_1.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.43.norm_2.weight": "pytorch_model-00011-of-00013.bin",
+    "transformer.blocks.44.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.44.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.44.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.44.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.44.norm_1.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.44.norm_2.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.45.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.45.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.45.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.45.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.45.norm_1.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.45.norm_2.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.46.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.46.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.46.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.46.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.46.norm_1.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.46.norm_2.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.47.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.47.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.47.ffn.down_proj.weight": "pytorch_model-00013-of-00013.bin",
+    "transformer.blocks.47.ffn.up_proj.weight": "pytorch_model-00013-of-00013.bin",
+    "transformer.blocks.47.norm_1.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.47.norm_2.weight": "pytorch_model-00012-of-00013.bin",
+    "transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.5.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.5.norm_1.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.5.norm_2.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.6.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.6.norm_1.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.6.norm_2.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.7.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.7.norm_1.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.7.norm_2.weight": "pytorch_model-00002-of-00013.bin",
+    "transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.8.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.8.norm_1.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.8.norm_2.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.9.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.9.norm_1.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.blocks.9.norm_2.weight": "pytorch_model-00003-of-00013.bin",
+    "transformer.norm_f.weight": "pytorch_model-00013-of-00013.bin",
+    "transformer.wpe.weight": "pytorch_model-00001-of-00013.bin",
+    "transformer.wte.weight": "pytorch_model-00001-of-00013.bin"
  }
}
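
The new index above follows the standard sharded-checkpoint layout: "metadata.total_size" records the combined size of the 13 .bin shards, and "weight_map" maps each parameter name to the shard file that stores it. The sketch below is only illustrative (the local directory name "mpt-30b" is a placeholder, not part of this commit) and shows how such an index can be read to locate one weight and load just its shard.

import json
import os

import torch

# Hypothetical local copy of the uploaded folder containing the 13 shards
# and pytorch_model.bin.index.json.
checkpoint_dir = "mpt-30b"

with open(os.path.join(checkpoint_dir, "pytorch_model.bin.index.json")) as f:
    index = json.load(f)

# 120063684608 bytes (~112 GiB) across all shards, per the metadata above.
print(index["metadata"]["total_size"])

# Per the new weight_map, this weight lives in pytorch_model-00008-of-00013.bin.
name = "transformer.blocks.30.attn.Wqkv.weight"
shard_file = index["weight_map"][name]

# Load only that shard on CPU and inspect the tensor, without touching the rest.
state_dict = torch.load(os.path.join(checkpoint_dir, shard_file), map_location="cpu")
print(name, tuple(state_dict[name].shape))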