yujiepan committed on
Commit
d463a50
·
verified ·
1 Parent(s): d5bc491

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ pipeline_tag: text-generation
4
+ inference: true
5
+ widget:
6
+ - text: Hello!
7
+ example_title: Hello world
8
+ group: Python
9
+ base_model:
10
+ - swiss-ai/Apertus-70B-Instruct-2509
11
+ ---
12
+
13
+ This tiny model is for debugging. It is randomly initialized with a config adapted from [swiss-ai/Apertus-70B-Instruct-2509](https://huggingface.co/swiss-ai/Apertus-70B-Instruct-2509).
14
+
15
+ ### Example usage:
16
+
17
+ - vLLM
18
+
19
+ ```bash
20
+ vllm serve tiny-random/apertus
21
+ ```
22
+
23
+ - Transformers
24
+
25
+ ```python
26
+ import os
27
+ import re
28
+
29
+ import torch
30
+ from transformers import AutoModelForCausalLM, AutoTokenizer
31
+
32
+ model_id = "tiny-random/apertus"
33
+
34
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
35
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
36
+ messages = [
37
+ {"role": "user", "content": "How to make pasta?"},
38
+ ]
39
+ tokenized_chat = tokenizer.apply_chat_template(
40
+ messages,
41
+ tokenize=True,
42
+ add_generation_prompt=True,
43
+ return_tensors="pt",
44
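+ thinking_budget=64 # forwarded to the chat template as a variable; NOTE: the chat_template.jinja in this repo reads `enable_thinking`, not `thinking_budget`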
45
+ )
46
+
47
+ outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=128)
48
+ output_text = tokenizer.decode(outputs[0])
49
+ print(output_text)
50
+ ```
51
+
52
+ ### Code to create this repo:
53
+
54
+ ```python
55
+ import json
56
+ from pathlib import Path
57
+
58
+ import accelerate
59
+ import torch
60
+ from huggingface_hub import file_exists, hf_hub_download
61
+ from transformers import (
62
+ AutoConfig,
63
+ AutoModelForCausalLM,
64
+ AutoProcessor,
65
+ GenerationConfig,
66
+ set_seed,
67
+ )
68
+
69
+ source_model_id = "swiss-ai/Apertus-70B-Instruct-2509"
70
+ save_folder = "/tmp/tiny-random/apertus"
71
+
72
+ processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
73
+ processor.save_pretrained(save_folder)
74
+
75
+ with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
76
+ config_json = json.load(f)
77
+ config_json['hidden_size'] = 8
78
+ config_json['head_dim'] = 32 # vllm requirement
79
+ config_json['intermediate_size'] = 32
80
+ config_json['num_attention_heads'] = 8
81
+ config_json['num_hidden_layers'] = 2
82
+ config_json['num_key_value_heads'] = 4 # better support tensor parallel
83
+ config_json['tie_word_embeddings'] = False
84
+ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
85
+ json.dump(config_json, f, indent=2)
86
+
87
+ config = AutoConfig.from_pretrained(
88
+ save_folder,
89
+ trust_remote_code=True,
90
+ )
91
+ print(config)
92
+ torch.set_default_dtype(torch.bfloat16)
93
+ model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
94
+ torch.set_default_dtype(torch.float32)
95
+ if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
96
+ model.generation_config = GenerationConfig.from_pretrained(
97
+ source_model_id, trust_remote_code=True,
98
+ )
99
+ model.generation_config.do_sample = True
100
+ set_seed(42)
101
+ model = model.cpu() # cpu is more stable for random initialization across machines
102
+ with torch.no_grad():
103
+ for name, p in sorted(model.named_parameters()):
104
+ torch.nn.init.normal_(p, 0, 0.1)
105
+ print(name, p.shape)
106
+ model.save_pretrained(save_folder)
107
+ ```
108
+
109
+ ### Printing the model:
110
+
111
+ ```text
112
+ ApertusForCausalLM(
113
+ (model): ApertusModel(
114
+ (embed_tokens): Embedding(131072, 8, padding_idx=3)
115
+ (layers): ModuleList(
116
+ (0-1): 2 x ApertusDecoderLayer(
117
+ (self_attn): ApertusAttention(
118
+ (q_proj): Linear(in_features=8, out_features=256, bias=False)
119
+ (k_proj): Linear(in_features=8, out_features=128, bias=False)
120
+ (v_proj): Linear(in_features=8, out_features=128, bias=False)
121
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
122
+ (q_norm): ApertusRMSNorm((32,), eps=1e-05)
123
+ (k_norm): ApertusRMSNorm((32,), eps=1e-05)
124
+ )
125
+ (mlp): ApertusMLP(
126
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
127
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
128
+ (act_fn): XIELUActivation()
129
+ )
130
+ (attention_layernorm): ApertusRMSNorm((8,), eps=1e-05)
131
+ (feedforward_layernorm): ApertusRMSNorm((8,), eps=1e-05)
132
+ )
133
+ )
134
+ (norm): ApertusRMSNorm((8,), eps=1e-05)
135
+ (rotary_emb): ApertusRotaryEmbedding()
136
+ )
137
+ (lm_head): Linear(in_features=8, out_features=131072, bias=False)
138
+ )
139
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
2
+ {%- if param_spec.type == "array" -%}
3
+ {%- if param_spec['items'] -%}
4
+ {%- if param_spec['items']['type'] == "string" -%}
5
+ {{- "string[]" }}
6
+ {%- elif param_spec['items']['type'] == "number" -%}
7
+ {{- "number[]" }}
8
+ {%- elif param_spec['items']['type'] == "integer" -%}
9
+ {{- "number[]" }}
10
+ {%- elif param_spec['items']['type'] == "boolean" -%}
11
+ {{- "boolean[]" }}
12
+ {%- else -%}
13
+ {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
14
+ {%- if inner_type == "object | object" or inner_type|length > 50 -%}
15
+ {{- "any[]" }}
16
+ {%- else -%}
17
+ {{- inner_type + "[]" }}
18
+ {%- endif -%}
19
+ {%- endif -%}
20
+ {%- if param_spec.nullable -%}
21
+ {{- " | null" }}
22
+ {%- endif -%}
23
+ {%- else -%}
24
+ {{- "any[]" }}
25
+ {%- if param_spec.nullable -%}
26
+ {{- " | null" }}
27
+ {%- endif -%}
28
+ {%- endif -%}
29
+ {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
30
+ {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
31
+ {%- if param_spec.type | length > 1 -%}
32
+ {{- param_spec.type | join(" | ") }}
33
+ {%- else -%}
34
+ {{- param_spec.type[0] }}
35
+ {%- endif -%}
36
+ {%- elif param_spec.oneOf -%}
37
+ {#- Handle oneOf schemas - check for complex unions and fallback to any #}
38
+ {%- set has_object_variants = false -%}
39
+ {%- for variant in param_spec.oneOf -%}
40
+ {%- if variant.type == "object" -%}
41
+ {%- set has_object_variants = true -%}
42
+ {%- endif -%}
43
+ {%- endfor -%}
44
+ {%- if has_object_variants and param_spec.oneOf|length > 1 -%}
45
+ {{- "any" }}
46
+ {%- else -%}
47
+ {%- for variant in param_spec.oneOf -%}
48
+ {{- render_typescript_type(variant, required_params) -}}
49
+ {%- if variant.description %}
50
+ {{- "// " + variant.description }}
51
+ {%- endif -%}
52
+ {%- if variant.default is defined %}
53
+ {{ "// default: " + variant.default|tojson }}
54
+ {%- endif -%}
55
+ {%- if not loop.last %}
56
+ {{- " | " }}
57
+ {% endif -%}
58
+ {%- endfor -%}
59
+ {%- endif -%}
60
+ {%- elif param_spec.type == "string" -%}
61
+ {%- if param_spec.enum -%}
62
+ {{- '"' + param_spec.enum|join('" | "') + '"' -}}
63
+ {%- else -%}
64
+ {{- "string" }}
65
+ {%- if param_spec.nullable %}
66
+ {{- " | null" }}
67
+ {%- endif -%}
68
+ {%- endif -%}
69
+ {%- elif param_spec.type == "number" -%}
70
+ {{- "number" }}
71
+ {%- elif param_spec.type == "integer" -%}
72
+ {{- "number" }}
73
+ {%- elif param_spec.type == "boolean" -%}
74
+ {{- "boolean" }}
75
+ {%- elif param_spec.type == "object" -%}
76
+ {%- if param_spec.properties -%}
77
+ {{- "{\n" }}
78
+ {%- for prop_name, prop_spec in param_spec.properties.items() -%}
79
+ {{- prop_name -}}
80
+ {%- if prop_name not in (param_spec.required or []) -%}
81
+ {{- "?" }}
82
+ {%- endif -%}
83
+ {{- ": " }}
84
+ {{ render_typescript_type(prop_spec, param_spec.required or []) }}
85
+ {%- if not loop.last -%}
86
+ {{-", " }}
87
+ {%- endif -%}
88
+ {%- endfor -%}
89
+ {{- "}" }}
90
+ {%- else -%}
91
+ {{- "object" }}
92
+ {%- endif -%}
93
+ {%- else -%}
94
+ {{- "any" }}
95
+ {%- endif -%}
96
+ {%- endmacro -%}
97
+
98
+ {%- macro render_tools(tools) -%}
99
+ {%- for tool in tools %}
100
+ {{- "// " + tool.description + "\n" }}
101
+ {{- "type "+ tool.name + " = " }}
102
+ {%- if tool.parameters and tool.parameters.properties %}
103
+ {{- "(_: {\n" }}
104
+ {%- for param_name, param_spec in tool.parameters.properties.items() %}
105
+ {%- if param_spec.description %}
106
+ {{- "// " + param_spec.description + "\n" }}
107
+ {%- endif %}
108
+ {{- param_name }}
109
+ {%- if param_name not in (tool.parameters.required or []) -%}
110
+ {{- "?" }}
111
+ {%- endif -%}
112
+ {{- ": " }}
113
+ {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
114
+ {%- if param_spec.default is defined -%}
115
+ {%- if param_spec.enum %}
116
+ {{- ", // default: " + param_spec.default }}
117
+ {%- elif param_spec.oneOf %}
118
+ {{- "// default: " + param_spec.default }}
119
+ {%- else %}
120
+ {{- ", // default: " + param_spec.default|tojson }}
121
+ {%- endif -%}
122
+ {%- endif -%}
123
+ {%- if not loop.last %}
124
+ {{- ",\n" }}
125
+ {%- else %}
126
+ {{- "\n" }}
127
+ {%- endif -%}
128
+ {%- endfor %}
129
+ {{- "}) => any;" }}
130
+ {%- else -%}
131
+ {{- "() => any;" }}
132
+ {%- endif -%}
133
+ {%- if not loop.last -%}
134
+ {{- "\n" }}
135
+ {%- endif -%}
136
+ {%- endfor %}
137
+ {%- endmacro -%}
138
+
139
+ {{ bos_token }}
140
+
141
+ {%- set system_token = '<|system_start|>' -%}
142
+ {%- set end_system_token = '<|system_end|>' -%}
143
+ {%- set developer_token = '<|developer_start|>' -%}
144
+ {%- set end_developer_token = '<|developer_end|>' -%}
145
+ {%- set user_token = '<|user_start|>' -%}
146
+ {%- set end_user_token = '<|user_end|>' -%}
147
+ {%- set assistant_token = '<|assistant_start|>' -%}
148
+ {%- set end_assistant_token = '<|assistant_end|>' -%}
149
+ {%- set inner_token = '<|inner_prefix|>' -%}
150
+ {%- set outer_token = '<|inner_suffix|>' -%}
151
+ {%- set tool_calls_token = '<|tools_prefix|>' -%}
152
+ {%- set end_tool_calls_token = '<|tools_suffix|>' -%}
153
+
154
+ {%- set ns = namespace(in_assistant=false, in_tool=false, in_inner=false, assistant_format=none) -%}
155
+
156
+ {%- if messages and messages[0].role == 'system' -%}
157
+ {%- if "content" in messages[0] -%}
158
+ {%- if messages[0].content is string -%}
159
+ {{ system_token + messages[0].content + end_system_token }}
160
+ {%- elif messages[0].content is mapping and "text" in messages[0].content -%}
161
+ {{ system_token + messages[0].content.text + end_system_token }}
162
+ {%- else -%}
163
+ {{- raise_exception("Invalid system message") -}}
164
+ {%- endif -%}
165
+ {%- else -%}
166
+ {{- raise_exception("Invalid system message") -}}
167
+ {%- endif -%}
168
+ {%- set loop_messages = messages[1:] -%}
169
+ {%- else -%}
170
+ {{ system_token + 'You are Apertus, a helpful assistant created by the SwissAI initiative.\nKnowledge cutoff: 2024-04\nCurrent date: ' + strftime_now('%Y-%m-%d') + end_system_token }}
171
+ {%- set loop_messages = messages -%}
172
+ {%- endif -%}
173
+
174
+ {{ developer_token + 'Deliberation: ' }}
175
+ {%- if enable_thinking is defined and enable_thinking -%}
176
+ {{ 'enabled\n' }}
177
+ {%- else -%}
178
+ {{ 'disabled\n' }}
179
+ {%- endif -%}
180
+ {%- if tools is defined and tools -%}
181
+ {{ 'Tool Capabilities:\n' + render_tools(tools) }}
182
+ {%- else -%}
183
+ {{ 'Tool Capabilities: disabled' }}
184
+ {%- endif -%}
185
+ {{ end_developer_token }}
186
+
187
+ {%- for message in loop_messages -%}
188
+ {%- if message.role == 'user' -%}
189
+ {%- set ns.in_inner = false -%}
190
+ {%- if ns.in_tool -%}
191
+ {{ ']' }}
192
+ {%- set ns.in_tool = false -%}
193
+ {%- endif -%}
194
+ {%- if ns.in_assistant -%}
195
+ {{ end_assistant_token }}
196
+ {%- set ns.in_assistant = false -%}
197
+ {%- endif -%}
198
+ {%- if "content" in message -%}
199
+ {{ user_token }}
200
+ {%- if message.content is string -%}
201
+ {{ message.content }}
202
+ {%- elif message.content is mapping and "parts" in message.content -%}
203
+ {%- set parts = message.content.parts -%}
204
+ {%- for part in parts -%}
205
+ {%- if part.type == "text" -%}
206
+ {{ part.text }}
207
+ {%- else -%}
208
+ {{- raise_exception("Invalid user part: " + part.type) -}}
209
+ {%- endif -%}
210
+ {%- endfor -%}
211
+ {%- else -%}
212
+ {{- raise_exception("Invalid user message: " + message.role) -}}
213
+ {%- endif -%}
214
+ {{ end_user_token }}
215
+ {%- endif -%}
216
+ {%- elif message.role == 'assistant' -%}
217
+ {%- if not ns.in_assistant -%}
218
+ {{ assistant_token }}
219
+ {%- set ns.in_assistant = true -%}
220
+ {%- endif -%}
221
+ {%- if "content" in message -%}
222
+ {%- if message.content is string and (ns.assistant_format is none or ns.assistant_format == "string") -%}
223
+ {%- if ns.in_tool -%}
224
+ {{ ']' }}
225
+ {%- set ns.in_tool = false -%}
226
+ {%- endif -%}
227
+ {%- set ns.assistant_format = "string" -%}
228
+ {{ message.content }}
229
+ {%- elif message.content is mapping and "blocks" in message.content and (ns.assistant_format is none or ns.assistant_format == "mapping") -%}
230
+ {%- set ns.assistant_format = "mapping" -%}
231
+ {%- set blocks = message.content.blocks -%}
232
+ {%- for block in blocks -%}
233
+ {%- if block.type == 'thoughts' -%}
234
+ {%- if ns.in_tool -%}
235
+ {{ ']' }}
236
+ {%- set ns.in_tool = false -%}
237
+ {%- endif -%}
238
+ {%- if not ns.in_inner -%}
239
+ {%- set ns.in_inner = true -%}
240
+ {{ inner_token }}
241
+ {%- endif -%}
242
+ {{ block.text }}
243
+ {%- elif block.type == 'tool_calls' -%}
244
+ {%- if ns.in_tool -%}
245
+ {{ ']' }}
246
+ {%- set ns.in_tool = false -%}
247
+ {%- endif -%}
248
+ {%- if ns.in_inner and not loop.first and block.calls|length == 1 and block.calls[0].name == 'display_answers' -%}
249
+ {%- set ns.in_inner = false -%}
250
+ {{ outer_token }}
251
+ {%- endif -%}
252
+ {{ tool_calls_token + '[' }}
253
+ {%- for tool_call in block.calls -%}
254
+ {{- '{"' + tool_call.name + '": ' + tool_call.arguments + '}' }}
255
+ {%- if not loop.last -%}
256
+ {{- ", " }}
257
+ {%- endif -%}
258
+ {%- endfor -%}
259
+ {{ ']' + end_tool_calls_token }}
260
+ {%- elif block.type == 'tool_outputs' -%}
261
+ {%- if ns.in_tool -%}
262
+ {{- raise_exception("Cannot have both tool outputs as separate messages and tool outputs as blocks") -}}
263
+ {%- endif -%}
264
+ {{ '[' }}
265
+ {%- for tool_output in block.outputs -%}
266
+ {{- tool_output.output }}
267
+ {%- if not loop.last -%}
268
+ {{- ", " }}
269
+ {%- endif -%}
270
+ {%- endfor -%}
271
+ {{- ']' }}
272
+ {%- elif block.type == 'response' -%}
273
+ {%- if ns.in_tool -%}
274
+ {{ ']' }}
275
+ {%- set ns.in_tool = false -%}
276
+ {%- endif -%}
277
+ {%- if (not loop.first and ns.in_inner) or (ns.in_assistant and ns.in_inner) -%}
278
+ {%- set ns.in_inner = false -%}
279
+ {{ outer_token }}
280
+ {%- endif -%}
281
+ {{ block.text }}
282
+ {%- else -%}
283
+ {{- raise_exception("Invalid assistant block type: " + block.type) -}}
284
+ {%- endif -%}
285
+ {%- endfor -%}
286
+ {%- else -%}
287
+ {{- raise_exception("Invalid assistant content") -}}
288
+ {%- endif -%}
289
+ {%- else -%}
290
+ {{- raise_exception("Invalid assistant message") -}}
291
+ {%- endif -%}
292
+ {%- if "tool_calls" in message and message.tool_calls -%}
293
+ {{ tool_calls_token + '[' }}
294
+ {%- for tool_call in message.tool_calls -%}
295
+ {%- if tool_call.type == 'function' -%}
296
+ {%- set function = tool_call.function -%}
297
+ {{- '{"' + function.name + '": ' + function.arguments + '}' }}
298
+ {%- if not loop.last -%}
299
+ {{- ", " }}
300
+ {%- endif -%}
301
+ {%- else -%}
302
+ {{- raise_exception("Invalid tool call type: " + tool_call.type) -}}
303
+ {%- endif -%}
304
+ {%- endfor -%}
305
+ {{ ']' + end_tool_calls_token }}
306
+ {%- endif -%}
307
+ {%- elif message.role == 'tool' -%}
308
+ {%- if not ns.in_assistant -%}
309
+ {{- raise_exception("Tool message outside of assistant") -}}
310
+ {%- endif -%}
311
+ {%- if not ns.in_tool -%}
312
+ {{ '[' }}
313
+ {%- set ns.in_tool = true -%}
314
+ {%- else -%}
315
+ {{ ", "}}
316
+ {%- endif -%}
317
+ {{ message.content }}
318
+ {%- else -%}
319
+ {{- raise_exception("Invalid message role") -}}
320
+ {%- endif -%}
321
+ {%- endfor -%}
322
+ {%- if ns.in_tool -%}
323
+ {{ ']' }}
324
+ {%- endif -%}
325
+ {%- if add_generation_prompt -%}
326
+ {{ assistant_token }}
327
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ApertusForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 68,
10
+ "head_dim": 32,
11
+ "hidden_act": "xielu",
12
+ "hidden_dropout": 0.0,
13
+ "hidden_size": 8,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 32,
16
+ "max_position_embeddings": 65536,
17
+ "mlp_bias": false,
18
+ "model_type": "apertus",
19
+ "num_attention_heads": 8,
20
+ "num_hidden_layers": 2,
21
+ "num_key_value_heads": 4,
22
+ "pad_token_id": 3,
23
+ "post_norm": false,
24
+ "qk_norm": true,
25
+ "rms_norm_eps": 1e-05,
26
+ "rope_scaling": {
27
+ "factor": 8.0,
28
+ "high_freq_factor": 4.0,
29
+ "low_freq_factor": 1.0,
30
+ "original_max_position_embeddings": 8192,
31
+ "rope_type": "llama3",
32
+ "type": "llama3"
33
+ },
34
+ "rope_theta": 12000000,
35
+ "tie_word_embeddings": false,
36
+ "transformers_version": "4.56.1",
37
+ "use_cache": false,
38
+ "vocab_size": 131072
39
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 2,
7
+ 68,
8
+ 72
9
+ ],
10
+ "transformers_version": "4.56.1"
11
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a5a95fb0b2ab7eee9b21c6c8cc250ed0321460094c89ee6d69cd365a2701e16
3
+ size 4224456
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|assistant_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb201fb226cde11f66c3cf51c5344fb37b1611f00c21e75c324546d854eff2e1
3
+ size 17078480
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff