Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

.gitattributes +1 -0
README.md +169 -0
chat_template.jinja +327 -0
config.json +73 -0
generation_config.json +10 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +843 -0
recipe.yaml +27 -0
special_tokens_map.json +30 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,169 @@

+---
+license: apache-2.0
+base_model:
+- swiss-ai/Apertus-8B-Instruct-2509
+pipeline_tag: text-generation
+library_name: transformers
+tags:
+  - multilingual
+  - compliant
+  - swiss-ai
+  - apertus
+extra_gated_prompt: "### Apertus LLM Acceptable Use Policy  \n(1.0 | September 1, 2025)\n\"Agreement\" The Swiss National AI Institute (SNAI) is a partnership between the two Swiss Federal Institutes of Technology, ETH Zurich and EPFL. \n\nBy using the Apertus LLM you agree to indemnify, defend, and hold harmless ETH Zurich and EPFL against any third-party claims arising from your use of Apertus LLM. \n\nThe training data and the Apertus LLM may contain or generate information that directly or indirectly refers to an identifiable individual (Personal Data). You process Personal Data as independent controller in accordance with applicable data protection law. SNAI will regularly provide a file with hash values for download which you can apply as an output filter to your use of our Apertus LLM. The file reflects data protection deletion requests which have been addressed to SNAI as the developer of the Apertus LLM. It allows you to remove Personal Data contained in the model output. We strongly advise downloading and applying this output filter from SNAI every six months following the release of the model.  "
+extra_gated_fields:
+  Your Name: text
+  Country: country
+  Affiliation: text
+  geo: ip_location
+  By clicking Submit below I accept the terms of use: checkbox
+extra_gated_button_content: Submit
+---
+# Apertus
+![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/6639f08490b7db8dcbf1a2aa/YKux3SpTciL4O60L3Ol-6.jpeg)
+##  Table of Contents
+1. [Model Summary](#model-summary)
+2. [How to use](#how-to-use)
+3. [Evaluation](#evaluation)
+4. [Training](#training)
+5. [Limitations](#limitations)
+6. [Legal Aspects](#legal-aspects)
+## Model Summary
+Apertus is a family of language models with 70B and 8B parameters, designed to push the boundaries of fully open, multilingual, and transparent models.
+The model supports over 1000 languages and long context, uses only fully compliant and open training data, and achieves performance comparable to models trained behind closed doors.
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/654baf61d625e083383dfd00/gKDv_6dpIpvmgyquenbXt.png)
+The model is a decoder-only transformer, pretrained on 15T tokens with a staged curriculum of web, code and math data. The model uses a new xIELU activation function and is trained from scratch with the AdEMAMix optimizer. Post-training included supervised fine-tuning and alignment via QRPO.
+### Key features
+- **Fully open model**: open weights + open data + full training details including all data and training recipes
+- **Massively Multilingual**: 1811 natively supported languages
+- **Compliant**: Apertus is trained while respecting the opt-out consent of data owners (even retrospectively) and avoiding memorization of training data
+For more details refer to our [technical report](https://github.com/swiss-ai/apertus-tech-report/blob/main/Apertus_Tech_Report.pdf)
+## How to use
+The modeling code for Apertus is available in transformers `v4.56.0`, so make sure to upgrade your transformers version. You can also load the model with the latest `vLLM` which uses transformers as a backend.
+```bash
+pip install -U transformers
+```
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# NOTE(review): this id is the repository listed under `base_model` in the front matter;
+# confirm whether the example should load this repository's checkpoint instead.
+model_name = "swiss-ai/Apertus-8B-Instruct-2509"
+device = "cuda"  # for GPU usage or "cpu" for CPU usage
+# Load the tokenizer and the model, then move the model to the selected device
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+).to(device)
+# Prepare the model input: a single-turn conversation holding the user prompt
+prompt = "Give me a brief explanation of gravity in simple terms."
+messages_think = [
+    {"role": "user", "content": prompt}
+]
+# Render the conversation to a prompt string (tokenize=False) and append the
+# assistant start marker (add_generation_prompt=True) so the model answers next
+text = tokenizer.apply_chat_template(
+    messages_think,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+# add_special_tokens=False: the chat template already emits the BOS token itself,
+# so the tokenizer is told not to add special tokens a second time
+model_inputs = tokenizer([text], return_tensors="pt", add_special_tokens=False).to(model.device)
+# Generate the output (at most 32768 new tokens)
+generated_ids = model.generate(**model_inputs, max_new_tokens=32768)
+# Drop the prompt tokens so only the newly generated continuation is decoded
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :]
+print(tokenizer.decode(output_ids, skip_special_tokens=True))
+```
+>[!TIP]
+> We recommend setting `temperature=0.8` and `top_p=0.9` in the sampling parameters.
+### Long context processing
+Apertus by default supports a context length up to 65,536 tokens.
+### Agentic Usage
+Apertus supports tool use.
+### vLLM and SGLang
+You can use vLLM and SGLang to deploy the model in an API compatible with OpenAI format.
+## Evaluation
+In this section, we report the evaluation results of the Apertus model.
+### Base Pre-Trained Model
+- see [Apertus_Tech_Report.pdf](https://github.com/swiss-ai/apertus-tech-report/blob/main/Apertus_Tech_Report.pdf)
+### Instruction Model
+- see [Apertus_Tech_Report.pdf](https://github.com/swiss-ai/apertus-tech-report/blob/main/Apertus_Tech_Report.pdf)
+## Training
+### Model
+- **Architecture:** Transformer decoder
+- **Pretraining tokens:** 15T
+- **Precision:** bfloat16
+### Software & hardware
+- **GPUs:** 4096 GH200
+- **Training Framework:** [Megatron-LM](https://github.com/swiss-ai/Megatron-LM)
+- ...
+### Open resources
+All elements used in the training process are made openly available:
+- **Training data reconstruction scripts:** [github.com/swiss-ai/pretrain-data](https://github.com/swiss-ai/pretrain-data)
+- The training intermediate checkpoints are available on the different branches of this same repository
+## Limitations
+Apertus can produce text on a variety of topics, but the generated content may not always be factually accurate, logically consistent, or free from biases present in the training data. These models should be used as assistive tools rather than definitive sources of information. Users should always verify important information and critically evaluate any generated content.
+## Legal Aspects
+#### EU AI Act Transparency Documentation and Code of Practice
+- [Apertus_EU_Public_Summary.pdf](https://huggingface.co/swiss-ai/Apertus-70B-2509/blob/main/Apertus_EU_Public_Summary.pdf)
+- [Apertus_EU_Code_of_Practice.pdf](https://huggingface.co/swiss-ai/Apertus-70B-2509/blob/main/Apertus_EU_Code_of_Practice.pdf)
+#### Data Protection and Copyright Requests
+For removal requests of personally identifiable information (PII) or of copyrighted content, please contact the respective dataset owners or us directly:
+- [email protected]
+- [email protected]
+#### Output Filter for PII
+- Currently no output filter is provided.
+- Please check this site regularly for an output filter that can be used on top of the Apertus LLM. The filter reflects data protection deletion requests which have been addressed to us as the developer of the Apertus LLM. It allows you to remove Personal Data contained in the model output. We strongly advise downloading and applying this output filter from this site every six months.
+## Contact
+To contact us, please send an email to
+[email protected]
+## Citation
+```bibtex
+@misc{swissai2025apertus,
+  title={{Apertus: Democratizing Open and Compliant LLMs for Global Language Environments}},
+  author={Apertus Team},
+  year={2025},
+  howpublished={\url{https://huggingface.co/swiss-ai/Apertus-70B-2509}}
+}
+```

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,327 @@

+{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
+    {#-
+        Render a JSON-Schema-style parameter spec as a TypeScript-like type string.
+
+        param_spec      : mapping describing one parameter (keys read here: type, items,
+                          nullable, oneOf, enum, properties, required, description, default).
+        required_params : only forwarded to recursive calls; not otherwise read in this macro.
+        is_nullable     : accepted for interface compatibility; not read in this macro.
+
+        Branch order matters: "array" is tested first, then a list-valued `type`,
+        then `oneOf`, then the scalar types, then "object"; anything else renders as "any".
+        NOTE(review): some tags below deliberately lack `-` whitespace control; they are
+        kept exactly as they were because they affect the rendered prompt text.
+    -#}
+    {%- if param_spec.type == "array" -%}
+        {%- if param_spec['items'] -%}
+            {%- if param_spec['items']['type'] == "string" -%}
+                {{- "string[]" }}
+            {%- elif param_spec['items']['type'] == "number" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "integer" -%}
+                {{- "number[]" }}
+            {%- elif param_spec['items']['type'] == "boolean" -%}
+                {{- "boolean[]" }}
+            {%- else -%}
+                {#- Non-primitive item type: recurse, but collapse long or ambiguous results to any[] -#}
+                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
+                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
+                    {{- "any[]" }}
+                {%- else -%}
+                    {{- inner_type + "[]" }}
+                {%- endif -%}
+            {%- endif -%}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- else -%}
+            {{- "any[]" }}
+            {%- if param_spec.nullable -%}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
+        {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
+        {%- if param_spec.type | length > 1 -%}
+            {{- param_spec.type | join(" | ") }}
+        {%- else -%}
+            {{- param_spec.type[0] }}
+        {%- endif -%}
+    {%- elif param_spec.oneOf -%}
+        {#- Handle oneOf schemas - check for complex unions and fallback to any #}
+        {#-
+            A plain `set` inside a for loop is scoped to the loop and is not visible
+            afterwards, so the flag lives on a namespace object whose attribute
+            assignment does propagate out of the loop.
+        -#}
+        {%- set oneof_state = namespace(has_object_variants=false) -%}
+        {%- for variant in param_spec.oneOf -%}
+            {%- if variant.type == "object" -%}
+                {%- set oneof_state.has_object_variants = true -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- if oneof_state.has_object_variants and param_spec.oneOf|length > 1 -%}
+            {{- "any" }}
+        {%- else -%}
+            {%- for variant in param_spec.oneOf -%}
+                {{- render_typescript_type(variant, required_params) -}}
+                {%- if variant.description %}
+                    {{- "// " + variant.description }}
+                {%- endif -%}
+                {%- if variant.default is defined %}
+                    {{ "// default: " + variant.default|tojson }}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- " | " }}
+                {% endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "string" -%}
+        {%- if param_spec.enum -%}
+            {#- Enum values are rendered as a union of double-quoted literals -#}
+            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
+        {%- else -%}
+            {{- "string" }}
+            {%- if param_spec.nullable %}
+                {{- " | null" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- elif param_spec.type == "number" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "integer" -%}
+        {{- "number" }}
+    {%- elif param_spec.type == "boolean" -%}
+        {{- "boolean" }}
+    {%- elif param_spec.type == "object" -%}
+        {%- if param_spec.properties -%}
+            {#- Object with known properties: one `name[?]: type` entry each; `?` marks names absent from `required` -#}
+            {{- "{\n" }}
+            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
+                {{- prop_name -}}
+                {%- if prop_name not in (param_spec.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{ render_typescript_type(prop_spec, param_spec.required or []) }}
+                {%- if not loop.last -%}
+                    {{-", " }}
+                {%- endif -%}
+            {%- endfor -%}
+            {{- "}" }}
+        {%- else -%}
+            {{- "object" }}
+        {%- endif -%}
+    {%- else -%}
+        {{- "any" }}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro render_tools(tools) -%}
+    {#-
+        Render a list of tool definitions as TypeScript-like type declarations.
+
+        For each tool this emits a `// <description>` line followed by
+        `type <name> = (_: { ...params... }) => any;`, or `type <name> = () => any;`
+        when the tool has no parameter properties. Each parameter is preceded by its
+        description comment (if any), suffixed with `?` when it is not listed in
+        `tool.parameters.required`, and followed by a `// default: ...` note when a
+        default is defined. Tools are separated by a single newline.
+
+        Reads `tool.name`, `tool.description` and `tool.parameters` directly on each item.
+    -#}
+    {%- for tool in tools %}
+        {{- "// " + tool.description + "\n" }}
+        {{- "type "+ tool.name + " = " }}
+        {%- if tool.parameters and tool.parameters.properties %}
+            {{- "(_: {\n" }}
+            {%- for param_name, param_spec in tool.parameters.properties.items() %}
+                {%- if param_spec.description %}
+                    {{- "// " + param_spec.description + "\n" }}
+                {%- endif %}
+                {{- param_name }}
+                {%- if param_name not in (tool.parameters.required or []) -%}
+                    {{- "?" }}
+                {%- endif -%}
+                {{- ": " }}
+                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
+                {%- if param_spec.default is defined -%}
+                    {#-
+                        String defaults are shown verbatim (unquoted) in the enum/oneOf
+                        branches. Non-string defaults (numbers, booleans, null) cannot be
+                        concatenated to a string directly, so they are serialised with
+                        tojson first, matching the generic branch below.
+                    -#}
+                    {%- set default_text = param_spec.default if param_spec.default is string else (param_spec.default|tojson) -%}
+                    {%- if param_spec.enum %}
+                        {{- ", // default: " + default_text }}
+                    {%- elif param_spec.oneOf %}
+                        {{- "// default: " + default_text }}
+                    {%- else %}
+                        {{- ", // default: " + param_spec.default|tojson }}
+                    {%- endif -%}
+                {%- endif -%}
+                {%- if not loop.last %}
+                    {{- ",\n" }}
+                {%- else %}
+                    {{- "\n" }}
+                {%- endif -%}
+            {%- endfor %}
+            {{- "}) => any;" }}
+        {%- else -%}
+            {{- "() => any;" }}
+        {%- endif -%}
+        {%- if not loop.last -%}
+            {{- "\n" }}
+        {%- endif -%}
+    {%- endfor %}
+{%- endmacro -%}
+{#- Template preamble: emit the BOS token, then define the marker strings and the state used below. -#}
+{{ bos_token }}
+{#- Start/end markers that wrap each section of the rendered conversation -#}
+{%- set system_token = '<|system_start|>' -%}
+{%- set end_system_token = '<|system_end|>' -%}
+{%- set developer_token = '<|developer_start|>' -%}
+{%- set end_developer_token = '<|developer_end|>' -%}
+{%- set user_token = '<|user_start|>' -%}
+{%- set end_user_token = '<|user_end|>' -%}
+{%- set assistant_token = '<|assistant_start|>' -%}
+{%- set end_assistant_token = '<|assistant_end|>' -%}
+{#- inner_token is emitted before 'thoughts' blocks; outer_token is emitted when leaving them -#}
+{%- set inner_token = '<|inner_prefix|>' -%}
+{%- set outer_token = '<|inner_suffix|>' -%}
+{%- set tool_calls_token = '<|tools_prefix|>' -%}
+{%- set end_tool_calls_token = '<|tools_suffix|>' -%}
+{#-
+    Mutable state shared across loop iterations (a namespace is needed because plain
+    `set` assignments made inside a loop are not visible outside it):
+      in_assistant     : an assistant turn has been opened and not yet closed
+      in_tool          : a '[' for role == 'tool' messages has been opened and not yet closed
+      in_inner         : inner_token has been emitted and outer_token has not
+      assistant_format : "string" or "mapping" once the first assistant content has been seen
+-#}
+{%- set ns = namespace(in_assistant=false, in_tool=false, in_inner=false, assistant_format=none) -%}
+{#-
+    System section. If the first message has role 'system', its content (a string, or a
+    mapping with a "text" key) is wrapped in the system markers and the remaining messages
+    are iterated later; any other shape raises "Invalid system message". Otherwise a
+    default system prompt containing today's date is emitted and all messages are iterated.
+-#}
+{%- if messages and messages[0].role == 'system' -%}
+    {%- if "content" in messages[0] -%}
+        {%- if messages[0].content is string -%}
+            {{ system_token + messages[0].content + end_system_token }}
+        {%- elif messages[0].content is mapping and "text" in messages[0].content -%}
+            {{ system_token + messages[0].content.text + end_system_token }}
+        {%- else -%}
+            {{- raise_exception("Invalid system message") -}}
+        {%- endif -%}
+    {%- else -%}
+        {{- raise_exception("Invalid system message") -}}
+    {%- endif -%}
+    {#- The system message has been consumed; skip it in the main loop -#}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {#- strftime_now is expected to be supplied by the rendering environment; it is not defined in this template -#}
+    {{ system_token + 'You are Apertus, a helpful assistant created by the SwissAI initiative.\nKnowledge cutoff: 2024-04\nCurrent date: ' + strftime_now('%Y-%m-%d') + end_system_token }}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{#-
+    Developer section, always emitted: a "Deliberation: enabled|disabled" line driven by
+    the optional `enable_thinking` variable, followed by either the rendered tool
+    declarations or the literal "Tool Capabilities: disabled".
+-#}
+{{ developer_token + 'Deliberation: ' }}
+{%- if enable_thinking is defined and enable_thinking -%}
+    {{ 'enabled\n' }}
+{%- else -%}
+    {{ 'disabled\n' }}
+{%- endif -%}
+{%- if tools is defined and tools -%}
+    {#- Tool declarations are produced by the render_tools macro defined earlier in this template -#}
+    {{ 'Tool Capabilities:\n' + render_tools(tools) }}
+{%- else -%}
+    {{ 'Tool Capabilities: disabled' }}
+{%- endif -%}
+{{ end_developer_token }}
+{#-
+    Main loop: render each remaining message according to its role.
+    'user' closes any open tool list / assistant turn and emits a user section;
+    'assistant' opens an assistant turn if none is open and appends content;
+    'tool' appends a tool result inside the currently open assistant turn.
+    Any other role raises "Invalid message role".
+-#}
+{%- for message in loop_messages -%}
+    {%- if message.role == 'user' -%}
+        {#- A user message ends whatever the assistant was doing: reset the inner flag, close an open tool list, close the assistant turn -#}
+        {%- set ns.in_inner = false -%}
+        {%- if ns.in_tool -%}
+            {{ ']' }}
+            {%- set ns.in_tool = false -%}
+        {%- endif -%}
+        {%- if ns.in_assistant -%}
+            {{ end_assistant_token }}
+            {%- set ns.in_assistant = false -%}
+        {%- endif -%}
+        {#- A user message without a "content" key emits nothing (no markers, no error) -#}
+        {%- if "content" in message -%}
+            {{ user_token }}
+            {%- if message.content is string -%}
+                {{ message.content }}
+            {%- elif message.content is mapping and "parts" in message.content -%}
+                {#- Structured content: only parts of type "text" are accepted; their text is concatenated -#}
+                {%- set parts = message.content.parts -%}
+                {%- for part in parts -%}
+                    {%- if part.type == "text" -%}
+                        {{ part.text }}
+                    {%- else -%}
+                        {{- raise_exception("Invalid user part: " + part.type) -}}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- else -%}
+                {{- raise_exception("Invalid user message: " + message.role) -}}
+            {%- endif -%}
+            {{ end_user_token }}
+        {%- endif -%}
+    {%- elif message.role == 'assistant' -%}
+        {#- Consecutive assistant/tool messages share one assistant turn: the start marker is emitted only once -#}
+        {%- if not ns.in_assistant -%}
+            {{ assistant_token }}
+            {%- set ns.in_assistant = true -%}
+        {%- endif -%}
+        {%- if "content" in message -%}
+            {#-
+                The first assistant content seen fixes ns.assistant_format ("string" or
+                "mapping"); an assistant message in the other format afterwards falls
+                through to "Invalid assistant content".
+            -#}
+            {%- if message.content is string and (ns.assistant_format is none or ns.assistant_format == "string") -%}
+                {%- if ns.in_tool -%}
+                    {{ ']' }}
+                    {%- set ns.in_tool = false -%}
+                {%- endif -%}
+                {%- set ns.assistant_format = "string" -%}
+                {{ message.content }}
+            {%- elif message.content is mapping and "blocks" in message.content and (ns.assistant_format is none or ns.assistant_format == "mapping") -%}
+                {%- set ns.assistant_format = "mapping" -%}
+                {%- set blocks = message.content.blocks -%}
+                {%- for block in blocks -%}
+                    {%- if block.type == 'thoughts' -%}
+                        {%- if ns.in_tool -%}
+                            {{ ']' }}
+                            {%- set ns.in_tool = false -%}
+                        {%- endif -%}
+                        {#- inner_token is emitted once; further 'thoughts' blocks continue the same inner section -#}
+                        {%- if not ns.in_inner -%}
+                            {%- set ns.in_inner = true -%}
+                            {{ inner_token }}
+                        {%- endif -%}
+                        {{ block.text }}
+                    {%- elif block.type == 'tool_calls' -%}
+                        {%- if ns.in_tool -%}
+                            {{ ']' }}
+                            {%- set ns.in_tool = false -%}
+                        {%- endif -%}
+                        {#- The inner section is closed before a tool-call block only when that block is a single 'display_answers' call and is not the first block -#}
+                        {%- if ns.in_inner and not loop.first and block.calls|length == 1 and block.calls[0].name == 'display_answers' -%}
+                            {%- set ns.in_inner = false -%}
+                            {{ outer_token }}
+                        {%- endif -%}
+                        {{ tool_calls_token + '[' }}
+                        {#- Each call renders as {"<name>": <arguments>}; arguments is concatenated as-is, so it is presumably an already-serialised string (TODO confirm) -#}
+                        {%- for tool_call in block.calls -%}
+                            {{- '{"' + tool_call.name + '": ' + tool_call.arguments + '}' }}
+                            {%- if not loop.last -%}
+                                {{- ", " }}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {{ ']' + end_tool_calls_token }}
+                    {%- elif block.type == 'tool_outputs' -%}
+                        {#- Block-style tool outputs cannot be mixed with an open list of role == 'tool' messages -#}
+                        {%- if ns.in_tool -%}
+                            {{- raise_exception("Cannot have both tool outputs as separate messages and tool outputs as blocks") -}}
+                        {%- endif -%}
+                        {{ '[' }}
+                        {%- for tool_output in block.outputs -%}
+                            {{- tool_output.output }}
+                            {%- if not loop.last -%}
+                                {{- ", " }}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {{- ']' }}
+                    {%- elif block.type == 'response' -%}
+                        {%- if ns.in_tool -%}
+                            {{ ']' }}
+                            {%- set ns.in_tool = false -%}
+                        {%- endif -%}
+                        {#- Leave the inner section (emit outer_token) before the visible response text -#}
+                        {%- if (not loop.first and ns.in_inner) or (ns.in_assistant and ns.in_inner) -%}
+                            {%- set ns.in_inner = false -%}
+                            {{ outer_token }}
+                        {%- endif -%}
+                        {{ block.text }}
+                    {%- else -%}
+                        {{- raise_exception("Invalid assistant block type: " + block.type) -}}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- else -%}
+                {{- raise_exception("Invalid assistant content") -}}
+            {%- endif -%}
+        {%- else -%}
+            {{- raise_exception("Invalid assistant message") -}}
+        {%- endif -%}
+        {#- Message-level tool_calls (entries with type 'function' and a `function` mapping) are rendered after the content, in the same {"<name>": <arguments>} form -#}
+        {%- if "tool_calls" in message and message.tool_calls -%}
+            {{ tool_calls_token + '[' }}
+            {%- for tool_call in message.tool_calls -%}
+                {%- if tool_call.type == 'function' -%}
+                    {%- set function = tool_call.function -%}
+                    {{- '{"' + function.name + '": ' + function.arguments + '}' }}
+                    {%- if not loop.last -%}
+                        {{- ", " }}
+                    {%- endif -%}
+                {%- else -%}
+                    {{- raise_exception("Invalid tool call type: " + tool_call.type) -}}
+                {%- endif -%}
+            {%- endfor -%}
+            {{ ']' + end_tool_calls_token }}
+        {%- endif -%}
+    {%- elif message.role == 'tool' -%}
+        {#- Tool results are only valid inside an open assistant turn -#}
+        {%- if not ns.in_assistant -%}
+            {{- raise_exception("Tool message outside of assistant") -}}
+        {%- endif -%}
+        {#- The first tool message opens a '[' list; later ones are comma-separated. The closing ']' is emitted by whichever branch next sees ns.in_tool set -#}
+        {%- if not ns.in_tool -%}
+            {{ '[' }}
+            {%- set ns.in_tool = true -%}
+        {%- else -%}
+            {{ ", "}}
+        {%- endif -%}
+        {{ message.content }}
+    {%- else -%}
+        {{- raise_exception("Invalid message role") -}}
+    {%- endif -%}
+{%- endfor -%}
+{#- Epilogue: close a tool-result list left open by trailing role == 'tool' messages -#}
+{%- if ns.in_tool -%}
+    {{ ']' }}
+{%- endif -%}
+{#- When a generation prompt is requested, end with the assistant start marker; this is emitted regardless of ns.in_assistant -#}
+{%- if add_generation_prompt -%}
+    {{ assistant_token }}
+{%- endif -%}

config.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "architectures": [
+    "ApertusForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "dtype": "bfloat16",
+  "eos_token_id": 68,
+  "hidden_act": "xielu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 21504,
+  "max_position_embeddings": 65536,
+  "mlp_bias": false,
+  "model_type": "apertus",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_token_id": 3,
+  "post_norm": false,
+  "qk_norm": true,
+  "quantization_config": {
+    "config_groups": {
+      "group_0": {
+        "format": "pack-quantized",
+        "input_activations": null,
+        "output_activations": null,
+        "targets": [
+          "Linear"
+        ],
+        "weights": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": false,
+          "group_size": 32,
+          "num_bits": 4,
+          "observer": "mse",
+          "observer_kwargs": {},
+          "strategy": "group",
+          "symmetric": true,
+          "type": "int"
+        }
+      }
+    },
+    "format": "pack-quantized",
+    "global_compression_ratio": null,
+    "ignore": [
+      "lm_head"
+    ],
+    "kv_cache_scheme": null,
+    "quant_method": "compressed-tensors",
+    "quantization_status": "compressed",
+    "sparsity_config": {},
+    "transform_config": {},
+    "version": "0.10.3.dev47+ge463fe6"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3",
+    "type": "llama3"
+  },
+  "rope_theta": 12000000,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.0.dev0",
+  "use_cache": false,
+  "vocab_size": 131072
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": [
+    2,
+    68,
+    72
+  ],
+  "transformers_version": "4.57.0.dev0"
+}

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:925ae36ef0fa4fc062eeb9330e288a1c55396a7e49f2b35153fa595d2f496154
+size 4994725960

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1fea0f54b6a75c64189e5e311dae6860c9c8cd74409d166e3588e49f41851070
+size 1079272416

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,843 @@

+{
+  "metadata": {
+    "total_parameters": 2164535744,
+    "total_size": 6073904384
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.1.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.10.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.11.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.12.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.13.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.14.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.15.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.16.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.17.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.18.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.19.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.2.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.20.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.21.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.22.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.23.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.24.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.25.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.26.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.27.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.28.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.29.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.3.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.30.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.30.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.31.attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.act_fn.alpha_n": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.act_fn.alpha_p": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.act_fn.beta": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.act_fn.eps": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight_scale": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.31.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.31.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.31.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.4.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.5.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.6.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.7.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.8.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.9.attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.act_fn.alpha_n": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.act_fn.alpha_p": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.act_fn.beta": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.act_fn.eps": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight_packed": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight_scale": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight_shape": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}

recipe.yaml ADDED Viewed

	@@ -0,0 +1,27 @@

+quant_stage:
+  quant_modifiers:
+    GPTQModifier:
+      config_groups:
+        group_0:
+          targets: [Linear]
+          weights:
+            num_bits: 4
+            type: int
+            symmetric: true
+            group_size: 32
+            strategy: group
+            block_structure: null
+            dynamic: false
+            actorder: null
+            observer: mse
+            observer_kwargs: {}
+          input_activations: null
+          output_activations: null
+          format: null
+      targets: [Linear]
+      ignore: [lm_head, model.embed_tokens, 're:.*attention_layernorm$', 're:.*feedforward_layernorm$',
+        're:.*act_fn.*', model.norm]
+      sequential_update: true
+      block_size: 128
+      dampening_frac: 0.01
+      offload_hessians: false

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|assistant_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb201fb226cde11f66c3cf51c5344fb37b1611f00c21e75c324546d854eff2e1
+size 17078480

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff