AtAndDev commited on
Commit
787e370
·
verified ·
1 Parent(s): 4eacf8c

Upload MoLA-LM: Mixture of LoRA Adapters Language Model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
+ tags:
5
+ - pytorch
6
+ - mixture-of-experts
7
+ - lora
8
+ - adapter
9
+ - causal-lm
10
+ - text-generation
11
+ language:
12
+ - en
13
+ pipeline_tag: text-generation
14
+ ---
15
+
16
+ Image here
17
+
18
+ # MoLA-LM: Mixture of LoRA Adapters LLM
19
+
20
+ MoLA-LM combines multiple LoRA adapters with an intelligent router to automatically select the best adapter for each input prompt. This approach enables specialized performance across different tasks while maintaining efficiency.
21
+
22
+ Evals are coming...
23
+
24
+ ## Model Details
25
+
26
+ - **Model Type**: Mixture of LoRA Adapters Language Model
27
+ - **Base Model**: Qwen/Qwen3-4B-Thinking-2507
28
+ - **Total Adapters**: 9
29
+ - **Architecture**: Custom MoLAForCausalLM with automatic adapter routing
30
+
31
+ ## Usage
32
+
33
+ ```python
34
+ from transformers import AutoModelForCausalLM, AutoTokenizer
35
+
36
+ # Load the model (trust_remote_code=True is required for custom architecture)
37
+ model = AutoModelForCausalLM.from_pretrained(
38
+ "MoLA-LLM/MoLA-v0.5-9x4b",
39
+ trust_remote_code=True,
40
+ device_map="auto"
41
+ )
42
+ tokenizer = AutoTokenizer.from_pretrained("MoLA-LLM/MoLA-v0.5-9x4b", trust_remote_code=True)
43
+
44
+ # Use like any other language model - adapter selection is automatic
45
+ prompt = "Write a Python function to calculate fibonacci numbers"
46
+ messages = [{"role": "user", "content": prompt}]
47
+ inputs = tokenizer.apply_chat_template(
48
+ messages,
49
+ add_generation_prompt=True,
50
+ tokenize=True,
51
+ return_dict=True,
52
+ return_tensors="pt",
53
+ ).to(model.device)
54
+
55
+ outputs = model.generate(**inputs, max_new_tokens=8192, temperature=.6, do_sample=True)
56
+ response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
57
+
58
+ print(f"Selected LoRA: {model.get_current_lora()}")
59
+ print(response)
60
+ ```
61
+ *You can also use load_in_4bit and load_in_8bit directly when loading!*
62
+
63
+ ## Architecture
64
+
65
+ The MoLA-LM architecture consists of:
66
+
67
+ 1. **Base Model**: Qwen/Qwen3-4B-Thinking-2507
68
+ 2. **Router Network**: A frozen sentence-transformer encoder followed by a one-layer MLP decoder that selects the adapter
69
+ 3. **LoRA Adapters**: 9 task-specific fine-tuned adapters
70
+ 4. **Dynamic Switching**: Automatic adapter application based on input
71
+
72
+ ---
73
+
74
+ ## *Paper coming soon™*
__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MoLA-LM: Mixture of LoRA Adapters Language Model
3
+ """
4
+
5
+ from .configuration_mola_lm import MoLAConfig
6
+ from .modeling_mola_lm import MoLAForCausalLM
7
+
8
+ __all__ = ["MoLAConfig", "MoLAForCausalLM"]
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n<think>\n' }}
86
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MoLAForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mola_lm.MoLAConfig",
7
+ "AutoModelForCausalLM": "modeling_mola_lm.MoLAForCausalLM"
8
+ },
9
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
10
+ "task_labels": [
11
+ "0",
12
+ "1",
13
+ "2",
14
+ "3",
15
+ "4",
16
+ "5",
17
+ "6",
18
+ "7",
19
+ "8"
20
+ ],
21
+ "num_loras": 9,
22
+ "model_type": "mola_lm",
23
+ "transformers_version": "4.36.0"
24
+ }
configuration_mola_lm.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration class for MoLA-LM
3
+ """
4
+
5
+ from transformers import PretrainedConfig
6
+ from typing import Dict, List
7
+
8
# Default task labels: one string label per LoRA adapter ("0" .. "8").
EXPERTS_LIST = [str(expert_id) for expert_id in range(9)]
19
+
20
+
21
class MoLAConfig(PretrainedConfig):
    """Configuration class for the MoLA-LM (Mixture of LoRA Adapters) model.

    Stores the base-model reference, the router sub-configuration, the
    per-adapter LoRA configurations, and the list of task labels (one label
    per LoRA adapter). ``num_loras`` is always derived from ``task_labels``
    so the two cannot drift apart.
    """

    model_type = "mola_lm"

    def __init__(
        self,
        base_model_name_or_path: str = "Qwen/Qwen2.5-3B-Instruct",
        task_labels: Optional[List[str]] = None,
        router_config: Optional[Dict] = None,
        lora_configs: Optional[Dict[str, Dict]] = None,
        **kwargs,
    ):
        """
        Args:
            base_model_name_or_path: Hub id or local path of the base causal LM.
                NOTE(review): default here is Qwen2.5-3B-Instruct while the shipped
                config.json uses Qwen3-4B-Thinking-2507 — the serialized value wins
                on load, but the default looks stale; confirm before changing.
            task_labels: One label per LoRA adapter. Falls back to EXPERTS_LIST
                when None (or empty).
            router_config: Router-network configuration; empty dict when None.
            lora_configs: Per-adapter LoRA configurations keyed by task label;
                empty dict when None.
            **kwargs: Forwarded to PretrainedConfig (license, ids, etc.).
        """
        super().__init__(**kwargs)
        self.base_model_name_or_path = base_model_name_or_path
        self.task_labels = task_labels or EXPERTS_LIST
        self.router_config = router_config or {}
        self.lora_configs = lora_configs or {}
        # Derived, never passed in: keeps num_loras consistent with task_labels.
        self.num_loras = len(self.task_labels)
loras/0/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/0/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "k_proj",
25
+ "v_proj",
26
+ "up_proj",
27
+ "gate_proj",
28
+ "q_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/0/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5188249779649a7bd2fccd0893dbd4d5ba46bf6dbefb4d3aa9c00c48446966ac
3
+ size 66126768
loras/1/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/1/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "k_proj",
25
+ "up_proj",
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
+ "gate_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/1/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b75845af51d525de7d85dd7132eec30f6c4d36761e843449c526cf07daafd3b
3
+ size 66126768
loras/2/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/2/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj",
26
+ "v_proj",
27
+ "gate_proj",
28
+ "up_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a3824581f02be957b6e82217233d3b08b85f93632bcef1c1ff2089bf18f912
3
+ size 66126768
loras/3/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/3/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "up_proj",
25
+ "o_proj",
26
+ "k_proj",
27
+ "down_proj",
28
+ "gate_proj",
29
+ "q_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/3/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c658489b09b599292dd5a0f22a5989323e27860c6612b83ab89140b7cb2e705
3
+ size 66126768
loras/4/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/4/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "q_proj",
25
+ "down_proj",
26
+ "up_proj",
27
+ "gate_proj",
28
+ "k_proj",
29
+ "v_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/4/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccf0ec70d994ce232c05cc1a6ec47980b138f3dad85a910b56b384b00b55b939
3
+ size 66126768
loras/5/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/5/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj",
25
+ "down_proj",
26
+ "k_proj",
27
+ "up_proj",
28
+ "o_proj",
29
+ "gate_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/5/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b692ebde6d90eb0b492fde12dd51c3fca9d73e68eb9a519b01a434124010cd6
3
+ size 66126768
loras/6/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/6/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "gate_proj",
25
+ "down_proj",
26
+ "v_proj",
27
+ "up_proj",
28
+ "q_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/6/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69981b3d52a31dc4631d35e6929ee18489413365570ad287d9bf544d05de10cd
3
+ size 66126768
loras/7/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/7/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "gate_proj",
25
+ "up_proj",
26
+ "down_proj",
27
+ "k_proj",
28
+ "o_proj",
29
+ "q_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/7/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daacefa493abc4a6606c29f2f3c143369f298722066e656fa6c28a1e7bdc88b2
3
+ size 66126768
loras/8/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/8/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "up_proj",
25
+ "v_proj",
26
+ "gate_proj",
27
+ "o_proj",
28
+ "k_proj",
29
+ "down_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/8/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10256aef4d2832e826f1a341849b1dc0ea60e451c74dfdb7fd17df09e8e1b1fd
3
+ size 66126768
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f4a0ea0a2e096f17d2540516d5ad87c17965d0c8478cb91850823855ca164ea
3
+ size 4967217648
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4a211ece672744d6763cc52a55d5fc491292f5865c77f6ec09dc9f72554ab82
3
+ size 3168785724
model.safetensors.index.json ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4045219145,
4
+ "total_size": 8135940388
5
+ },
6
+ "weight_map": {
7
+ "base_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "base_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "base_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "base_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "base_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "base_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "base_model.model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
14
+ "base_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "base_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "base_model.model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
17
+ "base_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
18
+ "base_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
19
+ "base_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
20
+ "base_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
21
+ "base_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
22
+ "base_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
23
+ "base_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "base_model.model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
25
+ "base_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
+ "base_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
+ "base_model.model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
28
+ "base_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
+ "base_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
30
+ "base_model.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
31
+ "base_model.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
32
+ "base_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
33
+ "base_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
34
+ "base_model.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
35
+ "base_model.model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
36
+ "base_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
37
+ "base_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
38
+ "base_model.model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
39
+ "base_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
40
+ "base_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
41
+ "base_model.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
42
+ "base_model.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
43
+ "base_model.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
44
+ "base_model.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
45
+ "base_model.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
46
+ "base_model.model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
47
+ "base_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
+ "base_model.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
49
+ "base_model.model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
50
+ "base_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
51
+ "base_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
52
+ "base_model.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
53
+ "base_model.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
54
+ "base_model.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
55
+ "base_model.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
56
+ "base_model.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
57
+ "base_model.model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
58
+ "base_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
+ "base_model.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
+ "base_model.model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
61
+ "base_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
62
+ "base_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
63
+ "base_model.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
64
+ "base_model.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
65
+ "base_model.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
+ "base_model.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
67
+ "base_model.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "base_model.model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
69
+ "base_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
70
+ "base_model.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
71
+ "base_model.model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
72
+ "base_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
73
+ "base_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
74
+ "base_model.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
75
+ "base_model.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
76
+ "base_model.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
77
+ "base_model.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
78
+ "base_model.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "base_model.model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
80
+ "base_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
+ "base_model.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
+ "base_model.model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
83
+ "base_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
84
+ "base_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
85
+ "base_model.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "base_model.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
87
+ "base_model.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
88
+ "base_model.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
89
+ "base_model.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
90
+ "base_model.model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
91
+ "base_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
92
+ "base_model.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
93
+ "base_model.model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
94
+ "base_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
95
+ "base_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
96
+ "base_model.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
+ "base_model.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
98
+ "base_model.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
99
+ "base_model.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
100
+ "base_model.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
101
+ "base_model.model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
102
+ "base_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
+ "base_model.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
+ "base_model.model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
105
+ "base_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
+ "base_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
+ "base_model.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "base_model.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
+ "base_model.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
+ "base_model.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
+ "base_model.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
+ "base_model.model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
113
+ "base_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
114
+ "base_model.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
115
+ "base_model.model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
116
+ "base_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
117
+ "base_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
118
+ "base_model.model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
119
+ "base_model.model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
120
+ "base_model.model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
121
+ "base_model.model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
+ "base_model.model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
123
+ "base_model.model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
124
+ "base_model.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
125
+ "base_model.model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
126
+ "base_model.model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
127
+ "base_model.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
128
+ "base_model.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
129
+ "base_model.model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "base_model.model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
131
+ "base_model.model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
132
+ "base_model.model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
133
+ "base_model.model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
134
+ "base_model.model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
135
+ "base_model.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
136
+ "base_model.model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
137
+ "base_model.model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
138
+ "base_model.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
+ "base_model.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
+ "base_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
+ "base_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
+ "base_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
143
+ "base_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
144
+ "base_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "base_model.model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
146
+ "base_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
+ "base_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
148
+ "base_model.model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
149
+ "base_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
150
+ "base_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "base_model.model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
152
+ "base_model.model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
153
+ "base_model.model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
+ "base_model.model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
+ "base_model.model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
+ "base_model.model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
157
+ "base_model.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
+ "base_model.model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
+ "base_model.model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
160
+ "base_model.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
+ "base_model.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
+ "base_model.model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
163
+ "base_model.model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
164
+ "base_model.model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
165
+ "base_model.model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
166
+ "base_model.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
167
+ "base_model.model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
168
+ "base_model.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
169
+ "base_model.model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
170
+ "base_model.model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
171
+ "base_model.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
172
+ "base_model.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
173
+ "base_model.model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
174
+ "base_model.model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
175
+ "base_model.model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
176
+ "base_model.model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
177
+ "base_model.model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
178
+ "base_model.model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
179
+ "base_model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
180
+ "base_model.model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
181
+ "base_model.model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
182
+ "base_model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
183
+ "base_model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
184
+ "base_model.model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
185
+ "base_model.model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
186
+ "base_model.model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
187
+ "base_model.model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
188
+ "base_model.model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
189
+ "base_model.model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
190
+ "base_model.model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
191
+ "base_model.model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
192
+ "base_model.model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
193
+ "base_model.model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
194
+ "base_model.model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
195
+ "base_model.model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
196
+ "base_model.model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
197
+ "base_model.model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
198
+ "base_model.model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
199
+ "base_model.model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
200
+ "base_model.model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
201
+ "base_model.model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
202
+ "base_model.model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
203
+ "base_model.model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
204
+ "base_model.model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
205
+ "base_model.model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
+ "base_model.model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
207
+ "base_model.model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
208
+ "base_model.model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
209
+ "base_model.model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
210
+ "base_model.model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
211
+ "base_model.model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
212
+ "base_model.model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
213
+ "base_model.model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
214
+ "base_model.model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
215
+ "base_model.model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
216
+ "base_model.model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
217
+ "base_model.model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
218
+ "base_model.model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
219
+ "base_model.model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
220
+ "base_model.model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
221
+ "base_model.model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
222
+ "base_model.model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
223
+ "base_model.model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
224
+ "base_model.model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
225
+ "base_model.model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
226
+ "base_model.model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
227
+ "base_model.model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
228
+ "base_model.model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
229
+ "base_model.model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
230
+ "base_model.model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
231
+ "base_model.model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
232
+ "base_model.model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
233
+ "base_model.model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
234
+ "base_model.model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
+ "base_model.model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
236
+ "base_model.model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
237
+ "base_model.model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
238
+ "base_model.model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
239
+ "base_model.model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
240
+ "base_model.model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
241
+ "base_model.model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
242
+ "base_model.model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
243
+ "base_model.model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
244
+ "base_model.model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
245
+ "base_model.model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
246
+ "base_model.model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
247
+ "base_model.model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
248
+ "base_model.model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
249
+ "base_model.model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
250
+ "base_model.model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
251
+ "base_model.model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
252
+ "base_model.model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
253
+ "base_model.model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
254
+ "base_model.model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
255
+ "base_model.model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
256
+ "base_model.model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
257
+ "base_model.model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
258
+ "base_model.model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
259
+ "base_model.model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
260
+ "base_model.model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
261
+ "base_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
262
+ "base_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
263
+ "base_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
+ "base_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
265
+ "base_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
266
+ "base_model.model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
267
+ "base_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
268
+ "base_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
269
+ "base_model.model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
270
+ "base_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
271
+ "base_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
272
+ "base_model.model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
+ "base_model.model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
+ "base_model.model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
+ "base_model.model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
+ "base_model.model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
+ "base_model.model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
278
+ "base_model.model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
279
+ "base_model.model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
280
+ "base_model.model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
281
+ "base_model.model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
282
+ "base_model.model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
283
+ "base_model.model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
284
+ "base_model.model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
285
+ "base_model.model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
286
+ "base_model.model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
287
+ "base_model.model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
288
+ "base_model.model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
289
+ "base_model.model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
290
+ "base_model.model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
291
+ "base_model.model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
292
+ "base_model.model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
293
+ "base_model.model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
294
+ "base_model.model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
295
+ "base_model.model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
296
+ "base_model.model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
297
+ "base_model.model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
298
+ "base_model.model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
299
+ "base_model.model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
300
+ "base_model.model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
301
+ "base_model.model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
302
+ "base_model.model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
303
+ "base_model.model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
304
+ "base_model.model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
305
+ "base_model.model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
306
+ "base_model.model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
307
+ "base_model.model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
308
+ "base_model.model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
309
+ "base_model.model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
310
+ "base_model.model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
311
+ "base_model.model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
312
+ "base_model.model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
313
+ "base_model.model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
314
+ "base_model.model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
315
+ "base_model.model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
316
+ "base_model.model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
317
+ "base_model.model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
318
+ "base_model.model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
319
+ "base_model.model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
320
+ "base_model.model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
321
+ "base_model.model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
322
+ "base_model.model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
323
+ "base_model.model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
324
+ "base_model.model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
325
+ "base_model.model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
326
+ "base_model.model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
327
+ "base_model.model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
328
+ "base_model.model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
329
+ "base_model.model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
330
+ "base_model.model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
331
+ "base_model.model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
332
+ "base_model.model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
333
+ "base_model.model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
334
+ "base_model.model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
335
+ "base_model.model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
336
+ "base_model.model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
337
+ "base_model.model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
+ "base_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
339
+ "base_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
340
+ "base_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
341
+ "base_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
342
+ "base_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
343
+ "base_model.model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
344
+ "base_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
345
+ "base_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
346
+ "base_model.model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
347
+ "base_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
348
+ "base_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
+ "base_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
350
+ "base_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
351
+ "base_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
352
+ "base_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
353
+ "base_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
354
+ "base_model.model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
355
+ "base_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
356
+ "base_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
357
+ "base_model.model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
358
+ "base_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
359
+ "base_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
360
+ "base_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
361
+ "base_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
362
+ "base_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
363
+ "base_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
364
+ "base_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
365
+ "base_model.model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
366
+ "base_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
367
+ "base_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
368
+ "base_model.model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
369
+ "base_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
370
+ "base_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
371
+ "base_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
372
+ "base_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
373
+ "base_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
374
+ "base_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
375
+ "base_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
376
+ "base_model.model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
377
+ "base_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
378
+ "base_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
379
+ "base_model.model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
380
+ "base_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
381
+ "base_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
382
+ "base_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
383
+ "base_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
384
+ "base_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
385
+ "base_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
386
+ "base_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
387
+ "base_model.model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
388
+ "base_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
389
+ "base_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
390
+ "base_model.model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
391
+ "base_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
392
+ "base_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
393
+ "base_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
394
+ "base_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
395
+ "base_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
396
+ "base_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
397
+ "base_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
398
+ "base_model.model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
399
+ "base_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
+ "base_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
401
+ "base_model.model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
402
+ "base_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
403
+ "base_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
+ "base_model.model.norm.weight": "model-00002-of-00002.safetensors",
405
+ "router_decoder.0.bias": "model-00002-of-00002.safetensors",
406
+ "router_decoder.0.weight": "model-00002-of-00002.safetensors",
407
+ "router_decoder.3.bias": "model-00002-of-00002.safetensors",
408
+ "router_decoder.3.weight": "model-00002-of-00002.safetensors",
409
+ "router_encoder.embeddings.LayerNorm.bias": "model-00002-of-00002.safetensors",
410
+ "router_encoder.embeddings.LayerNorm.weight": "model-00002-of-00002.safetensors",
411
+ "router_encoder.embeddings.position_embeddings.weight": "model-00002-of-00002.safetensors",
412
+ "router_encoder.embeddings.token_type_embeddings.weight": "model-00002-of-00002.safetensors",
413
+ "router_encoder.embeddings.word_embeddings.weight": "model-00002-of-00002.safetensors",
414
+ "router_encoder.encoder.layer.0.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
415
+ "router_encoder.encoder.layer.0.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
416
+ "router_encoder.encoder.layer.0.attention.output.dense.bias": "model-00002-of-00002.safetensors",
417
+ "router_encoder.encoder.layer.0.attention.output.dense.weight": "model-00002-of-00002.safetensors",
418
+ "router_encoder.encoder.layer.0.attention.self.key.bias": "model-00002-of-00002.safetensors",
419
+ "router_encoder.encoder.layer.0.attention.self.key.weight": "model-00002-of-00002.safetensors",
420
+ "router_encoder.encoder.layer.0.attention.self.query.bias": "model-00002-of-00002.safetensors",
421
+ "router_encoder.encoder.layer.0.attention.self.query.weight": "model-00002-of-00002.safetensors",
422
+ "router_encoder.encoder.layer.0.attention.self.value.bias": "model-00002-of-00002.safetensors",
423
+ "router_encoder.encoder.layer.0.attention.self.value.weight": "model-00002-of-00002.safetensors",
424
+ "router_encoder.encoder.layer.0.intermediate.dense.bias": "model-00002-of-00002.safetensors",
425
+ "router_encoder.encoder.layer.0.intermediate.dense.weight": "model-00002-of-00002.safetensors",
426
+ "router_encoder.encoder.layer.0.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
427
+ "router_encoder.encoder.layer.0.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
428
+ "router_encoder.encoder.layer.0.output.dense.bias": "model-00002-of-00002.safetensors",
429
+ "router_encoder.encoder.layer.0.output.dense.weight": "model-00002-of-00002.safetensors",
430
+ "router_encoder.encoder.layer.1.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
431
+ "router_encoder.encoder.layer.1.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
432
+ "router_encoder.encoder.layer.1.attention.output.dense.bias": "model-00002-of-00002.safetensors",
433
+ "router_encoder.encoder.layer.1.attention.output.dense.weight": "model-00002-of-00002.safetensors",
434
+ "router_encoder.encoder.layer.1.attention.self.key.bias": "model-00002-of-00002.safetensors",
435
+ "router_encoder.encoder.layer.1.attention.self.key.weight": "model-00002-of-00002.safetensors",
436
+ "router_encoder.encoder.layer.1.attention.self.query.bias": "model-00002-of-00002.safetensors",
437
+ "router_encoder.encoder.layer.1.attention.self.query.weight": "model-00002-of-00002.safetensors",
438
+ "router_encoder.encoder.layer.1.attention.self.value.bias": "model-00002-of-00002.safetensors",
439
+ "router_encoder.encoder.layer.1.attention.self.value.weight": "model-00002-of-00002.safetensors",
440
+ "router_encoder.encoder.layer.1.intermediate.dense.bias": "model-00002-of-00002.safetensors",
441
+ "router_encoder.encoder.layer.1.intermediate.dense.weight": "model-00002-of-00002.safetensors",
442
+ "router_encoder.encoder.layer.1.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
443
+ "router_encoder.encoder.layer.1.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
444
+ "router_encoder.encoder.layer.1.output.dense.bias": "model-00002-of-00002.safetensors",
445
+ "router_encoder.encoder.layer.1.output.dense.weight": "model-00002-of-00002.safetensors",
446
+ "router_encoder.encoder.layer.2.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
447
+ "router_encoder.encoder.layer.2.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
448
+ "router_encoder.encoder.layer.2.attention.output.dense.bias": "model-00002-of-00002.safetensors",
449
+ "router_encoder.encoder.layer.2.attention.output.dense.weight": "model-00002-of-00002.safetensors",
450
+ "router_encoder.encoder.layer.2.attention.self.key.bias": "model-00002-of-00002.safetensors",
451
+ "router_encoder.encoder.layer.2.attention.self.key.weight": "model-00002-of-00002.safetensors",
452
+ "router_encoder.encoder.layer.2.attention.self.query.bias": "model-00002-of-00002.safetensors",
453
+ "router_encoder.encoder.layer.2.attention.self.query.weight": "model-00002-of-00002.safetensors",
454
+ "router_encoder.encoder.layer.2.attention.self.value.bias": "model-00002-of-00002.safetensors",
455
+ "router_encoder.encoder.layer.2.attention.self.value.weight": "model-00002-of-00002.safetensors",
456
+ "router_encoder.encoder.layer.2.intermediate.dense.bias": "model-00002-of-00002.safetensors",
457
+ "router_encoder.encoder.layer.2.intermediate.dense.weight": "model-00002-of-00002.safetensors",
458
+ "router_encoder.encoder.layer.2.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
459
+ "router_encoder.encoder.layer.2.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
460
+ "router_encoder.encoder.layer.2.output.dense.bias": "model-00002-of-00002.safetensors",
461
+ "router_encoder.encoder.layer.2.output.dense.weight": "model-00002-of-00002.safetensors",
462
+ "router_encoder.encoder.layer.3.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
463
+ "router_encoder.encoder.layer.3.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
464
+ "router_encoder.encoder.layer.3.attention.output.dense.bias": "model-00002-of-00002.safetensors",
465
+ "router_encoder.encoder.layer.3.attention.output.dense.weight": "model-00002-of-00002.safetensors",
466
+ "router_encoder.encoder.layer.3.attention.self.key.bias": "model-00002-of-00002.safetensors",
467
+ "router_encoder.encoder.layer.3.attention.self.key.weight": "model-00002-of-00002.safetensors",
468
+ "router_encoder.encoder.layer.3.attention.self.query.bias": "model-00002-of-00002.safetensors",
469
+ "router_encoder.encoder.layer.3.attention.self.query.weight": "model-00002-of-00002.safetensors",
470
+ "router_encoder.encoder.layer.3.attention.self.value.bias": "model-00002-of-00002.safetensors",
471
+ "router_encoder.encoder.layer.3.attention.self.value.weight": "model-00002-of-00002.safetensors",
472
+ "router_encoder.encoder.layer.3.intermediate.dense.bias": "model-00002-of-00002.safetensors",
473
+ "router_encoder.encoder.layer.3.intermediate.dense.weight": "model-00002-of-00002.safetensors",
474
+ "router_encoder.encoder.layer.3.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
475
+ "router_encoder.encoder.layer.3.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
476
+ "router_encoder.encoder.layer.3.output.dense.bias": "model-00002-of-00002.safetensors",
477
+ "router_encoder.encoder.layer.3.output.dense.weight": "model-00002-of-00002.safetensors",
478
+ "router_encoder.encoder.layer.4.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
479
+ "router_encoder.encoder.layer.4.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
480
+ "router_encoder.encoder.layer.4.attention.output.dense.bias": "model-00002-of-00002.safetensors",
481
+ "router_encoder.encoder.layer.4.attention.output.dense.weight": "model-00002-of-00002.safetensors",
482
+ "router_encoder.encoder.layer.4.attention.self.key.bias": "model-00002-of-00002.safetensors",
483
+ "router_encoder.encoder.layer.4.attention.self.key.weight": "model-00002-of-00002.safetensors",
484
+ "router_encoder.encoder.layer.4.attention.self.query.bias": "model-00002-of-00002.safetensors",
485
+ "router_encoder.encoder.layer.4.attention.self.query.weight": "model-00002-of-00002.safetensors",
486
+ "router_encoder.encoder.layer.4.attention.self.value.bias": "model-00002-of-00002.safetensors",
487
+ "router_encoder.encoder.layer.4.attention.self.value.weight": "model-00002-of-00002.safetensors",
488
+ "router_encoder.encoder.layer.4.intermediate.dense.bias": "model-00002-of-00002.safetensors",
489
+ "router_encoder.encoder.layer.4.intermediate.dense.weight": "model-00002-of-00002.safetensors",
490
+ "router_encoder.encoder.layer.4.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
491
+ "router_encoder.encoder.layer.4.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
492
+ "router_encoder.encoder.layer.4.output.dense.bias": "model-00002-of-00002.safetensors",
493
+ "router_encoder.encoder.layer.4.output.dense.weight": "model-00002-of-00002.safetensors",
494
+ "router_encoder.encoder.layer.5.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
495
+ "router_encoder.encoder.layer.5.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
496
+ "router_encoder.encoder.layer.5.attention.output.dense.bias": "model-00002-of-00002.safetensors",
497
+ "router_encoder.encoder.layer.5.attention.output.dense.weight": "model-00002-of-00002.safetensors",
498
+ "router_encoder.encoder.layer.5.attention.self.key.bias": "model-00002-of-00002.safetensors",
499
+ "router_encoder.encoder.layer.5.attention.self.key.weight": "model-00002-of-00002.safetensors",
500
+ "router_encoder.encoder.layer.5.attention.self.query.bias": "model-00002-of-00002.safetensors",
501
+ "router_encoder.encoder.layer.5.attention.self.query.weight": "model-00002-of-00002.safetensors",
502
+ "router_encoder.encoder.layer.5.attention.self.value.bias": "model-00002-of-00002.safetensors",
503
+ "router_encoder.encoder.layer.5.attention.self.value.weight": "model-00002-of-00002.safetensors",
504
+ "router_encoder.encoder.layer.5.intermediate.dense.bias": "model-00002-of-00002.safetensors",
505
+ "router_encoder.encoder.layer.5.intermediate.dense.weight": "model-00002-of-00002.safetensors",
506
+ "router_encoder.encoder.layer.5.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
507
+ "router_encoder.encoder.layer.5.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
508
+ "router_encoder.encoder.layer.5.output.dense.bias": "model-00002-of-00002.safetensors",
509
+ "router_encoder.encoder.layer.5.output.dense.weight": "model-00002-of-00002.safetensors",
510
+ "router_encoder.pooler.dense.bias": "model-00002-of-00002.safetensors",
511
+ "router_encoder.pooler.dense.weight": "model-00002-of-00002.safetensors"
512
+ }
513
+ }
modeling_mola_lm.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from typing import Dict, List, Optional, Union
7
+ from transformers import (
8
+ AutoConfig, AutoTokenizer, AutoModelForCausalLM,
9
+ PretrainedConfig, PreTrainedModel, GenerationMixin
10
+ )
11
+ from transformers.models.auto import CONFIG_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING
12
+ from peft import PeftModel, LoraConfig, get_peft_model
13
+
14
# Names of the task-specific LoRA experts. The router's output logits index
# into this list, so its length must equal the router head's output size
# (MoLAConfig.num_loras). Generated instead of hand-written so the count and
# the contents cannot drift apart.
EXPERTS_LIST = [str(i) for i in range(9)]
25
+
26
+
27
class MoLAConfig(PretrainedConfig):
    """Configuration class for MoLA-LM model.

    Holds the identifiers needed to assemble the mixture: the base causal LM,
    the list of adapter (expert) names, and optional router / per-LoRA
    configuration dictionaries.
    """

    model_type = "mola_lm"

    def __init__(
        self,
        base_model_name_or_path: str = "Qwen/Qwen3-4B-Thinking-2507",
        task_labels: Optional[List[str]] = None,
        router_config: Optional[Dict] = None,
        lora_configs: Optional[Dict[str, Dict]] = None,
        **kwargs
    ):
        """
        Args:
            base_model_name_or_path: HF repo id or local path of the base LM.
            task_labels: Adapter names, one per expert; defaults to the
                module-level EXPERTS_LIST when None.
            router_config: Optional router hyperparameters (currently unused
                by the modeling code; stored for forward compatibility).
            lora_configs: Optional per-adapter LoRA configuration dicts.
            **kwargs: Forwarded to PretrainedConfig.
        """
        super().__init__(**kwargs)
        self.base_model_name_or_path = base_model_name_or_path
        # `or` also replaces an explicitly-passed empty list with the default.
        self.task_labels = task_labels or EXPERTS_LIST
        self.router_config = router_config or {}
        self.lora_configs = lora_configs or {}
        # Router head output size is derived from the label list.
        self.num_loras = len(self.task_labels)
46
+
47
+
48
class MoLAForCausalLM(PreTrainedModel, GenerationMixin):
    """
    MoLA Language Model for Causal Language Modeling - AutoModel Compatible.

    Wraps a base causal LM, a set of LoRA adapters loaded into a single PEFT
    wrapper, and a small text-classification router (frozen MiniLM encoder +
    trainable MLP head) that picks which adapter to activate for each prompt.
    """

    config_class = MoLAConfig
    base_model_prefix = "mola_model"  # Avoid recursion by using unique prefix
    supports_gradient_checkpointing = True
56
+
57
    def __init__(self, config):
        """Build the full mixture: base LM, tokenizer, router, and adapters.

        Initialization order matters: the router must exist before adapters
        load (both update instance state), and `_load_router_weights` runs
        last so it can overwrite the freshly initialized router head.
        """
        super().__init__(config)
        self.config = config

        # Store model path for loading resources (adapters, router weights).
        # Set by from_pretrained; None when constructed directly from a config.
        self.model_path = getattr(config, '_name_or_path', None)

        # Load base model (attribute name matches base_model_prefix)
        print(f"Loading base model: {self.config.base_model_name_or_path}")
        self.mola_model = AutoModelForCausalLM.from_pretrained(
            self.config.base_model_name_or_path,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None
        )

        # Load tokenizer: prefer the one shipped with this checkpoint so any
        # chat-template customizations are preserved; fall back to the base.
        if self.model_path:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.base_model_name_or_path)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Initialize router (frozen encoder + trainable decoder head)
        self._init_router()

        # Initialize current model state (will be updated by _load_lora_adapters)
        self._current_lora = None
        self._current_adapted_model = self.mola_model

        # Load LoRA configurations and adapters (this will update _current_adapted_model)
        self._load_lora_adapters()

        # Initialize device property (needed for PreTrainedModel compatibility)
        self._device = next(self.mola_model.parameters()).device

        # Load router weights if available (overwrites the random head)
        self._load_router_weights()

        print("MoLA-LM initialized successfully!")
98
+
99
    def _load_router_weights(self):
        """Load the trained router-decoder weights from ``router_weights.pth``.

        The checkpoint is searched next to the model (local directory) or
        downloaded from the Hugging Face Hub. Keys prefixed ``encoder.`` are
        skipped: the MiniLM encoder is frozen and comes pretrained, so only
        the decoder head is restored. All failures are non-fatal — the router
        then keeps its random initialization, which degrades adapter selection
        but does not break generation.
        """
        if self.model_path:
            try:
                # Handle both local and Hub paths for router weights
                if os.path.exists(self.model_path):
                    # Local path
                    router_weights_path = os.path.join(self.model_path, "router_weights.pth")
                    if os.path.exists(router_weights_path):
                        # NOTE(review): torch.load without weights_only=True
                        # unpickles arbitrary objects — acceptable for a
                        # first-party checkpoint, worth tightening if the
                        # path/repo id is user-supplied.
                        checkpoint = torch.load(router_weights_path, map_location='cpu')
                    else:
                        print("⚠️ No router weights found locally")
                        return
                else:
                    # Hub path - download router weights
                    try:
                        from huggingface_hub import hf_hub_download
                        router_weights_path = hf_hub_download(
                            repo_id=self.model_path,
                            filename="router_weights.pth",
                            local_files_only=False
                        )
                        checkpoint = torch.load(router_weights_path, map_location='cpu')
                        print("📥 Downloaded router weights from Hub")
                    except Exception as hub_e:
                        print(f"⚠️ Failed to download router weights from Hub: {hub_e}")
                        print("🔄 Router will use random initialization (reduced performance)")
                        return

                # Keep only decoder-head weights; encoder is restored pretrained.
                router_state_dict = {}
                for key, value in checkpoint.items():
                    if not key.startswith('encoder.'):  # Skip encoder weights
                        router_state_dict[key] = value

                if router_state_dict:
                    # strict=False tolerates key-name drift between training
                    # and this nn.Sequential definition.
                    self.router_decoder.load_state_dict(router_state_dict, strict=False)
                    print("✅ Loaded router weights successfully!")

                    # Verify weights loaded by checking if they're not all zeros
                    first_layer = next(iter(self.router_decoder.parameters()))
                    if torch.all(first_layer == 0):
                        print("⚠️ Warning: Router weights appear to be zero-initialized")
                    else:
                        print("🎯 Router weights verified - non-zero values detected")
                else:
                    print("⚠️ No valid router weights found in checkpoint")

            except Exception as e:
                print(f"❌ Failed to load router weights: {e}")
                print("🔄 Router will use random initialization (reduced performance)")
150
+
151
+ def _init_router(self):
152
+ """Initialize the router model for LoRA selection."""
153
+ try:
154
+ from transformers import AutoModel
155
+
156
+ print("Initializing router components...")
157
+ # Router components
158
+ self.router_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
159
+ self.router_encoder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
160
+
161
+ # Freeze encoder
162
+ for param in self.router_encoder.parameters():
163
+ param.requires_grad = False
164
+
165
+ # Router decoder
166
+ encoder_dim = self.router_encoder.config.hidden_size
167
+ self.router_decoder = nn.Sequential(
168
+ nn.Linear(encoder_dim, 96),
169
+ nn.ReLU(),
170
+ nn.Dropout(0.2),
171
+ nn.Linear(96, self.config.num_loras)
172
+ )
173
+
174
+ # Move router to device
175
+ if torch.cuda.is_available():
176
+ self.router_encoder = self.router_encoder.cuda()
177
+ self.router_decoder = self.router_decoder.cuda()
178
+
179
+ print("Router initialized successfully!")
180
+
181
+ except ImportError as e:
182
+ raise ImportError(f"Required dependencies not found: {e}")
183
+
184
    def _load_lora_adapters(self):
        """Load LoRA adapters using PEFT (single wrapper, multiple adapters).

        The first adapter creates the PeftModel wrapper around the base LM;
        the remaining adapters are loaded into the same wrapper under their
        task names so switching is a cheap ``set_adapter`` call. On total
        failure the instance falls back to the bare base model.
        """
        from huggingface_hub import hf_hub_download

        if not self.model_path:
            # NOTE(review): this early return leaves self.lora_models unset.
            print("No model path specified, skipping LoRA loading")
            return

        print("Loading LoRA adapters (single wrapper)...")

        # Get the first adapter to create the initial PEFT wrapper
        first_adapter = str(self.config.task_labels[0])
        first_lora_path = None

        try:
            # Handle both local and Hub paths for first adapter
            if os.path.exists(self.model_path):
                # Local path
                first_lora_path = os.path.join(self.model_path, "loras", first_adapter)
                if not os.path.exists(first_lora_path):
                    raise FileNotFoundError(f"First adapter directory not found: {first_lora_path}")
            else:
                # Hub path - download first adapter
                try:
                    # Download first adapter to get local path.
                    # NOTE(review): only adapter_model.safetensors is fetched
                    # explicitly — assumes adapter_config.json lands in the
                    # same cache directory (e.g. already cached); verify.
                    adapter_file = hf_hub_download(
                        repo_id=self.model_path,
                        filename=f"loras/{first_adapter}/adapter_model.safetensors"
                    )
                    first_lora_path = os.path.dirname(adapter_file)
                    print(f"Downloaded first adapter to: {first_lora_path}")
                except Exception as e:
                    raise Exception(f"Failed to download first adapter {first_adapter}: {e}")

            # Create the initial PEFT wrapper with unique adapter name
            peft_model = PeftModel.from_pretrained(
                self.mola_model,
                first_lora_path,
                adapter_name=first_adapter,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
            )
            print(f"✅ Loaded first LoRA: {first_adapter}")

            # Load remaining adapters into the same wrapper with unique names
            for task_name in self.config.task_labels[1:]:
                try:
                    lora_path = None

                    if os.path.exists(self.model_path):
                        # Local path
                        lora_path = os.path.join(self.model_path, "loras", task_name)
                        if not os.path.exists(lora_path):
                            print(f"⚠️ LoRA directory not found: {lora_path}")
                            continue
                    else:
                        # Hub path - download adapter
                        try:
                            adapter_file = hf_hub_download(
                                repo_id=self.model_path,
                                filename=f"loras/{task_name}/adapter_model.safetensors"
                            )
                            lora_path = os.path.dirname(adapter_file)
                        except Exception as e:
                            print(f"❌ Failed to download LoRA {task_name}: {e}")
                            continue

                    # Load adapter into the same PEFT model with unique name
                    peft_model.load_adapter(lora_path, adapter_name=task_name)
                    print(f"✅ Loaded LoRA: {task_name}")

                except Exception as e:
                    print(f"❌ Failed to load LoRA {task_name}: {e}")

        # All task names map to the ONE shared wrapper, not separate models.
            self.lora_models = {str(name): peft_model for name in self.config.task_labels}
            self._current_lora = first_adapter
            self._current_adapted_model = peft_model

            print(f"Loaded {len(self.config.task_labels)} LoRA adapters into one PEFT model.")
            print(f"Available adapter names: {list(peft_model.peft_config.keys())}")

        except Exception as e:
            # Fall back to the bare base model; generation still works.
            print(f"❌ Failed to initialize LoRA loading: {e}")
            self.lora_models = {}
            self._current_adapted_model = self.mola_model
            self._current_lora = None
270
+
271
    def predict_best_lora(self, text: str) -> str:
        """Predict the best LoRA adapter for given text.

        Encodes *text* with the frozen MiniLM encoder, mean-pools the token
        embeddings, scores them with the router head, and returns the
        argmax entry of ``self.config.task_labels``.
        """
        # Set models to eval mode (disables the head's Dropout)
        self.router_encoder.eval()
        self.router_decoder.eval()

        # Encode text (single-element batch)
        inputs = self.router_tokenizer(
            [text],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # Move to device
        device = next(self.router_decoder.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.router_encoder(**inputs)
            # NOTE(review): the mean runs over all positions without using
            # the attention mask. Harmless here (batch of one → no padding),
            # but would include pad tokens if this were ever batched.
            embeddings = outputs.last_hidden_state.mean(dim=1)
            logits = self.router_decoder(embeddings)

        # Pick the highest-scoring adapter
        best_idx = torch.argmax(logits, dim=-1).item()
        predicted_label = self.config.task_labels[best_idx]

        return predicted_label
305
+
306
+ def _apply_lora(self, lora_name: str):
307
+ """Apply the selected LoRA adapter using set_adapter."""
308
+ if hasattr(self, '_current_adapted_model') and isinstance(self._current_adapted_model, PeftModel):
309
+ # Check if the adapter exists in the PEFT model
310
+ if str(lora_name) in self._current_adapted_model.peft_config:
311
+ if lora_name != self._current_lora:
312
+ self._current_adapted_model.set_adapter(str(lora_name))
313
+ self._current_lora = str(lora_name)
314
+ # print(f"🎯 Applied LoRA: {lora_name}") # Uncomment for debugging
315
+ else:
316
+ print(f"⚠️ LoRA adapter '{lora_name}' not found in PEFT model. Available: {list(self._current_adapted_model.peft_config.keys())}")
317
+ # Keep current adapter if requested one doesn't exist
318
+ else:
319
+ # Fallback to base model if no PEFT model available
320
+ self._current_adapted_model = self.mola_model
321
+ self._current_lora = None
322
+ print(f"⚠️ No PEFT model available, using base model")
323
+
324
    def get_available_loras(self) -> List[str]:
        """Get list of available LoRA adapter names.

        NOTE(review): a second ``get_available_loras`` is defined later in
        this class body; in Python the later definition wins, so this
        PEFT-aware variant is effectively shadowed and never called.
        """
        if hasattr(self, '_current_adapted_model') and isinstance(self._current_adapted_model, PeftModel):
            return list(self._current_adapted_model.peft_config.keys())
        else:
            return []
330
+
331
+ def test_adapter_uniqueness(self, layer_name: str = "base_model.model.model.layers.33.mlp.down_proj"):
332
+ """
333
+ Regression test to verify that adapters have different weights.
334
+
335
+ Args:
336
+ layer_name: The layer to test (default is a common MLP layer)
337
+
338
+ Returns:
339
+ Dict[str, str]: Mapping of adapter names to their weight hashes
340
+ """
341
+ import hashlib
342
+
343
+ if not hasattr(self, '_current_adapted_model') or not isinstance(self._current_adapted_model, PeftModel):
344
+ print("⚠️ No PEFT model available for testing")
345
+ return {}
346
+
347
+ names = self.get_available_loras()
348
+ if len(names) <= 1:
349
+ print(f"⚠️ Need at least 2 adapters for uniqueness test, found {len(names)}")
350
+ return {}
351
+
352
+ def fused_sha(adapter_name, layer_name):
353
+ """Compute SHA256 hash of fused LoRA weights for given adapter and layer."""
354
+ # Switch to the adapter
355
+ self._apply_lora(adapter_name)
356
+
357
+ # Navigate to the specified layer
358
+ try:
359
+ mod = self._current_adapted_model
360
+ for part in layer_name.split("."):
361
+ if part:
362
+ mod = getattr(mod, part)
363
+
364
+ # Get LoRA components
365
+ if not hasattr(mod, 'lora_A') or not hasattr(mod, 'lora_B'):
366
+ print(f"⚠️ Layer {layer_name} doesn't have LoRA components")
367
+ return "no_lora"
368
+
369
+ # Get the adapter name (should be the same as what we set)
370
+ adapter_key = next(iter(mod.lora_A.keys()))
371
+ A = mod.lora_A[adapter_key].weight
372
+ B = mod.lora_B[adapter_key].weight
373
+ s = float(mod.scaling[adapter_key])
374
+
375
+ # Compute fused weights: ΔW = (B @ A) * scaling
376
+ dW = (B @ A) * s
377
+
378
+ # Convert to bytes and hash
379
+ tensor_bytes = dW.detach().to("cpu", dtype=torch.float32).contiguous().numpy().tobytes()
380
+ return hashlib.sha256(tensor_bytes).hexdigest()[:16]
381
+
382
+ except Exception as e:
383
+ print(f"❌ Error computing hash for {adapter_name}: {e}")
384
+ return f"error_{adapter_name}"
385
+
386
+ print(f"🧪 Testing adapter uniqueness on layer: {layer_name}")
387
+ hashes = {}
388
+ for adapter_name in names:
389
+ hash_val = fused_sha(adapter_name, layer_name)
390
+ hashes[adapter_name] = hash_val
391
+ print(f" {adapter_name}: {hash_val}")
392
+
393
+ # Check uniqueness
394
+ unique_hashes = set(hashes.values())
395
+ if len(unique_hashes) == len(names):
396
+ print("✅ All adapters have unique weights!")
397
+ else:
398
+ print(f"❌ Found duplicate weights! {len(names)} adapters but only {len(unique_hashes)} unique hashes")
399
+ # Show which ones are identical
400
+ from collections import defaultdict
401
+ hash_to_adapters = defaultdict(list)
402
+ for adapter, hash_val in hashes.items():
403
+ hash_to_adapters[hash_val].append(adapter)
404
+
405
+ for hash_val, adapter_list in hash_to_adapters.items():
406
+ if len(adapter_list) > 1:
407
+ print(f" Identical weights (hash {hash_val}): {adapter_list}")
408
+
409
+ return hashes
410
+
411
    def generate(self, input_ids=None, attention_mask=None, **kwargs):
        """
        Standard generate method with automatic LoRA selection.
        Works exactly like any other LLM's generate method.

        The prompt is decoded back to text, chat-template markup is stripped
        heuristically, the router picks the best adapter, and generation is
        delegated to the currently adapted model. Router failures fall back
        to the bare base model rather than raising.
        """
        # If we have input_ids, predict and apply the best LoRA
        if input_ids is not None and hasattr(self, 'tokenizer'):
            try:
                # Decode the input to get the text for LoRA prediction
                if len(input_ids.shape) > 1:
                    # Batch input - route using the first item only
                    text_input = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
                else:
                    text_input = self.tokenizer.decode(input_ids, skip_special_tokens=True)

                # Clean the text thoroughly to remove ALL chat template artifacts
                import re

                # Markers built by concatenation so this source survives being
                # embedded in contexts where the literal tokens are special.
                start_pattern = '<' + '|im_start|' + '>user'
                end_pattern = '<' + '|im_end|' + '>'

                # First, try to extract just the user's actual question/prompt
                if start_pattern in text_input and end_pattern in text_input:
                    start_idx = text_input.find(start_pattern) + len(start_pattern)
                    end_idx = text_input.find(end_pattern, start_idx)
                    if end_idx > start_idx:
                        text_input = text_input[start_idx:end_idx].strip()

                # Clean up any remaining template artifacts
                text_input = text_input.replace('<|im_start|>', '')
                text_input = text_input.replace('<|im_end|>', '')
                # NOTE(review): these remove the words ANYWHERE in the text,
                # not just as role markers (e.g. "username" → "name"). Only
                # the router sees this cleaned text, so generation output is
                # unaffected, but routing accuracy may suffer — confirm.
                text_input = text_input.replace('system', '')
                text_input = text_input.replace('user', '')
                text_input = text_input.replace('assistant', '')

                # Drop default Qwen system-message lines before routing
                if 'You are Qwen' in text_input:
                    lines = text_input.split('\n')
                    lines = [line for line in lines if 'You are' not in line and 'Alibaba' not in line]
                    text_input = ' '.join(lines)

                # Final cleanup
                text_input = re.sub(r'\n+', ' ', text_input)  # Replace newlines with spaces
                text_input = re.sub(r'\s+', ' ', text_input)  # Normalize whitespace
                text_input = text_input.strip()

                # Predict and apply best LoRA
                best_lora = self.predict_best_lora(text_input)
                self._apply_lora(best_lora)

            except Exception as e:
                # If LoRA prediction fails, use base model (best-effort routing)
                self._current_adapted_model = self.mola_model
                self._current_lora = None

        # Use the currently adapted model for generation
        return self._current_adapted_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs
        )
479
+
480
    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Forward pass through the currently adapted (or base) model."""
        return self._current_adapted_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)

    def __call__(self, *args, **kwargs):
        """Make the model callable by delegating to the adapted model.

        NOTE(review): overriding ``__call__`` bypasses ``nn.Module.__call__``
        on this wrapper, so hooks registered on the MoLA object itself will
        not fire — confirm this is intentional.
        """
        return self._current_adapted_model(*args, **kwargs)

    def get_input_embeddings(self):
        """Get the input embeddings from the currently adapted model."""
        return self._current_adapted_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Set the input embeddings on both the adapted and base model."""
        self._current_adapted_model.set_input_embeddings(value)
        # Also set for base model to keep them in sync
        self.mola_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        """Get the output embeddings from the currently adapted model."""
        return self._current_adapted_model.get_output_embeddings()

    def set_output_embeddings(self, value):
        """Set the output embeddings on both the adapted and base model."""
        self._current_adapted_model.set_output_embeddings(value)
        # Also set for base model to keep them in sync
        self.mola_model.set_output_embeddings(value)

    def tie_weights(self):
        """Tie input and output embeddings on the adapted model."""
        self._current_adapted_model.tie_weights()

    def resize_token_embeddings(self, new_num_tokens):
        """Resize token embeddings on the adapted model."""
        return self._current_adapted_model.resize_token_embeddings(new_num_tokens)

    @property
    def device(self):
        """Device of the model, taken from the base model's parameters."""
        return next(self.mola_model.parameters()).device
520
+
521
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load model from pretrained path (transformers compatibility).

        NOTE(review): extra ``**kwargs`` (e.g. ``device_map`` as shown in the
        README) are forwarded only to the config loader, not to model
        construction — verify whether callers expect them to affect the base
        model.
        """
        # Load config
        config = MoLAConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        # Store the path so __init__ can locate adapters and router weights
        config._name_or_path = pretrained_model_name_or_path
        return cls(config)
529
+
530
    def save_pretrained(self, save_directory, **kwargs):
        """Save model using standard transformers approach.

        Writes the config, tokenizer, base-model weights, and the router
        *decoder* weights (the frozen MiniLM encoder is not saved — the
        loader skips ``encoder.`` keys and re-downloads it pretrained).
        NOTE(review): LoRA adapter weights are not re-saved here — confirm
        that adapters are published separately under ``loras/``.
        """
        # Accept standard transformers parameters but use the ones we need
        max_shard_size = kwargs.get('max_shard_size', "5GB")
        safe_serialization = kwargs.get('safe_serialization', True)

        os.makedirs(save_directory, exist_ok=True)

        # Save config using transformers method
        self.config.save_pretrained(save_directory)

        # Save tokenizer if available
        if hasattr(self, 'tokenizer'):
            self.tokenizer.save_pretrained(save_directory)

        # Save the base model with proper sharding if needed
        try:
            # Use the base model's save_pretrained with the parameters
            self.mola_model.save_pretrained(
                save_directory,
                max_shard_size=max_shard_size,
                safe_serialization=safe_serialization
            )
        except Exception as e:
            print(f"Warning: Could not save base model weights: {e}")
            # Fallback: just save the config and tokenizer
            pass

        # Save router weights if they exist (decoder head only, see docstring)
        try:
            if hasattr(self, 'router_decoder'):
                router_state_dict = self.router_decoder.state_dict()
                torch.save(router_state_dict, os.path.join(save_directory, "router_weights.pth"))
        except Exception as e:
            print(f"Warning: Could not save router weights: {e}")

        print(f"Model saved to {save_directory}")
567
+
568
+ def get_current_lora(self) -> str:
569
+ """Get the currently applied LoRA adapter name."""
570
+ return self._current_lora or "base_model"
571
+
572
+ def get_available_loras(self) -> List[str]:
573
+ """Get list of available LoRA adapters."""
574
+ return list(self.lora_models.keys())
575
+
576
+
577
+ # For transformers AutoModel registration
578
# For transformers AutoModel registration
def _load_mola_model(model_path, **kwargs):
    """Helper function to load MoLA model (thin wrapper kept for AutoModel
    registration hooks; delegates to MoLAForCausalLM.from_pretrained)."""
    return MoLAForCausalLM.from_pretrained(model_path, **kwargs)
581
+
582
+
583
+ # Register with transformers AutoModel system
584
# Register with transformers AutoModel system.
# NOTE: runs at import time with side-effect prints; registration failures
# are non-fatal because the model can always be loaded directly.
try:
    CONFIG_MAPPING.register("mola_lm", MoLAConfig)
    MODEL_FOR_CAUSAL_LM_MAPPING.register(MoLAConfig, MoLAForCausalLM)
    print("✅ Successfully registered MoLA-LM with AutoModel!")
except Exception as e:
    print(f"⚠️ AutoModel registration failed: {e}")
    # Try alternative registration for backwards compatibility
    try:
        from transformers import AutoConfig, AutoModelForCausalLM
        AutoConfig.register("mola_lm", MoLAConfig)
        AutoModelForCausalLM.register(MoLAConfig, MoLAForCausalLM)
        print("✅ Successfully registered MoLA-LM with legacy method!")
    except Exception as e2:
        print(f"⚠️ Legacy registration also failed: {e2}")
        print("Model can still be loaded directly with MoLAForCausalLM.from_pretrained()")
router_weights.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd4844ec0ad964c75599d93e5a20089f7ea41bc5e31a551d99b49665ba6ab7a8
3
+ size 153763
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 262144,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff