AtAndDev commited on
Commit
787e370
·
verified ·
1 Parent(s): 4eacf8c

Upload MoLA-LM: Mixture of LoRA Adapters Language Model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
+ tags:
5
+ - pytorch
6
+ - mixture-of-experts
7
+ - lora
8
+ - adapter
9
+ - causal-lm
10
+ - text-generation
11
+ language:
12
+ - en
13
+ pipeline_tag: text-generation
14
+ ---
15
+
16
+ Image here
17
+
18
+ # MoLA-LM: Mixture of LoRA Adapters LLM
19
+
20
+ MoLA-LM combines multiple LoRA adapters with an intelligent router to automatically select the best adapter for each input prompt. This approach enables specialized performance across different tasks while maintaining efficiency.
21
+
22
+ Evals are coming...
23
+
24
+ ## Model Details
25
+
26
+ - **Model Type**: Mixture of LoRA Adapters Language Model
27
+ - **Base Model**: Qwen/Qwen3-4B-Thinking-2507
28
+ - **Total Adapters**: 9
29
+ - **Architecture**: Custom MoLAForCausalLM with automatic adapter routing
30
+
31
+ ## Usage
32
+
33
+ ```python
34
+ from transformers import AutoModelForCausalLM, AutoTokenizer
35
+
36
+ # Load the model (trust_remote_code=True is required for custom architecture)
37
+ model = AutoModelForCausalLM.from_pretrained(
38
+ "MoLA-LLM/MoLA-v0.5-9x4b",
39
+ trust_remote_code=True,
40
+ device_map="auto"
41
+ )
42
+ tokenizer = AutoTokenizer.from_pretrained("MoLA-LLM/MoLA-v0.5-9x4b", trust_remote_code=True)
43
+
44
+ # Use like any other language model - adapter selection is automatic
45
+ prompt = "Write a Python function to calculate fibonacci numbers"
46
+ messages = [{"role": "user", "content": prompt}]
47
+ inputs = tokenizer.apply_chat_template(
48
+ messages,
49
+ add_generation_prompt=True,
50
+ tokenize=True,
51
+ return_dict=True,
52
+ return_tensors="pt",
53
+ ).to(model.device)
54
+
55
+ outputs = model.generate(**inputs, max_new_tokens=8192, temperature=.6, do_sample=True)
56
+ response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
57
+
58
+ print(f"Selected LoRA: {model.get_current_lora()}")
59
+ print(response)
60
+ ```
61
+ *You can also use load_in_4bit and load_in_8bit directly when loading!*
62
+
63
+ ## Architecture
64
+
65
+ The MoLA-LM architecture consists of:
66
+
67
+ 1. **Base Model**: Qwen/Qwen3-4B-Thinking-2507
68
+ 2. **Router Network**: A frozen sentence-transformer encoder followed by a one-layer MLP decoder that selects the adapter
69
+ 3. **LoRA Adapters**: 9 task-specific fine-tuned adapters
70
+ 4. **Dynamic Switching**: Automatic adapter application based on input
71
+
72
+ ---
73
+
74
+ ## *Paper coming soon™*
__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MoLA-LM: Mixture of LoRA Adapters Language Model
3
+ """
4
+
5
+ from .configuration_mola_lm import MoLAConfig
6
+ from .modeling_mola_lm import MoLAForCausalLM
7
+
8
+ __all__ = ["MoLAConfig", "MoLAForCausalLM"]
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n<think>\n' }}
86
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MoLAForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mola_lm.MoLAConfig",
7
+ "AutoModelForCausalLM": "modeling_mola_lm.MoLAForCausalLM"
8
+ },
9
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
10
+ "task_labels": [
11
+ "0",
12
+ "1",
13
+ "2",
14
+ "3",
15
+ "4",
16
+ "5",
17
+ "6",
18
+ "7",
19
+ "8"
20
+ ],
21
+ "num_loras": 9,
22
+ "model_type": "mola_lm",
23
+ "transformers_version": "4.36.0"
24
+ }
configuration_mola_lm.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration class for MoLA-LM
3
+ """
4
+
5
+ from transformers import PretrainedConfig
6
+ from typing import Dict, List
7
+
8
# Default task labels: one string label per LoRA adapter ("0" .. "8").
EXPERTS_LIST = [str(expert_id) for expert_id in range(9)]
19
+
20
+
21
class MoLAConfig(PretrainedConfig):
    """Configuration class for the MoLA-LM (Mixture of LoRA Adapters) model.

    Stores the base-model reference, the router sub-configuration, the
    per-adapter LoRA configurations, and the list of task labels (one label
    per LoRA adapter). ``num_loras`` is always derived from ``task_labels``
    so the two cannot drift apart.
    """

    model_type = "mola_lm"

    def __init__(
        self,
        base_model_name_or_path: str = "Qwen/Qwen2.5-3B-Instruct",
        task_labels: Optional[List[str]] = None,
        router_config: Optional[Dict] = None,
        lora_configs: Optional[Dict[str, Dict]] = None,
        **kwargs,
    ):
        """
        Args:
            base_model_name_or_path: Hub id or local path of the base causal LM.
                NOTE(review): default here is Qwen2.5-3B-Instruct while the shipped
                config.json uses Qwen3-4B-Thinking-2507 — the serialized value wins
                on load, but the default looks stale; confirm before changing.
            task_labels: One label per LoRA adapter. Falls back to EXPERTS_LIST
                when None (or empty).
            router_config: Router-network configuration; empty dict when None.
            lora_configs: Per-adapter LoRA configurations keyed by task label;
                empty dict when None.
            **kwargs: Forwarded to PretrainedConfig (license, ids, etc.).
        """
        super().__init__(**kwargs)
        self.base_model_name_or_path = base_model_name_or_path
        self.task_labels = task_labels or EXPERTS_LIST
        self.router_config = router_config or {}
        self.lora_configs = lora_configs or {}
        # Derived, never passed in: keeps num_loras consistent with task_labels.
        self.num_loras = len(self.task_labels)
loras/0/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/0/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "k_proj",
25
+ "v_proj",
26
+ "up_proj",
27
+ "gate_proj",
28
+ "q_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/0/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5188249779649a7bd2fccd0893dbd4d5ba46bf6dbefb4d3aa9c00c48446966ac
3
+ size 66126768
loras/1/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/1/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "k_proj",
25
+ "up_proj",
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
+ "gate_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/1/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b75845af51d525de7d85dd7132eec30f6c4d36761e843449c526cf07daafd3b
3
+ size 66126768
loras/2/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/2/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj",
26
+ "v_proj",
27
+ "gate_proj",
28
+ "up_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a3824581f02be957b6e82217233d3b08b85f93632bcef1c1ff2089bf18f912
3
+ size 66126768
loras/3/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/3/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "up_proj",
25
+ "o_proj",
26
+ "k_proj",
27
+ "down_proj",
28
+ "gate_proj",
29
+ "q_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/3/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c658489b09b599292dd5a0f22a5989323e27860c6612b83ab89140b7cb2e705
3
+ size 66126768
loras/4/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/4/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "q_proj",
25
+ "down_proj",
26
+ "up_proj",
27
+ "gate_proj",
28
+ "k_proj",
29
+ "v_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/4/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccf0ec70d994ce232c05cc1a6ec47980b138f3dad85a910b56b384b00b55b939
3
+ size 66126768
loras/5/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/5/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj",
25
+ "down_proj",
26
+ "k_proj",
27
+ "up_proj",
28
+ "o_proj",
29
+ "gate_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/5/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b692ebde6d90eb0b492fde12dd51c3fca9d73e68eb9a519b01a434124010cd6
3
+ size 66126768
loras/6/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/6/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "gate_proj",
25
+ "down_proj",
26
+ "v_proj",
27
+ "up_proj",
28
+ "q_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/6/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69981b3d52a31dc4631d35e6929ee18489413365570ad287d9bf544d05de10cd
3
+ size 66126768
loras/7/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/7/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "gate_proj",
25
+ "up_proj",
26
+ "down_proj",
27
+ "k_proj",
28
+ "o_proj",
29
+ "q_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/7/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daacefa493abc4a6606c29f2f3c143369f298722066e656fa6c28a1e7bdc88b2
3
+ size 66126768
loras/8/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Thinking-2507
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
loras/8/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "up_proj",
25
+ "v_proj",
26
+ "gate_proj",
27
+ "o_proj",
28
+ "k_proj",
29
+ "down_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
loras/8/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10256aef4d2832e826f1a341849b1dc0ea60e451c74dfdb7fd17df09e8e1b1fd
3
+ size 66126768
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f4a0ea0a2e096f17d2540516d5ad87c17965d0c8478cb91850823855ca164ea
3
+ size 4967217648
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4a211ece672744d6763cc52a55d5fc491292f5865c77f6ec09dc9f72554ab82
3
+ size 3168785724
model.safetensors.index.json ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4045219145,
4
+ "total_size": 8135940388
5
+ },
6
+ "weight_map": {
7
+ "base_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "base_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "base_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "base_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "base_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "base_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "base_model.model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
14
+ "base_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "base_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "base_model.model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
17
+ "base_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
18
+ "base_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
19
+ "base_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
20
+ "base_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
21
+ "base_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
22
+ "base_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
23
+ "base_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "base_model.model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
25
+ "base_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
+ "base_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
+ "base_model.model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
28
+ "base_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
+ "base_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
30
+ "base_model.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
31
+ "base_model.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
32
+ "base_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
33
+ "base_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
34
+ "base_model.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
35
+ "base_model.model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
36
+ "base_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
37
+ "base_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
38
+ "base_model.model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
39
+ "base_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
40
+ "base_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
41
+ "base_model.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
42
+ "base_model.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
43
+ "base_model.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
44
+ "base_model.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
45
+ "base_model.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
46
+ "base_model.model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
47
+ "base_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
+ "base_model.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
49
+ "base_model.model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
50
+ "base_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
51
+ "base_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
52
+ "base_model.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
53
+ "base_model.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
54
+ "base_model.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
55
+ "base_model.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
56
+ "base_model.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
57
+ "base_model.model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
58
+ "base_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
+ "base_model.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
+ "base_model.model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
61
+ "base_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
62
+ "base_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
63
+ "base_model.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
64
+ "base_model.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
65
+ "base_model.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
+ "base_model.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
67
+ "base_model.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "base_model.model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
69
+ "base_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
70
+ "base_model.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
71
+ "base_model.model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
72
+ "base_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
73
+ "base_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
74
+ "base_model.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
75
+ "base_model.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
76
+ "base_model.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
77
+ "base_model.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
78
+ "base_model.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "base_model.model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
80
+ "base_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
+ "base_model.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
+ "base_model.model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
83
+ "base_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
84
+ "base_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
85
+ "base_model.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "base_model.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
87
+ "base_model.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
88
+ "base_model.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
89
+ "base_model.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
90
+ "base_model.model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
91
+ "base_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
92
+ "base_model.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
93
+ "base_model.model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
94
+ "base_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
95
+ "base_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
96
+ "base_model.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
+ "base_model.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
98
+ "base_model.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
99
+ "base_model.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
100
+ "base_model.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
101
+ "base_model.model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
102
+ "base_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
+ "base_model.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
+ "base_model.model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
105
+ "base_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
+ "base_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
+ "base_model.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "base_model.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
+ "base_model.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
+ "base_model.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
+ "base_model.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
+ "base_model.model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
113
+ "base_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
114
+ "base_model.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
115
+ "base_model.model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
116
+ "base_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
117
+ "base_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
118
+ "base_model.model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
119
+ "base_model.model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
120
+ "base_model.model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
121
+ "base_model.model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
+ "base_model.model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
123
+ "base_model.model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
124
+ "base_model.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
125
+ "base_model.model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
126
+ "base_model.model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
127
+ "base_model.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
128
+ "base_model.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
129
+ "base_model.model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "base_model.model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
131
+ "base_model.model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
132
+ "base_model.model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
133
+ "base_model.model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
134
+ "base_model.model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
135
+ "base_model.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
136
+ "base_model.model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
137
+ "base_model.model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
138
+ "base_model.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
+ "base_model.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
+ "base_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
+ "base_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
+ "base_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
143
+ "base_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
144
+ "base_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "base_model.model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
146
+ "base_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
+ "base_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
148
+ "base_model.model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
149
+ "base_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
150
+ "base_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "base_model.model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
152
+ "base_model.model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
153
+ "base_model.model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
+ "base_model.model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
+ "base_model.model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
+ "base_model.model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
157
+ "base_model.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
+ "base_model.model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
+ "base_model.model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
160
+ "base_model.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
+ "base_model.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
+ "base_model.model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
163
+ "base_model.model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
164
+ "base_model.model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
165
+ "base_model.model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
166
+ "base_model.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
167
+ "base_model.model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
168
+ "base_model.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
169
+ "base_model.model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
170
+ "base_model.model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
171
+ "base_model.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
172
+ "base_model.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
173
+ "base_model.model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
174
+ "base_model.model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
175
+ "base_model.model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
176
+ "base_model.model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
177
+ "base_model.model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
178
+ "base_model.model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
179
+ "base_model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
180
+ "base_model.model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
181
+ "base_model.model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
182
+ "base_model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
183
+ "base_model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
184
+ "base_model.model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
185
+ "base_model.model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
186
+ "base_model.model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
187
+ "base_model.model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
188
+ "base_model.model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
189
+ "base_model.model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
190
+ "base_model.model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
191
+ "base_model.model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
192
+ "base_model.model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
193
+ "base_model.model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
194
+ "base_model.model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
195
+ "base_model.model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
196
+ "base_model.model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
197
+ "base_model.model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
198
+ "base_model.model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
199
+ "base_model.model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
200
+ "base_model.model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
201
+ "base_model.model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
202
+ "base_model.model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
203
+ "base_model.model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
204
+ "base_model.model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
205
+ "base_model.model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
+ "base_model.model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
207
+ "base_model.model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
208
+ "base_model.model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
209
+ "base_model.model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
210
+ "base_model.model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
211
+ "base_model.model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
212
+ "base_model.model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
213
+ "base_model.model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
214
+ "base_model.model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
215
+ "base_model.model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
216
+ "base_model.model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
217
+ "base_model.model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
218
+ "base_model.model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
219
+ "base_model.model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
220
+ "base_model.model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
221
+ "base_model.model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
222
+ "base_model.model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
223
+ "base_model.model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
224
+ "base_model.model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
225
+ "base_model.model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
226
+ "base_model.model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
227
+ "base_model.model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
228
+ "base_model.model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
229
+ "base_model.model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
230
+ "base_model.model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
231
+ "base_model.model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
232
+ "base_model.model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
233
+ "base_model.model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
234
+ "base_model.model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
+ "base_model.model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
236
+ "base_model.model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
237
+ "base_model.model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
238
+ "base_model.model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
239
+ "base_model.model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
240
+ "base_model.model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
241
+ "base_model.model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
242
+ "base_model.model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
243
+ "base_model.model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
244
+ "base_model.model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
245
+ "base_model.model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
246
+ "base_model.model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
247
+ "base_model.model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
248
+ "base_model.model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
249
+ "base_model.model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
250
+ "base_model.model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
251
+ "base_model.model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
252
+ "base_model.model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
253
+ "base_model.model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
254
+ "base_model.model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
255
+ "base_model.model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
256
+ "base_model.model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
257
+ "base_model.model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
258
+ "base_model.model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
259
+ "base_model.model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
260
+ "base_model.model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
261
+ "base_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
262
+ "base_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
263
+ "base_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
+ "base_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
265
+ "base_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
266
+ "base_model.model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
267
+ "base_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
268
+ "base_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
269
+ "base_model.model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
270
+ "base_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
271
+ "base_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
272
+ "base_model.model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
+ "base_model.model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
+ "base_model.model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
+ "base_model.model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
+ "base_model.model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
+ "base_model.model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
278
+ "base_model.model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
279
+ "base_model.model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
280
+ "base_model.model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
281
+ "base_model.model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
282
+ "base_model.model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
283
+ "base_model.model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
284
+ "base_model.model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
285
+ "base_model.model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
286
+ "base_model.model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
287
+ "base_model.model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
288
+ "base_model.model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
289
+ "base_model.model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
290
+ "base_model.model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
291
+ "base_model.model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
292
+ "base_model.model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
293
+ "base_model.model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
294
+ "base_model.model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
295
+ "base_model.model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
296
+ "base_model.model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
297
+ "base_model.model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
298
+ "base_model.model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
299
+ "base_model.model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
300
+ "base_model.model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
301
+ "base_model.model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
302
+ "base_model.model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
303
+ "base_model.model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
304
+ "base_model.model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
305
+ "base_model.model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
306
+ "base_model.model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
307
+ "base_model.model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
308
+ "base_model.model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
309
+ "base_model.model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
310
+ "base_model.model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
311
+ "base_model.model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
312
+ "base_model.model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
313
+ "base_model.model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
314
+ "base_model.model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
315
+ "base_model.model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
316
+ "base_model.model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
317
+ "base_model.model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
318
+ "base_model.model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
319
+ "base_model.model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
320
+ "base_model.model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
321
+ "base_model.model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
322
+ "base_model.model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
323
+ "base_model.model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
324
+ "base_model.model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
325
+ "base_model.model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
326
+ "base_model.model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
327
+ "base_model.model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
328
+ "base_model.model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
329
+ "base_model.model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
330
+ "base_model.model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
331
+ "base_model.model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
332
+ "base_model.model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
333
+ "base_model.model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
334
+ "base_model.model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
335
+ "base_model.model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
336
+ "base_model.model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
337
+ "base_model.model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
+ "base_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
339
+ "base_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
340
+ "base_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
341
+ "base_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
342
+ "base_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
343
+ "base_model.model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
344
+ "base_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
345
+ "base_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
346
+ "base_model.model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
347
+ "base_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
348
+ "base_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
+ "base_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
350
+ "base_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
351
+ "base_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
352
+ "base_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
353
+ "base_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
354
+ "base_model.model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
355
+ "base_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
356
+ "base_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
357
+ "base_model.model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
358
+ "base_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
359
+ "base_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
360
+ "base_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
361
+ "base_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
362
+ "base_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
363
+ "base_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
364
+ "base_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
365
+ "base_model.model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
366
+ "base_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
367
+ "base_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
368
+ "base_model.model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
369
+ "base_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
370
+ "base_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
371
+ "base_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
372
+ "base_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
373
+ "base_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
374
+ "base_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
375
+ "base_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
376
+ "base_model.model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
377
+ "base_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
378
+ "base_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
379
+ "base_model.model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
380
+ "base_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
381
+ "base_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
382
+ "base_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
383
+ "base_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
384
+ "base_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
385
+ "base_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
386
+ "base_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
387
+ "base_model.model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
388
+ "base_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
389
+ "base_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
390
+ "base_model.model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
391
+ "base_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
392
+ "base_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
393
+ "base_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
394
+ "base_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
395
+ "base_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
396
+ "base_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
397
+ "base_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
398
+ "base_model.model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
399
+ "base_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
+ "base_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
401
+ "base_model.model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
402
+ "base_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
403
+ "base_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
+ "base_model.model.norm.weight": "model-00002-of-00002.safetensors",
405
+ "router_decoder.0.bias": "model-00002-of-00002.safetensors",
406
+ "router_decoder.0.weight": "model-00002-of-00002.safetensors",
407
+ "router_decoder.3.bias": "model-00002-of-00002.safetensors",
408
+ "router_decoder.3.weight": "model-00002-of-00002.safetensors",
409
+ "router_encoder.embeddings.LayerNorm.bias": "model-00002-of-00002.safetensors",
410
+ "router_encoder.embeddings.LayerNorm.weight": "model-00002-of-00002.safetensors",
411
+ "router_encoder.embeddings.position_embeddings.weight": "model-00002-of-00002.safetensors",
412
+ "router_encoder.embeddings.token_type_embeddings.weight": "model-00002-of-00002.safetensors",
413
+ "router_encoder.embeddings.word_embeddings.weight": "model-00002-of-00002.safetensors",
414
+ "router_encoder.encoder.layer.0.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
415
+ "router_encoder.encoder.layer.0.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
416
+ "router_encoder.encoder.layer.0.attention.output.dense.bias": "model-00002-of-00002.safetensors",
417
+ "router_encoder.encoder.layer.0.attention.output.dense.weight": "model-00002-of-00002.safetensors",
418
+ "router_encoder.encoder.layer.0.attention.self.key.bias": "model-00002-of-00002.safetensors",
419
+ "router_encoder.encoder.layer.0.attention.self.key.weight": "model-00002-of-00002.safetensors",
420
+ "router_encoder.encoder.layer.0.attention.self.query.bias": "model-00002-of-00002.safetensors",
421
+ "router_encoder.encoder.layer.0.attention.self.query.weight": "model-00002-of-00002.safetensors",
422
+ "router_encoder.encoder.layer.0.attention.self.value.bias": "model-00002-of-00002.safetensors",
423
+ "router_encoder.encoder.layer.0.attention.self.value.weight": "model-00002-of-00002.safetensors",
424
+ "router_encoder.encoder.layer.0.intermediate.dense.bias": "model-00002-of-00002.safetensors",
425
+ "router_encoder.encoder.layer.0.intermediate.dense.weight": "model-00002-of-00002.safetensors",
426
+ "router_encoder.encoder.layer.0.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
427
+ "router_encoder.encoder.layer.0.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
428
+ "router_encoder.encoder.layer.0.output.dense.bias": "model-00002-of-00002.safetensors",
429
+ "router_encoder.encoder.layer.0.output.dense.weight": "model-00002-of-00002.safetensors",
430
+ "router_encoder.encoder.layer.1.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
431
+ "router_encoder.encoder.layer.1.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
432
+ "router_encoder.encoder.layer.1.attention.output.dense.bias": "model-00002-of-00002.safetensors",
433
+ "router_encoder.encoder.layer.1.attention.output.dense.weight": "model-00002-of-00002.safetensors",
434
+ "router_encoder.encoder.layer.1.attention.self.key.bias": "model-00002-of-00002.safetensors",
435
+ "router_encoder.encoder.layer.1.attention.self.key.weight": "model-00002-of-00002.safetensors",
436
+ "router_encoder.encoder.layer.1.attention.self.query.bias": "model-00002-of-00002.safetensors",
437
+ "router_encoder.encoder.layer.1.attention.self.query.weight": "model-00002-of-00002.safetensors",
438
+ "router_encoder.encoder.layer.1.attention.self.value.bias": "model-00002-of-00002.safetensors",
439
+ "router_encoder.encoder.layer.1.attention.self.value.weight": "model-00002-of-00002.safetensors",
440
+ "router_encoder.encoder.layer.1.intermediate.dense.bias": "model-00002-of-00002.safetensors",
441
+ "router_encoder.encoder.layer.1.intermediate.dense.weight": "model-00002-of-00002.safetensors",
442
+ "router_encoder.encoder.layer.1.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
443
+ "router_encoder.encoder.layer.1.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
444
+ "router_encoder.encoder.layer.1.output.dense.bias": "model-00002-of-00002.safetensors",
445
+ "router_encoder.encoder.layer.1.output.dense.weight": "model-00002-of-00002.safetensors",
446
+ "router_encoder.encoder.layer.2.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
447
+ "router_encoder.encoder.layer.2.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
448
+ "router_encoder.encoder.layer.2.attention.output.dense.bias": "model-00002-of-00002.safetensors",
449
+ "router_encoder.encoder.layer.2.attention.output.dense.weight": "model-00002-of-00002.safetensors",
450
+ "router_encoder.encoder.layer.2.attention.self.key.bias": "model-00002-of-00002.safetensors",
451
+ "router_encoder.encoder.layer.2.attention.self.key.weight": "model-00002-of-00002.safetensors",
452
+ "router_encoder.encoder.layer.2.attention.self.query.bias": "model-00002-of-00002.safetensors",
453
+ "router_encoder.encoder.layer.2.attention.self.query.weight": "model-00002-of-00002.safetensors",
454
+ "router_encoder.encoder.layer.2.attention.self.value.bias": "model-00002-of-00002.safetensors",
455
+ "router_encoder.encoder.layer.2.attention.self.value.weight": "model-00002-of-00002.safetensors",
456
+ "router_encoder.encoder.layer.2.intermediate.dense.bias": "model-00002-of-00002.safetensors",
457
+ "router_encoder.encoder.layer.2.intermediate.dense.weight": "model-00002-of-00002.safetensors",
458
+ "router_encoder.encoder.layer.2.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
459
+ "router_encoder.encoder.layer.2.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
460
+ "router_encoder.encoder.layer.2.output.dense.bias": "model-00002-of-00002.safetensors",
461
+ "router_encoder.encoder.layer.2.output.dense.weight": "model-00002-of-00002.safetensors",
462
+ "router_encoder.encoder.layer.3.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
463
+ "router_encoder.encoder.layer.3.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
464
+ "router_encoder.encoder.layer.3.attention.output.dense.bias": "model-00002-of-00002.safetensors",
465
+ "router_encoder.encoder.layer.3.attention.output.dense.weight": "model-00002-of-00002.safetensors",
466
+ "router_encoder.encoder.layer.3.attention.self.key.bias": "model-00002-of-00002.safetensors",
467
+ "router_encoder.encoder.layer.3.attention.self.key.weight": "model-00002-of-00002.safetensors",
468
+ "router_encoder.encoder.layer.3.attention.self.query.bias": "model-00002-of-00002.safetensors",
469
+ "router_encoder.encoder.layer.3.attention.self.query.weight": "model-00002-of-00002.safetensors",
470
+ "router_encoder.encoder.layer.3.attention.self.value.bias": "model-00002-of-00002.safetensors",
471
+ "router_encoder.encoder.layer.3.attention.self.value.weight": "model-00002-of-00002.safetensors",
472
+ "router_encoder.encoder.layer.3.intermediate.dense.bias": "model-00002-of-00002.safetensors",
473
+ "router_encoder.encoder.layer.3.intermediate.dense.weight": "model-00002-of-00002.safetensors",
474
+ "router_encoder.encoder.layer.3.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
475
+ "router_encoder.encoder.layer.3.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
476
+ "router_encoder.encoder.layer.3.output.dense.bias": "model-00002-of-00002.safetensors",
477
+ "router_encoder.encoder.layer.3.output.dense.weight": "model-00002-of-00002.safetensors",
478
+ "router_encoder.encoder.layer.4.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
479
+ "router_encoder.encoder.layer.4.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
480
+ "router_encoder.encoder.layer.4.attention.output.dense.bias": "model-00002-of-00002.safetensors",
481
+ "router_encoder.encoder.layer.4.attention.output.dense.weight": "model-00002-of-00002.safetensors",
482
+ "router_encoder.encoder.layer.4.attention.self.key.bias": "model-00002-of-00002.safetensors",
483
+ "router_encoder.encoder.layer.4.attention.self.key.weight": "model-00002-of-00002.safetensors",
484
+ "router_encoder.encoder.layer.4.attention.self.query.bias": "model-00002-of-00002.safetensors",
485
+ "router_encoder.encoder.layer.4.attention.self.query.weight": "model-00002-of-00002.safetensors",
486
+ "router_encoder.encoder.layer.4.attention.self.value.bias": "model-00002-of-00002.safetensors",
487
+ "router_encoder.encoder.layer.4.attention.self.value.weight": "model-00002-of-00002.safetensors",
488
+ "router_encoder.encoder.layer.4.intermediate.dense.bias": "model-00002-of-00002.safetensors",
489
+ "router_encoder.encoder.layer.4.intermediate.dense.weight": "model-00002-of-00002.safetensors",
490
+ "router_encoder.encoder.layer.4.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
491
+ "router_encoder.encoder.layer.4.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
492
+ "router_encoder.encoder.layer.4.output.dense.bias": "model-00002-of-00002.safetensors",
493
+ "router_encoder.encoder.layer.4.output.dense.weight": "model-00002-of-00002.safetensors",
494
+ "router_encoder.encoder.layer.5.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
495
+ "router_encoder.encoder.layer.5.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
496
+ "router_encoder.encoder.layer.5.attention.output.dense.bias": "model-00002-of-00002.safetensors",
497
+ "router_encoder.encoder.layer.5.attention.output.dense.weight": "model-00002-of-00002.safetensors",
498
+ "router_encoder.encoder.layer.5.attention.self.key.bias": "model-00002-of-00002.safetensors",
499
+ "router_encoder.encoder.layer.5.attention.self.key.weight": "model-00002-of-00002.safetensors",
500
+ "router_encoder.encoder.layer.5.attention.self.query.bias": "model-00002-of-00002.safetensors",
501
+ "router_encoder.encoder.layer.5.attention.self.query.weight": "model-00002-of-00002.safetensors",
502
+ "router_encoder.encoder.layer.5.attention.self.value.bias": "model-00002-of-00002.safetensors",
503
+ "router_encoder.encoder.layer.5.attention.self.value.weight": "model-00002-of-00002.safetensors",
504
+ "router_encoder.encoder.layer.5.intermediate.dense.bias": "model-00002-of-00002.safetensors",
505
+ "router_encoder.encoder.layer.5.intermediate.dense.weight": "model-00002-of-00002.safetensors",
506
+ "router_encoder.encoder.layer.5.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
507
+ "router_encoder.encoder.layer.5.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
508
+ "router_encoder.encoder.layer.5.output.dense.bias": "model-00002-of-00002.safetensors",
509
+ "router_encoder.encoder.layer.5.output.dense.weight": "model-00002-of-00002.safetensors",
510
+ "router_encoder.pooler.dense.bias": "model-00002-of-00002.safetensors",
511
+ "router_encoder.pooler.dense.weight": "model-00002-of-00002.safetensors"
512
+ }
513
+ }
modeling_mola_lm.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from typing import Dict, List, Optional, Union
7
+ from transformers import (
8
+ AutoConfig, AutoTokenizer, AutoModelForCausalLM,
9
+ PretrainedConfig, PreTrainedModel, GenerationMixin
10
+ )
11
+ from transformers.models.auto import CONFIG_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING
12
+ from peft import PeftModel, LoraConfig, get_peft_model
13
+
14
# Names of the task-specific LoRA experts. The router's output logits index
# into this list, so its length must equal the router head's output size
# (MoLAConfig.num_loras). Generated instead of hand-written so the count and
# the contents cannot drift apart.
EXPERTS_LIST = [str(i) for i in range(9)]
25
+
26
+
27
class MoLAConfig(PretrainedConfig):
    """Configuration class for MoLA-LM model.

    Holds the identifiers needed to assemble the mixture: the base causal LM,
    the list of adapter (expert) names, and optional router / per-LoRA
    configuration dictionaries.
    """

    model_type = "mola_lm"

    def __init__(
        self,
        base_model_name_or_path: str = "Qwen/Qwen3-4B-Thinking-2507",
        task_labels: Optional[List[str]] = None,
        router_config: Optional[Dict] = None,
        lora_configs: Optional[Dict[str, Dict]] = None,
        **kwargs
    ):
        """
        Args:
            base_model_name_or_path: HF repo id or local path of the base LM.
            task_labels: Adapter names, one per expert; defaults to the
                module-level EXPERTS_LIST when None.
            router_config: Optional router hyperparameters (currently unused
                by the modeling code; stored for forward compatibility).
            lora_configs: Optional per-adapter LoRA configuration dicts.
            **kwargs: Forwarded to PretrainedConfig.
        """
        super().__init__(**kwargs)
        self.base_model_name_or_path = base_model_name_or_path
        # `or` also replaces an explicitly-passed empty list with the default.
        self.task_labels = task_labels or EXPERTS_LIST
        self.router_config = router_config or {}
        self.lora_configs = lora_configs or {}
        # Router head output size is derived from the label list.
        self.num_loras = len(self.task_labels)
46
+
47
+
48
class MoLAForCausalLM(PreTrainedModel, GenerationMixin):
    """
    MoLA Language Model for Causal Language Modeling - AutoModel Compatible.

    Wraps a base causal LM, a set of LoRA adapters loaded into a single PEFT
    wrapper, and a small text-classification router (frozen MiniLM encoder +
    trainable MLP head) that picks which adapter to activate for each prompt.
    """

    config_class = MoLAConfig
    base_model_prefix = "mola_model"  # Avoid recursion by using unique prefix
    supports_gradient_checkpointing = True
56
+
57
    def __init__(self, config):
        """Build the full mixture: base LM, tokenizer, router, and adapters.

        Initialization order matters: the router must exist before adapters
        load (both update instance state), and `_load_router_weights` runs
        last so it can overwrite the freshly initialized router head.
        """
        super().__init__(config)
        self.config = config

        # Store model path for loading resources (adapters, router weights).
        # Set by from_pretrained; None when constructed directly from a config.
        self.model_path = getattr(config, '_name_or_path', None)

        # Load base model (attribute name matches base_model_prefix)
        print(f"Loading base model: {self.config.base_model_name_or_path}")
        self.mola_model = AutoModelForCausalLM.from_pretrained(
            self.config.base_model_name_or_path,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None
        )

        # Load tokenizer: prefer the one shipped with this checkpoint so any
        # chat-template customizations are preserved; fall back to the base.
        if self.model_path:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.base_model_name_or_path)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Initialize router (frozen encoder + trainable decoder head)
        self._init_router()

        # Initialize current model state (will be updated by _load_lora_adapters)
        self._current_lora = None
        self._current_adapted_model = self.mola_model

        # Load LoRA configurations and adapters (this will update _current_adapted_model)
        self._load_lora_adapters()

        # Initialize device property (needed for PreTrainedModel compatibility)
        self._device = next(self.mola_model.parameters()).device

        # Load router weights if available (overwrites the random head)
        self._load_router_weights()

        print("MoLA-LM initialized successfully!")
98
+
99
    def _load_router_weights(self):
        """Load the trained router-decoder weights from ``router_weights.pth``.

        The checkpoint is searched next to the model (local directory) or
        downloaded from the Hugging Face Hub. Keys prefixed ``encoder.`` are
        skipped: the MiniLM encoder is frozen and comes pretrained, so only
        the decoder head is restored. All failures are non-fatal — the router
        then keeps its random initialization, which degrades adapter selection
        but does not break generation.
        """
        if self.model_path:
            try:
                # Handle both local and Hub paths for router weights
                if os.path.exists(self.model_path):
                    # Local path
                    router_weights_path = os.path.join(self.model_path, "router_weights.pth")
                    if os.path.exists(router_weights_path):
                        # NOTE(review): torch.load without weights_only=True
                        # unpickles arbitrary objects — acceptable for a
                        # first-party checkpoint, worth tightening if the
                        # path/repo id is user-supplied.
                        checkpoint = torch.load(router_weights_path, map_location='cpu')
                    else:
                        print("⚠️ No router weights found locally")
                        return
                else:
                    # Hub path - download router weights
                    try:
                        from huggingface_hub import hf_hub_download
                        router_weights_path = hf_hub_download(
                            repo_id=self.model_path,
                            filename="router_weights.pth",
                            local_files_only=False
                        )
                        checkpoint = torch.load(router_weights_path, map_location='cpu')
                        print("📥 Downloaded router weights from Hub")
                    except Exception as hub_e:
                        print(f"⚠️ Failed to download router weights from Hub: {hub_e}")
                        print("🔄 Router will use random initialization (reduced performance)")
                        return

                # Keep only decoder-head weights; encoder is restored pretrained.
                router_state_dict = {}
                for key, value in checkpoint.items():
                    if not key.startswith('encoder.'):  # Skip encoder weights
                        router_state_dict[key] = value

                if router_state_dict:
                    # strict=False tolerates key-name drift between training
                    # and this nn.Sequential definition.
                    self.router_decoder.load_state_dict(router_state_dict, strict=False)
                    print("✅ Loaded router weights successfully!")

                    # Verify weights loaded by checking if they're not all zeros
                    first_layer = next(iter(self.router_decoder.parameters()))
                    if torch.all(first_layer == 0):
                        print("⚠️ Warning: Router weights appear to be zero-initialized")
                    else:
                        print("🎯 Router weights verified - non-zero values detected")
                else:
                    print("⚠️ No valid router weights found in checkpoint")

            except Exception as e:
                print(f"❌ Failed to load router weights: {e}")
                print("🔄 Router will use random initialization (reduced performance)")
150
+
151
+ def _init_router(self):
152
+ """Initialize the router model for LoRA selection."""
153
+ try:
154
+ from transformers import AutoModel
155
+
156
+ print("Initializing router components...")
157
+ # Router components
158
+ self.router_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
159
+ self.router_encoder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
160
+
161
+ # Freeze encoder
162
+ for param in self.router_encoder.parameters():
163
+ param.requires_grad = False
164
+
165
+ # Router decoder
166
+ encoder_dim = self.router_encoder.config.hidden_size
167
+ self.router_decoder = nn.Sequential(
168
+ nn.Linear(encoder_dim, 96),
169
+ nn.ReLU(),
170
+ nn.Dropout(0.2),
171
+ nn.Linear(96, self.config.num_loras)
172
+ )
173
+
174
+ # Move router to device
175
+ if torch.cuda.is_available():
176
+ self.router_encoder = self.router_encoder.cuda()
177
+ self.router_decoder = self.router_decoder.cuda()
178
+
179
+ print("Router initialized successfully!")
180
+
181
+ except ImportError as e:
182
+ raise ImportError(f"Required dependencies not found: {e}")
183
+
184
    def _load_lora_adapters(self):
        """Load LoRA adapters using PEFT (single wrapper, multiple adapters).

        The first adapter creates the PeftModel wrapper around the base LM;
        the remaining adapters are loaded into the same wrapper under their
        task names so switching is a cheap ``set_adapter`` call. On total
        failure the instance falls back to the bare base model.
        """
        from huggingface_hub import hf_hub_download

        if not self.model_path:
            # NOTE(review): this early return leaves self.lora_models unset.
            print("No model path specified, skipping LoRA loading")
            return

        print("Loading LoRA adapters (single wrapper)...")

        # Get the first adapter to create the initial PEFT wrapper
        first_adapter = str(self.config.task_labels[0])
        first_lora_path = None

        try:
            # Handle both local and Hub paths for first adapter
            if os.path.exists(self.model_path):
                # Local path
                first_lora_path = os.path.join(self.model_path, "loras", first_adapter)
                if not os.path.exists(first_lora_path):
                    raise FileNotFoundError(f"First adapter directory not found: {first_lora_path}")
            else:
                # Hub path - download first adapter
                try:
                    # Download first adapter to get local path.
                    # NOTE(review): only adapter_model.safetensors is fetched
                    # explicitly — assumes adapter_config.json lands in the
                    # same cache directory (e.g. already cached); verify.
                    adapter_file = hf_hub_download(
                        repo_id=self.model_path,
                        filename=f"loras/{first_adapter}/adapter_model.safetensors"
                    )
                    first_lora_path = os.path.dirname(adapter_file)
                    print(f"Downloaded first adapter to: {first_lora_path}")
                except Exception as e:
                    raise Exception(f"Failed to download first adapter {first_adapter}: {e}")

            # Create the initial PEFT wrapper with unique adapter name
            peft_model = PeftModel.from_pretrained(
                self.mola_model,
                first_lora_path,
                adapter_name=first_adapter,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
            )
            print(f"✅ Loaded first LoRA: {first_adapter}")

            # Load remaining adapters into the same wrapper with unique names
            for task_name in self.config.task_labels[1:]:
                try:
                    lora_path = None

                    if os.path.exists(self.model_path):
                        # Local path
                        lora_path = os.path.join(self.model_path, "loras", task_name)
                        if not os.path.exists(lora_path):
                            print(f"⚠️ LoRA directory not found: {lora_path}")
                            continue
                    else:
                        # Hub path - download adapter
                        try:
                            adapter_file = hf_hub_download(
                                repo_id=self.model_path,
                                filename=f"loras/{task_name}/adapter_model.safetensors"
                            )
                            lora_path = os.path.dirname(adapter_file)
                        except Exception as e:
                            print(f"❌ Failed to download LoRA {task_name}: {e}")
                            continue

                    # Load adapter into the same PEFT model with unique name
                    peft_model.load_adapter(lora_path, adapter_name=task_name)
                    print(f"✅ Loaded LoRA: {task_name}")

                except Exception as e:
                    print(f"❌ Failed to load LoRA {task_name}: {e}")

        # All task names map to the ONE shared wrapper, not separate models.
            self.lora_models = {str(name): peft_model for name in self.config.task_labels}
            self._current_lora = first_adapter
            self._current_adapted_model = peft_model

            print(f"Loaded {len(self.config.task_labels)} LoRA adapters into one PEFT model.")
            print(f"Available adapter names: {list(peft_model.peft_config.keys())}")

        except Exception as e:
            # Fall back to the bare base model; generation still works.
            print(f"❌ Failed to initialize LoRA loading: {e}")
            self.lora_models = {}
            self._current_adapted_model = self.mola_model
            self._current_lora = None
270
+
271
    def predict_best_lora(self, text: str) -> str:
        """Predict the best LoRA adapter for given text.

        Encodes *text* with the frozen MiniLM encoder, mean-pools the token
        embeddings, scores them with the router head, and returns the
        argmax entry of ``self.config.task_labels``.
        """
        # Set models to eval mode (disables the head's Dropout)
        self.router_encoder.eval()
        self.router_decoder.eval()

        # Encode text (single-element batch)
        inputs = self.router_tokenizer(
            [text],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # Move to device
        device = next(self.router_decoder.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.router_encoder(**inputs)
            # NOTE(review): the mean runs over all positions without using
            # the attention mask. Harmless here (batch of one → no padding),
            # but would include pad tokens if this were ever batched.
            embeddings = outputs.last_hidden_state.mean(dim=1)
            logits = self.router_decoder(embeddings)

        # Pick the highest-scoring adapter
        best_idx = torch.argmax(logits, dim=-1).item()
        predicted_label = self.config.task_labels[best_idx]

        return predicted_label
305
+
306
+ def _apply_lora(self, lora_name: str):
307
+ """Apply the selected LoRA adapter using set_adapter."""
308
+ if hasattr(self, '_current_adapted_model') and isinstance(self._current_adapted_model, PeftModel):
309
+ # Check if the adapter exists in the PEFT model
310
+ if str(lora_name) in self._current_adapted_model.peft_config:
311
+ if lora_name != self._current_lora:
312
+ self._current_adapted_model.set_adapter(str(lora_name))
313
+ self._current_lora = str(lora_name)
314
+ # print(f"🎯 Applied LoRA: {lora_name}") # Uncomment for debugging
315
+ else:
316
+ print(f"⚠️ LoRA adapter '{lora_name}' not found in PEFT model. Available: {list(self._current_adapted_model.peft_config.keys())}")
317
+ # Keep current adapter if requested one doesn't exist
318
+ else:
319
+ # Fallback to base model if no PEFT model available
320
+ self._current_adapted_model = self.mola_model
321
+ self._current_lora = None
322
+ print(f"⚠️ No PEFT model available, using base model")
323
+
324
    def get_available_loras(self) -> List[str]:
        """Get list of available LoRA adapter names.

        NOTE(review): a second ``get_available_loras`` is defined later in
        this class body; in Python the later definition wins, so this
        PEFT-aware variant is effectively shadowed and never called.
        """
        if hasattr(self, '_current_adapted_model') and isinstance(self._current_adapted_model, PeftModel):
            return list(self._current_adapted_model.peft_config.keys())
        else:
            return []
330
+
331
+ def test_adapter_uniqueness(self, layer_name: str = "base_model.model.model.layers.33.mlp.down_proj"):
332
+ """
333
+ Regression test to verify that adapters have different weights.
334
+
335
+ Args:
336
+ layer_name: The layer to test (default is a common MLP layer)
337
+
338
+ Returns:
339
+ Dict[str, str]: Mapping of adapter names to their weight hashes
340
+ """
341
+ import hashlib
342
+
343
+ if not hasattr(self, '_current_adapted_model') or not isinstance(self._current_adapted_model, PeftModel):
344
+ print("⚠️ No PEFT model available for testing")
345
+ return {}
346
+
347
+ names = self.get_available_loras()
348
+ if len(names) <= 1:
349
+ print(f"⚠️ Need at least 2 adapters for uniqueness test, found {len(names)}")
350
+ return {}
351
+
352
+ def fused_sha(adapter_name, layer_name):
353
+ """Compute SHA256 hash of fused LoRA weights for given adapter and layer."""
354
+ # Switch to the adapter
355
+ self._apply_lora(adapter_name)
356
+
357
+ # Navigate to the specified layer
358
+ try:
359
+ mod = self._current_adapted_model
360
+ for part in layer_name.split("."):
361
+ if part:
362
+ mod = getattr(mod, part)
363
+
364
+ # Get LoRA components
365
+ if not hasattr(mod, 'lora_A') or not hasattr(mod, 'lora_B'):
366
+ print(f"⚠️ Layer {layer_name} doesn't have LoRA components")
367
+ return "no_lora"
368
+
369
+ # Get the adapter name (should be the same as what we set)
370
+ adapter_key = next(iter(mod.lora_A.keys()))
371
+ A = mod.lora_A[adapter_key].weight
372
+ B = mod.lora_B[adapter_key].weight
373
+ s = float(mod.scaling[adapter_key])
374
+
375
+ # Compute fused weights: ΔW = (B @ A) * scaling
376
+ dW = (B @ A) * s
377
+
378
+ # Convert to bytes and hash
379
+ tensor_bytes = dW.detach().to("cpu", dtype=torch.float32).contiguous().numpy().tobytes()
380
+ return hashlib.sha256(tensor_bytes).hexdigest()[:16]
381
+
382
+ except Exception as e:
383
+ print(f"❌ Error computing hash for {adapter_name}: {e}")
384
+ return f"error_{adapter_name}"
385
+
386
+ print(f"🧪 Testing adapter uniqueness on layer: {layer_name}")
387
+ hashes = {}
388
+ for adapter_name in names:
389
+ hash_val = fused_sha(adapter_name, layer_name)
390
+ hashes[adapter_name] = hash_val
391
+ print(f" {adapter_name}: {hash_val}")
392
+
393
+ # Check uniqueness
394
+ unique_hashes = set(hashes.values())
395
+ if len(unique_hashes) == len(names):
396
+ print("✅ All adapters have unique weights!")
397
+ else:
398
+ print(f"❌ Found duplicate weights! {len(names)} adapters but only {len(unique_hashes)} unique hashes")
399
+ # Show which ones are identical
400
+ from collections import defaultdict
401
+ hash_to_adapters = defaultdict(list)
402
+ for adapter, hash_val in hashes.items():
403
+ hash_to_adapters[hash_val].append(adapter)
404
+
405
+ for hash_val, adapter_list in hash_to_adapters.items():
406
+ if len(adapter_list) > 1:
407
+ print(f" Identical weights (hash {hash_val}): {adapter_list}")
408
+
409
+ return hashes
410
+
411
    def generate(self, input_ids=None, attention_mask=None, **kwargs):
        """
        Standard generate method with automatic LoRA selection.
        Works exactly like any other LLM's generate method.

        The prompt is decoded back to text, chat-template markup is stripped
        heuristically, the router picks the best adapter, and generation is
        delegated to the currently adapted model. Router failures fall back
        to the bare base model rather than raising.
        """
        # If we have input_ids, predict and apply the best LoRA
        if input_ids is not None and hasattr(self, 'tokenizer'):
            try:
                # Decode the input to get the text for LoRA prediction
                if len(input_ids.shape) > 1:
                    # Batch input - route using the first item only
                    text_input = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
                else:
                    text_input = self.tokenizer.decode(input_ids, skip_special_tokens=True)

                # Clean the text thoroughly to remove ALL chat template artifacts
                import re

                # Markers built by concatenation so this source survives being
                # embedded in contexts where the literal tokens are special.
                start_pattern = '<' + '|im_start|' + '>user'
                end_pattern = '<' + '|im_end|' + '>'

                # First, try to extract just the user's actual question/prompt
                if start_pattern in text_input and end_pattern in text_input:
                    start_idx = text_input.find(start_pattern) + len(start_pattern)
                    end_idx = text_input.find(end_pattern, start_idx)
                    if end_idx > start_idx:
                        text_input = text_input[start_idx:end_idx].strip()

                # Clean up any remaining template artifacts
                text_input = text_input.replace('<|im_start|>', '')
                text_input = text_input.replace('<|im_end|>', '')
                # NOTE(review): these remove the words ANYWHERE in the text,
                # not just as role markers (e.g. "username" → "name"). Only
                # the router sees this cleaned text, so generation output is
                # unaffected, but routing accuracy may suffer — confirm.
                text_input = text_input.replace('system', '')
                text_input = text_input.replace('user', '')
                text_input = text_input.replace('assistant', '')

                # Drop default Qwen system-message lines before routing
                if 'You are Qwen' in text_input:
                    lines = text_input.split('\n')
                    lines = [line for line in lines if 'You are' not in line and 'Alibaba' not in line]
                    text_input = ' '.join(lines)

                # Final cleanup
                text_input = re.sub(r'\n+', ' ', text_input)  # Replace newlines with spaces
                text_input = re.sub(r'\s+', ' ', text_input)  # Normalize whitespace
                text_input = text_input.strip()

                # Predict and apply best LoRA
                best_lora = self.predict_best_lora(text_input)
                self._apply_lora(best_lora)

            except Exception as e:
                # If LoRA prediction fails, use base model (best-effort routing)
                self._current_adapted_model = self.mola_model
                self._current_lora = None

        # Use the currently adapted model for generation
        return self._current_adapted_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs
        )
479
+
480
    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Forward pass through the currently adapted (or base) model."""
        return self._current_adapted_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)

    def __call__(self, *args, **kwargs):
        """Make the model callable by delegating to the adapted model.

        NOTE(review): overriding ``__call__`` bypasses ``nn.Module.__call__``
        on this wrapper, so hooks registered on the MoLA object itself will
        not fire — confirm this is intentional.
        """
        return self._current_adapted_model(*args, **kwargs)

    def get_input_embeddings(self):
        """Get the input embeddings from the currently adapted model."""
        return self._current_adapted_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Set the input embeddings on both the adapted and base model."""
        self._current_adapted_model.set_input_embeddings(value)
        # Also set for base model to keep them in sync
        self.mola_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        """Get the output embeddings from the currently adapted model."""
        return self._current_adapted_model.get_output_embeddings()

    def set_output_embeddings(self, value):
        """Set the output embeddings on both the adapted and base model."""
        self._current_adapted_model.set_output_embeddings(value)
        # Also set for base model to keep them in sync
        self.mola_model.set_output_embeddings(value)

    def tie_weights(self):
        """Tie input and output embeddings on the adapted model."""
        self._current_adapted_model.tie_weights()

    def resize_token_embeddings(self, new_num_tokens):
        """Resize token embeddings on the adapted model."""
        return self._current_adapted_model.resize_token_embeddings(new_num_tokens)

    @property
    def device(self):
        """Device of the model, taken from the base model's parameters."""
        return next(self.mola_model.parameters()).device
520
+
521
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load model from pretrained path (transformers compatibility).

        NOTE(review): extra ``**kwargs`` (e.g. ``device_map`` as shown in the
        README) are forwarded only to the config loader, not to model
        construction — verify whether callers expect them to affect the base
        model.
        """
        # Load config
        config = MoLAConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        # Store the path so __init__ can locate adapters and router weights
        config._name_or_path = pretrained_model_name_or_path
        return cls(config)
529
+
530
    def save_pretrained(self, save_directory, **kwargs):
        """Save model using standard transformers approach.

        Writes the config, tokenizer, base-model weights, and the router
        *decoder* weights (the frozen MiniLM encoder is not saved — the
        loader skips ``encoder.`` keys and re-downloads it pretrained).
        NOTE(review): LoRA adapter weights are not re-saved here — confirm
        that adapters are published separately under ``loras/``.
        """
        # Accept standard transformers parameters but use the ones we need
        max_shard_size = kwargs.get('max_shard_size', "5GB")
        safe_serialization = kwargs.get('safe_serialization', True)

        os.makedirs(save_directory, exist_ok=True)

        # Save config using transformers method
        self.config.save_pretrained(save_directory)

        # Save tokenizer if available
        if hasattr(self, 'tokenizer'):
            self.tokenizer.save_pretrained(save_directory)

        # Save the base model with proper sharding if needed
        try:
            # Use the base model's save_pretrained with the parameters
            self.mola_model.save_pretrained(
                save_directory,
                max_shard_size=max_shard_size,
                safe_serialization=safe_serialization
            )
        except Exception as e:
            print(f"Warning: Could not save base model weights: {e}")
            # Fallback: just save the config and tokenizer
            pass

        # Save router weights if they exist (decoder head only, see docstring)
        try:
            if hasattr(self, 'router_decoder'):
                router_state_dict = self.router_decoder.state_dict()
                torch.save(router_state_dict, os.path.join(save_directory, "router_weights.pth"))
        except Exception as e:
            print(f"Warning: Could not save router weights: {e}")

        print(f"Model saved to {save_directory}")
567
+
568
+ def get_current_lora(self) -> str:
569
+ """Get the currently applied LoRA adapter name."""
570
+ return self._current_lora or "base_model"
571
+
572
+ def get_available_loras(self) -> List[str]:
573
+ """Get list of available LoRA adapters."""
574
+ return list(self.lora_models.keys())
575
+
576
+
577
+ # For transformers AutoModel registration
578
# For transformers AutoModel registration
def _load_mola_model(model_path, **kwargs):
    """Helper function to load MoLA model (thin wrapper kept for AutoModel
    registration hooks; delegates to MoLAForCausalLM.from_pretrained)."""
    return MoLAForCausalLM.from_pretrained(model_path, **kwargs)
581
+
582
+
583
+ # Register with transformers AutoModel system
584
# Register with transformers AutoModel system.
# NOTE: runs at import time with side-effect prints; registration failures
# are non-fatal because the model can always be loaded directly.
try:
    CONFIG_MAPPING.register("mola_lm", MoLAConfig)
    MODEL_FOR_CAUSAL_LM_MAPPING.register(MoLAConfig, MoLAForCausalLM)
    print("✅ Successfully registered MoLA-LM with AutoModel!")
except Exception as e:
    print(f"⚠️ AutoModel registration failed: {e}")
    # Try alternative registration for backwards compatibility
    try:
        from transformers import AutoConfig, AutoModelForCausalLM
        AutoConfig.register("mola_lm", MoLAConfig)
        AutoModelForCausalLM.register(MoLAConfig, MoLAForCausalLM)
        print("✅ Successfully registered MoLA-LM with legacy method!")
    except Exception as e2:
        print(f"⚠️ Legacy registration also failed: {e2}")
        print("Model can still be loaded directly with MoLAForCausalLM.from_pretrained()")
router_weights.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd4844ec0ad964c75599d93e5a20089f7ea41bc5e31a551d99b49665ba6ab7a8
3
+ size 153763
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 262144,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff