Upload MoLA-LM: Mixture of LoRA Adapters Language Model
Browse files- .gitattributes +1 -0
- README.md +74 -0
- __init__.py +8 -0
- added_tokens.json +28 -0
- chat_template.jinja +86 -0
- config.json +24 -0
- configuration_mola_lm.py +39 -0
- loras/0/README.md +202 -0
- loras/0/adapter_config.json +34 -0
- loras/0/adapter_model.safetensors +3 -0
- loras/1/README.md +202 -0
- loras/1/adapter_config.json +34 -0
- loras/1/adapter_model.safetensors +3 -0
- loras/2/README.md +202 -0
- loras/2/adapter_config.json +34 -0
- loras/2/adapter_model.safetensors +3 -0
- loras/3/README.md +202 -0
- loras/3/adapter_config.json +34 -0
- loras/3/adapter_model.safetensors +3 -0
- loras/4/README.md +202 -0
- loras/4/adapter_config.json +34 -0
- loras/4/adapter_model.safetensors +3 -0
- loras/5/README.md +202 -0
- loras/5/adapter_config.json +34 -0
- loras/5/adapter_model.safetensors +3 -0
- loras/6/README.md +202 -0
- loras/6/adapter_config.json +34 -0
- loras/6/adapter_model.safetensors +3 -0
- loras/7/README.md +202 -0
- loras/7/adapter_config.json +34 -0
- loras/7/adapter_model.safetensors +3 -0
- loras/8/README.md +202 -0
- loras/8/adapter_config.json +34 -0
- loras/8/adapter_model.safetensors +3 -0
- merges.txt +0 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +513 -0
- modeling_mola_lm.py +598 -0
- router_weights.pth +3 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +239 -0
- vocab.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
library_name: transformers
|
4 |
+
tags:
|
5 |
+
- pytorch
|
6 |
+
- mixture-of-experts
|
7 |
+
- lora
|
8 |
+
- adapter
|
9 |
+
- causal-lm
|
10 |
+
- text-generation
|
11 |
+
language:
|
12 |
+
- en
|
13 |
+
pipeline_tag: text-generation
|
14 |
+
---
|
15 |
+
|
16 |
+
Image here
|
17 |
+
|
18 |
+
# MoLA-LM: Mixture of LoRA Adapters LLM
|
19 |
+
|
20 |
+
MoLA-LM combines multiple LoRA adapters with an intelligent router to automatically select the best adapter for each input prompt. This approach enables specialized performance across different tasks while maintaining efficiency.
|
21 |
+
|
22 |
+
Evals are coming...
|
23 |
+
|
24 |
+
## Model Details
|
25 |
+
|
26 |
+
- **Model Type**: Mixture of LoRA Adapters Language Model
|
27 |
+
- **Base Model**: Qwen/Qwen3-4B-Thinking-2507
|
28 |
+
- **Total Adapters**: 9
|
29 |
+
- **Architecture**: Custom MoLAForCausalLM with automatic adapter routing
|
30 |
+
|
31 |
+
## Usage
|
32 |
+
|
33 |
+
```python
|
34 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
35 |
+
|
36 |
+
# Load the model (trust_remote_code=True is required for custom architecture)
|
37 |
+
model = AutoModelForCausalLM.from_pretrained(
|
38 |
+
"MoLA-LLM/MoLA-v0.5-9x4b",
|
39 |
+
trust_remote_code=True,
|
40 |
+
device_map="auto"
|
41 |
+
)
|
42 |
+
tokenizer = AutoTokenizer.from_pretrained("MoLA-LLM/MoLA-v0.5-9x4b", trust_remote_code=True)
|
43 |
+
|
44 |
+
# Use like any other language model - adapter selection is automatic
|
45 |
+
prompt = "Write a Python function to calculate fibonacci numbers"
|
46 |
+
messages = [{"role": "user", "content": prompt}]
|
47 |
+
inputs = tokenizer.apply_chat_template(
|
48 |
+
messages,
|
49 |
+
add_generation_prompt=True,
|
50 |
+
tokenize=True,
|
51 |
+
return_dict=True,
|
52 |
+
return_tensors="pt",
|
53 |
+
).to(model.device)
|
54 |
+
|
55 |
+
outputs = model.generate(**inputs, max_new_tokens=8192, temperature=.6, do_sample=True)
|
56 |
+
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
|
57 |
+
|
58 |
+
print(f"Selected LoRA: {model.get_current_lora()}")
|
59 |
+
print(response)
|
60 |
+
```
|
61 |
+
*You can also use load_in_4bit and load_in_8bit directly when loading!*
|
62 |
+
|
63 |
+
## Architecture
|
64 |
+
|
65 |
+
The MoLA-LM architecture consists of:
|
66 |
+
|
67 |
+
1. **Base Model**: Qwen/Qwen3-4B-Thinking-2507
|
68 |
+
2. **Router Network**: Frozen encoder as Sentence transformer + decoder as one layer MLP for adapter selection
|
69 |
+
3. **LoRA Adapters**: 9 task-specific fine-tuned adapters
|
70 |
+
4. **Dynamic Switching**: Automatic adapter application based on input
|
71 |
+
|
72 |
+
---
|
73 |
+
|
74 |
+
## *Paper coming soon™*
|
__init__.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
MoLA-LM: Mixture of LoRA Adapters Language Model
|
3 |
+
"""
|
4 |
+
|
5 |
+
from .configuration_mola_lm import MoLAConfig
|
6 |
+
from .modeling_mola_lm import MoLAForCausalLM
|
7 |
+
|
8 |
+
__all__ = ["MoLAConfig", "MoLAForCausalLM"]
|
added_tokens.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"</think>": 151668,
|
3 |
+
"</tool_call>": 151658,
|
4 |
+
"</tool_response>": 151666,
|
5 |
+
"<think>": 151667,
|
6 |
+
"<tool_call>": 151657,
|
7 |
+
"<tool_response>": 151665,
|
8 |
+
"<|box_end|>": 151649,
|
9 |
+
"<|box_start|>": 151648,
|
10 |
+
"<|endoftext|>": 151643,
|
11 |
+
"<|file_sep|>": 151664,
|
12 |
+
"<|fim_middle|>": 151660,
|
13 |
+
"<|fim_pad|>": 151662,
|
14 |
+
"<|fim_prefix|>": 151659,
|
15 |
+
"<|fim_suffix|>": 151661,
|
16 |
+
"<|im_end|>": 151645,
|
17 |
+
"<|im_start|>": 151644,
|
18 |
+
"<|image_pad|>": 151655,
|
19 |
+
"<|object_ref_end|>": 151647,
|
20 |
+
"<|object_ref_start|>": 151646,
|
21 |
+
"<|quad_end|>": 151651,
|
22 |
+
"<|quad_start|>": 151650,
|
23 |
+
"<|repo_name|>": 151663,
|
24 |
+
"<|video_pad|>": 151656,
|
25 |
+
"<|vision_end|>": 151653,
|
26 |
+
"<|vision_pad|>": 151654,
|
27 |
+
"<|vision_start|>": 151652
|
28 |
+
}
|
chat_template.jinja
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{%- if tools %}
|
2 |
+
{{- '<|im_start|>system\n' }}
|
3 |
+
{%- if messages[0].role == 'system' %}
|
4 |
+
{{- messages[0].content + '\n\n' }}
|
5 |
+
{%- endif %}
|
6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
7 |
+
{%- for tool in tools %}
|
8 |
+
{{- "\n" }}
|
9 |
+
{{- tool | tojson }}
|
10 |
+
{%- endfor %}
|
11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
12 |
+
{%- else %}
|
13 |
+
{%- if messages[0].role == 'system' %}
|
14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
15 |
+
{%- endif %}
|
16 |
+
{%- endif %}
|
17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
18 |
+
{%- for message in messages[::-1] %}
|
19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
20 |
+
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
21 |
+
{%- set ns.multi_step_tool = false %}
|
22 |
+
{%- set ns.last_query_index = index %}
|
23 |
+
{%- endif %}
|
24 |
+
{%- endfor %}
|
25 |
+
{%- for message in messages %}
|
26 |
+
{%- if message.content is string %}
|
27 |
+
{%- set content = message.content %}
|
28 |
+
{%- else %}
|
29 |
+
{%- set content = '' %}
|
30 |
+
{%- endif %}
|
31 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
32 |
+
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
33 |
+
{%- elif message.role == "assistant" %}
|
34 |
+
{%- set reasoning_content = '' %}
|
35 |
+
{%- if message.reasoning_content is string %}
|
36 |
+
{%- set reasoning_content = message.reasoning_content %}
|
37 |
+
{%- else %}
|
38 |
+
{%- if '</think>' in content %}
|
39 |
+
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
40 |
+
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
41 |
+
{%- endif %}
|
42 |
+
{%- endif %}
|
43 |
+
{%- if loop.index0 > ns.last_query_index %}
|
44 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
45 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
46 |
+
{%- else %}
|
47 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
48 |
+
{%- endif %}
|
49 |
+
{%- else %}
|
50 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
51 |
+
{%- endif %}
|
52 |
+
{%- if message.tool_calls %}
|
53 |
+
{%- for tool_call in message.tool_calls %}
|
54 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
55 |
+
{{- '\n' }}
|
56 |
+
{%- endif %}
|
57 |
+
{%- if tool_call.function %}
|
58 |
+
{%- set tool_call = tool_call.function %}
|
59 |
+
{%- endif %}
|
60 |
+
{{- '<tool_call>\n{"name": "' }}
|
61 |
+
{{- tool_call.name }}
|
62 |
+
{{- '", "arguments": ' }}
|
63 |
+
{%- if tool_call.arguments is string %}
|
64 |
+
{{- tool_call.arguments }}
|
65 |
+
{%- else %}
|
66 |
+
{{- tool_call.arguments | tojson }}
|
67 |
+
{%- endif %}
|
68 |
+
{{- '}\n</tool_call>' }}
|
69 |
+
{%- endfor %}
|
70 |
+
{%- endif %}
|
71 |
+
{{- '<|im_end|>\n' }}
|
72 |
+
{%- elif message.role == "tool" %}
|
73 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
74 |
+
{{- '<|im_start|>user' }}
|
75 |
+
{%- endif %}
|
76 |
+
{{- '\n<tool_response>\n' }}
|
77 |
+
{{- content }}
|
78 |
+
{{- '\n</tool_response>' }}
|
79 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
80 |
+
{{- '<|im_end|>\n' }}
|
81 |
+
{%- endif %}
|
82 |
+
{%- endif %}
|
83 |
+
{%- endfor %}
|
84 |
+
{%- if add_generation_prompt %}
|
85 |
+
{{- '<|im_start|>assistant\n<think>\n' }}
|
86 |
+
{%- endif %}
|
config.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"MoLAForCausalLM"
|
4 |
+
],
|
5 |
+
"auto_map": {
|
6 |
+
"AutoConfig": "configuration_mola_lm.MoLAConfig",
|
7 |
+
"AutoModelForCausalLM": "modeling_mola_lm.MoLAForCausalLM"
|
8 |
+
},
|
9 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
10 |
+
"task_labels": [
|
11 |
+
"0",
|
12 |
+
"1",
|
13 |
+
"2",
|
14 |
+
"3",
|
15 |
+
"4",
|
16 |
+
"5",
|
17 |
+
"6",
|
18 |
+
"7",
|
19 |
+
"8"
|
20 |
+
],
|
21 |
+
"num_loras": 9,
|
22 |
+
"model_type": "mola_lm",
|
23 |
+
"transformers_version": "4.36.0"
|
24 |
+
}
|
configuration_mola_lm.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Configuration class for MoLA-LM
"""

from transformers import PretrainedConfig
from typing import Dict, List, Optional

# Default expert (LoRA adapter) labels. One entry per adapter directory
# under ``loras/``; kept as strings to match the on-disk folder names.
EXPERTS_LIST = [
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
]


class MoLAConfig(PretrainedConfig):
    """Configuration class for MoLA-LM model.

    Args:
        base_model_name_or_path: Hub id or local path of the base causal LM
            that the LoRA adapters were trained on.
        task_labels: Ordered list of adapter labels; one label per LoRA
            adapter. Defaults to a copy of ``EXPERTS_LIST``.
        router_config: Free-form settings for the router network.
        lora_configs: Mapping from adapter label to its PEFT LoRA config.
        **kwargs: Forwarded to ``PretrainedConfig`` (e.g. ``architectures``).

    Attributes:
        num_loras: Number of adapters, derived from ``len(task_labels)``.
    """

    model_type = "mola_lm"

    def __init__(
        self,
        base_model_name_or_path: str = "Qwen/Qwen2.5-3B-Instruct",
        task_labels: Optional[List[str]] = None,
        router_config: Optional[Dict] = None,
        lora_configs: Optional[Dict[str, Dict]] = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.base_model_name_or_path = base_model_name_or_path
        # Copy the defaults so instances never alias the module-level
        # EXPERTS_LIST (mutating one config must not affect others) and
        # never share a caller-owned list/dict.
        self.task_labels = list(task_labels) if task_labels else list(EXPERTS_LIST)
        self.router_config = dict(router_config) if router_config else {}
        self.lora_configs = dict(lora_configs) if lora_configs else {}
        self.num_loras = len(self.task_labels)
loras/0/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/0/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"down_proj",
|
24 |
+
"k_proj",
|
25 |
+
"v_proj",
|
26 |
+
"up_proj",
|
27 |
+
"gate_proj",
|
28 |
+
"q_proj",
|
29 |
+
"o_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/0/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5188249779649a7bd2fccd0893dbd4d5ba46bf6dbefb4d3aa9c00c48446966ac
|
3 |
+
size 66126768
|
loras/1/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/1/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"down_proj",
|
24 |
+
"k_proj",
|
25 |
+
"up_proj",
|
26 |
+
"q_proj",
|
27 |
+
"o_proj",
|
28 |
+
"v_proj",
|
29 |
+
"gate_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/1/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b75845af51d525de7d85dd7132eec30f6c4d36761e843449c526cf07daafd3b
|
3 |
+
size 66126768
|
loras/2/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/2/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"q_proj",
|
24 |
+
"down_proj",
|
25 |
+
"o_proj",
|
26 |
+
"v_proj",
|
27 |
+
"gate_proj",
|
28 |
+
"up_proj",
|
29 |
+
"k_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/2/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f3a3824581f02be957b6e82217233d3b08b85f93632bcef1c1ff2089bf18f912
|
3 |
+
size 66126768
|
loras/3/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/3/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"v_proj",
|
24 |
+
"up_proj",
|
25 |
+
"o_proj",
|
26 |
+
"k_proj",
|
27 |
+
"down_proj",
|
28 |
+
"gate_proj",
|
29 |
+
"q_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/3/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c658489b09b599292dd5a0f22a5989323e27860c6612b83ab89140b7cb2e705
|
3 |
+
size 66126768
|
loras/4/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/4/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"o_proj",
|
24 |
+
"q_proj",
|
25 |
+
"down_proj",
|
26 |
+
"up_proj",
|
27 |
+
"gate_proj",
|
28 |
+
"k_proj",
|
29 |
+
"v_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/4/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ccf0ec70d994ce232c05cc1a6ec47980b138f3dad85a910b56b384b00b55b939
|
3 |
+
size 66126768
|
loras/5/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/5/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"down_proj",
|
26 |
+
"k_proj",
|
27 |
+
"up_proj",
|
28 |
+
"o_proj",
|
29 |
+
"gate_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/5/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b692ebde6d90eb0b492fde12dd51c3fca9d73e68eb9a519b01a434124010cd6
|
3 |
+
size 66126768
|
loras/6/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/6/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"k_proj",
|
24 |
+
"gate_proj",
|
25 |
+
"down_proj",
|
26 |
+
"v_proj",
|
27 |
+
"up_proj",
|
28 |
+
"q_proj",
|
29 |
+
"o_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/6/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69981b3d52a31dc4631d35e6929ee18489413365570ad287d9bf544d05de10cd
|
3 |
+
size 66126768
|
loras/7/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/7/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"v_proj",
|
24 |
+
"gate_proj",
|
25 |
+
"up_proj",
|
26 |
+
"down_proj",
|
27 |
+
"k_proj",
|
28 |
+
"o_proj",
|
29 |
+
"q_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/7/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:daacefa493abc4a6606c29f2f3c143369f298722066e656fa6c28a1e7bdc88b2
|
3 |
+
size 66126768
|
loras/8/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Qwen/Qwen3-4B-Thinking-2507
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
loras/8/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen3-4B-Thinking-2507",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"q_proj",
|
24 |
+
"up_proj",
|
25 |
+
"v_proj",
|
26 |
+
"gate_proj",
|
27 |
+
"o_proj",
|
28 |
+
"k_proj",
|
29 |
+
"down_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
loras/8/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10256aef4d2832e826f1a341849b1dc0ea60e451c74dfdb7fd17df09e8e1b1fd
|
3 |
+
size 66126768
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f4a0ea0a2e096f17d2540516d5ad87c17965d0c8478cb91850823855ca164ea
|
3 |
+
size 4967217648
|
model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4a211ece672744d6763cc52a55d5fc491292f5865c77f6ec09dc9f72554ab82
|
3 |
+
size 3168785724
|
model.safetensors.index.json
ADDED
@@ -0,0 +1,513 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"metadata": {
|
3 |
+
"total_parameters": 4045219145,
|
4 |
+
"total_size": 8135940388
|
5 |
+
},
|
6 |
+
"weight_map": {
|
7 |
+
"base_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
8 |
+
"base_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
9 |
+
"base_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
10 |
+
"base_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
11 |
+
"base_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
12 |
+
"base_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
13 |
+
"base_model.model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
14 |
+
"base_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
15 |
+
"base_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
16 |
+
"base_model.model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
17 |
+
"base_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
18 |
+
"base_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
19 |
+
"base_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
20 |
+
"base_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
21 |
+
"base_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
22 |
+
"base_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
23 |
+
"base_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
24 |
+
"base_model.model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
25 |
+
"base_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
26 |
+
"base_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
27 |
+
"base_model.model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
28 |
+
"base_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
29 |
+
"base_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
30 |
+
"base_model.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
31 |
+
"base_model.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
32 |
+
"base_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
33 |
+
"base_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
34 |
+
"base_model.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
35 |
+
"base_model.model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
36 |
+
"base_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
37 |
+
"base_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
38 |
+
"base_model.model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
39 |
+
"base_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
40 |
+
"base_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
41 |
+
"base_model.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
42 |
+
"base_model.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
43 |
+
"base_model.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
44 |
+
"base_model.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
45 |
+
"base_model.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
46 |
+
"base_model.model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
47 |
+
"base_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
48 |
+
"base_model.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
49 |
+
"base_model.model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
50 |
+
"base_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
51 |
+
"base_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
52 |
+
"base_model.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
53 |
+
"base_model.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
54 |
+
"base_model.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
55 |
+
"base_model.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
56 |
+
"base_model.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
57 |
+
"base_model.model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
58 |
+
"base_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
59 |
+
"base_model.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
60 |
+
"base_model.model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
61 |
+
"base_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
62 |
+
"base_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
63 |
+
"base_model.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
64 |
+
"base_model.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
65 |
+
"base_model.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
66 |
+
"base_model.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
67 |
+
"base_model.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
68 |
+
"base_model.model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
69 |
+
"base_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
70 |
+
"base_model.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
71 |
+
"base_model.model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
72 |
+
"base_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
73 |
+
"base_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
74 |
+
"base_model.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
75 |
+
"base_model.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
76 |
+
"base_model.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
77 |
+
"base_model.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
78 |
+
"base_model.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
79 |
+
"base_model.model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
80 |
+
"base_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
81 |
+
"base_model.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
82 |
+
"base_model.model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
83 |
+
"base_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
84 |
+
"base_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
85 |
+
"base_model.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
86 |
+
"base_model.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
87 |
+
"base_model.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
88 |
+
"base_model.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
89 |
+
"base_model.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
90 |
+
"base_model.model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
91 |
+
"base_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
92 |
+
"base_model.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
93 |
+
"base_model.model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
94 |
+
"base_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
95 |
+
"base_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
96 |
+
"base_model.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
97 |
+
"base_model.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
98 |
+
"base_model.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
99 |
+
"base_model.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
100 |
+
"base_model.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
101 |
+
"base_model.model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
102 |
+
"base_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
103 |
+
"base_model.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
104 |
+
"base_model.model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
105 |
+
"base_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
106 |
+
"base_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
107 |
+
"base_model.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
108 |
+
"base_model.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
109 |
+
"base_model.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
110 |
+
"base_model.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
111 |
+
"base_model.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
112 |
+
"base_model.model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
113 |
+
"base_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
114 |
+
"base_model.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
115 |
+
"base_model.model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
116 |
+
"base_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
117 |
+
"base_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
118 |
+
"base_model.model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
119 |
+
"base_model.model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
120 |
+
"base_model.model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
121 |
+
"base_model.model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
122 |
+
"base_model.model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
123 |
+
"base_model.model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
124 |
+
"base_model.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
125 |
+
"base_model.model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
126 |
+
"base_model.model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
127 |
+
"base_model.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
128 |
+
"base_model.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
129 |
+
"base_model.model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
130 |
+
"base_model.model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
131 |
+
"base_model.model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
132 |
+
"base_model.model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
133 |
+
"base_model.model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
134 |
+
"base_model.model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
135 |
+
"base_model.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
136 |
+
"base_model.model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
137 |
+
"base_model.model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
138 |
+
"base_model.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
139 |
+
"base_model.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
140 |
+
"base_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
141 |
+
"base_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
142 |
+
"base_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
143 |
+
"base_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
144 |
+
"base_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
145 |
+
"base_model.model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
146 |
+
"base_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
147 |
+
"base_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
148 |
+
"base_model.model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
149 |
+
"base_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
150 |
+
"base_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
151 |
+
"base_model.model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
152 |
+
"base_model.model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
153 |
+
"base_model.model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
154 |
+
"base_model.model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
155 |
+
"base_model.model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
156 |
+
"base_model.model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
157 |
+
"base_model.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
158 |
+
"base_model.model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
159 |
+
"base_model.model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
160 |
+
"base_model.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
161 |
+
"base_model.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
162 |
+
"base_model.model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
163 |
+
"base_model.model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
164 |
+
"base_model.model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
165 |
+
"base_model.model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
166 |
+
"base_model.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
167 |
+
"base_model.model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
168 |
+
"base_model.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
169 |
+
"base_model.model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
170 |
+
"base_model.model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
171 |
+
"base_model.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
172 |
+
"base_model.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
173 |
+
"base_model.model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
174 |
+
"base_model.model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
175 |
+
"base_model.model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
176 |
+
"base_model.model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
177 |
+
"base_model.model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
178 |
+
"base_model.model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
179 |
+
"base_model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
180 |
+
"base_model.model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
181 |
+
"base_model.model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
182 |
+
"base_model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
183 |
+
"base_model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
184 |
+
"base_model.model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
185 |
+
"base_model.model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
186 |
+
"base_model.model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
187 |
+
"base_model.model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
188 |
+
"base_model.model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
189 |
+
"base_model.model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
190 |
+
"base_model.model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
191 |
+
"base_model.model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
192 |
+
"base_model.model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
193 |
+
"base_model.model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
194 |
+
"base_model.model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
195 |
+
"base_model.model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
196 |
+
"base_model.model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
197 |
+
"base_model.model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
198 |
+
"base_model.model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
199 |
+
"base_model.model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
200 |
+
"base_model.model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
201 |
+
"base_model.model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
202 |
+
"base_model.model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
203 |
+
"base_model.model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
204 |
+
"base_model.model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
205 |
+
"base_model.model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
206 |
+
"base_model.model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
207 |
+
"base_model.model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
208 |
+
"base_model.model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
209 |
+
"base_model.model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
210 |
+
"base_model.model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
211 |
+
"base_model.model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
212 |
+
"base_model.model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
213 |
+
"base_model.model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
214 |
+
"base_model.model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
215 |
+
"base_model.model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
216 |
+
"base_model.model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
217 |
+
"base_model.model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
218 |
+
"base_model.model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
219 |
+
"base_model.model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
220 |
+
"base_model.model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
221 |
+
"base_model.model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
222 |
+
"base_model.model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
223 |
+
"base_model.model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
224 |
+
"base_model.model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
225 |
+
"base_model.model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
226 |
+
"base_model.model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
227 |
+
"base_model.model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
228 |
+
"base_model.model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
229 |
+
"base_model.model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
230 |
+
"base_model.model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
231 |
+
"base_model.model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
232 |
+
"base_model.model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
233 |
+
"base_model.model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
234 |
+
"base_model.model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
235 |
+
"base_model.model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
236 |
+
"base_model.model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
237 |
+
"base_model.model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
238 |
+
"base_model.model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
239 |
+
"base_model.model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
240 |
+
"base_model.model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
241 |
+
"base_model.model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
242 |
+
"base_model.model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
243 |
+
"base_model.model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
244 |
+
"base_model.model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
245 |
+
"base_model.model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
246 |
+
"base_model.model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
247 |
+
"base_model.model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
248 |
+
"base_model.model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
249 |
+
"base_model.model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
250 |
+
"base_model.model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
251 |
+
"base_model.model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
252 |
+
"base_model.model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
253 |
+
"base_model.model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
254 |
+
"base_model.model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
255 |
+
"base_model.model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
256 |
+
"base_model.model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
257 |
+
"base_model.model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
258 |
+
"base_model.model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
259 |
+
"base_model.model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
260 |
+
"base_model.model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
261 |
+
"base_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
262 |
+
"base_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
263 |
+
"base_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
264 |
+
"base_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
265 |
+
"base_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
266 |
+
"base_model.model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
267 |
+
"base_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
268 |
+
"base_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
269 |
+
"base_model.model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
270 |
+
"base_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
271 |
+
"base_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
272 |
+
"base_model.model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
273 |
+
"base_model.model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
274 |
+
"base_model.model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
275 |
+
"base_model.model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
276 |
+
"base_model.model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
277 |
+
"base_model.model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
278 |
+
"base_model.model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
279 |
+
"base_model.model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
280 |
+
"base_model.model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
281 |
+
"base_model.model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
282 |
+
"base_model.model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
283 |
+
"base_model.model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
284 |
+
"base_model.model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
285 |
+
"base_model.model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
286 |
+
"base_model.model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
287 |
+
"base_model.model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
288 |
+
"base_model.model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
289 |
+
"base_model.model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
290 |
+
"base_model.model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
291 |
+
"base_model.model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
292 |
+
"base_model.model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
293 |
+
"base_model.model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
294 |
+
"base_model.model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
295 |
+
"base_model.model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
296 |
+
"base_model.model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
297 |
+
"base_model.model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
298 |
+
"base_model.model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
299 |
+
"base_model.model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
300 |
+
"base_model.model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
301 |
+
"base_model.model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
302 |
+
"base_model.model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
303 |
+
"base_model.model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
304 |
+
"base_model.model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
305 |
+
"base_model.model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
306 |
+
"base_model.model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
307 |
+
"base_model.model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
308 |
+
"base_model.model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
309 |
+
"base_model.model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
310 |
+
"base_model.model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
311 |
+
"base_model.model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
312 |
+
"base_model.model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
313 |
+
"base_model.model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
314 |
+
"base_model.model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
315 |
+
"base_model.model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
316 |
+
"base_model.model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
317 |
+
"base_model.model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
318 |
+
"base_model.model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
319 |
+
"base_model.model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
320 |
+
"base_model.model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
321 |
+
"base_model.model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
322 |
+
"base_model.model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
323 |
+
"base_model.model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
324 |
+
"base_model.model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
325 |
+
"base_model.model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
326 |
+
"base_model.model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
327 |
+
"base_model.model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
328 |
+
"base_model.model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
329 |
+
"base_model.model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
330 |
+
"base_model.model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
331 |
+
"base_model.model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
332 |
+
"base_model.model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
333 |
+
"base_model.model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
334 |
+
"base_model.model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
335 |
+
"base_model.model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
336 |
+
"base_model.model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
337 |
+
"base_model.model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
338 |
+
"base_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
339 |
+
"base_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
340 |
+
"base_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
341 |
+
"base_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
342 |
+
"base_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
343 |
+
"base_model.model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
344 |
+
"base_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
345 |
+
"base_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
346 |
+
"base_model.model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
347 |
+
"base_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
348 |
+
"base_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
349 |
+
"base_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
350 |
+
"base_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
351 |
+
"base_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
352 |
+
"base_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
353 |
+
"base_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
354 |
+
"base_model.model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
355 |
+
"base_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
356 |
+
"base_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
357 |
+
"base_model.model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
358 |
+
"base_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
359 |
+
"base_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
360 |
+
"base_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
361 |
+
"base_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
362 |
+
"base_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
363 |
+
"base_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
364 |
+
"base_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
365 |
+
"base_model.model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
366 |
+
"base_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
367 |
+
"base_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
368 |
+
"base_model.model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
369 |
+
"base_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
370 |
+
"base_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
371 |
+
"base_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
372 |
+
"base_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
373 |
+
"base_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
374 |
+
"base_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
375 |
+
"base_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
376 |
+
"base_model.model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
377 |
+
"base_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
378 |
+
"base_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
379 |
+
"base_model.model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
380 |
+
"base_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
381 |
+
"base_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
382 |
+
"base_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
383 |
+
"base_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
384 |
+
"base_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
385 |
+
"base_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
386 |
+
"base_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
387 |
+
"base_model.model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
388 |
+
"base_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
389 |
+
"base_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
390 |
+
"base_model.model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
391 |
+
"base_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
392 |
+
"base_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
393 |
+
"base_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
394 |
+
"base_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
395 |
+
"base_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
396 |
+
"base_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
397 |
+
"base_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
398 |
+
"base_model.model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
399 |
+
"base_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
400 |
+
"base_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
401 |
+
"base_model.model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
402 |
+
"base_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
403 |
+
"base_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
404 |
+
"base_model.model.norm.weight": "model-00002-of-00002.safetensors",
|
405 |
+
"router_decoder.0.bias": "model-00002-of-00002.safetensors",
|
406 |
+
"router_decoder.0.weight": "model-00002-of-00002.safetensors",
|
407 |
+
"router_decoder.3.bias": "model-00002-of-00002.safetensors",
|
408 |
+
"router_decoder.3.weight": "model-00002-of-00002.safetensors",
|
409 |
+
"router_encoder.embeddings.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
410 |
+
"router_encoder.embeddings.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
411 |
+
"router_encoder.embeddings.position_embeddings.weight": "model-00002-of-00002.safetensors",
|
412 |
+
"router_encoder.embeddings.token_type_embeddings.weight": "model-00002-of-00002.safetensors",
|
413 |
+
"router_encoder.embeddings.word_embeddings.weight": "model-00002-of-00002.safetensors",
|
414 |
+
"router_encoder.encoder.layer.0.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
415 |
+
"router_encoder.encoder.layer.0.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
416 |
+
"router_encoder.encoder.layer.0.attention.output.dense.bias": "model-00002-of-00002.safetensors",
|
417 |
+
"router_encoder.encoder.layer.0.attention.output.dense.weight": "model-00002-of-00002.safetensors",
|
418 |
+
"router_encoder.encoder.layer.0.attention.self.key.bias": "model-00002-of-00002.safetensors",
|
419 |
+
"router_encoder.encoder.layer.0.attention.self.key.weight": "model-00002-of-00002.safetensors",
|
420 |
+
"router_encoder.encoder.layer.0.attention.self.query.bias": "model-00002-of-00002.safetensors",
|
421 |
+
"router_encoder.encoder.layer.0.attention.self.query.weight": "model-00002-of-00002.safetensors",
|
422 |
+
"router_encoder.encoder.layer.0.attention.self.value.bias": "model-00002-of-00002.safetensors",
|
423 |
+
"router_encoder.encoder.layer.0.attention.self.value.weight": "model-00002-of-00002.safetensors",
|
424 |
+
"router_encoder.encoder.layer.0.intermediate.dense.bias": "model-00002-of-00002.safetensors",
|
425 |
+
"router_encoder.encoder.layer.0.intermediate.dense.weight": "model-00002-of-00002.safetensors",
|
426 |
+
"router_encoder.encoder.layer.0.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
427 |
+
"router_encoder.encoder.layer.0.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
428 |
+
"router_encoder.encoder.layer.0.output.dense.bias": "model-00002-of-00002.safetensors",
|
429 |
+
"router_encoder.encoder.layer.0.output.dense.weight": "model-00002-of-00002.safetensors",
|
430 |
+
"router_encoder.encoder.layer.1.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
431 |
+
"router_encoder.encoder.layer.1.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
432 |
+
"router_encoder.encoder.layer.1.attention.output.dense.bias": "model-00002-of-00002.safetensors",
|
433 |
+
"router_encoder.encoder.layer.1.attention.output.dense.weight": "model-00002-of-00002.safetensors",
|
434 |
+
"router_encoder.encoder.layer.1.attention.self.key.bias": "model-00002-of-00002.safetensors",
|
435 |
+
"router_encoder.encoder.layer.1.attention.self.key.weight": "model-00002-of-00002.safetensors",
|
436 |
+
"router_encoder.encoder.layer.1.attention.self.query.bias": "model-00002-of-00002.safetensors",
|
437 |
+
"router_encoder.encoder.layer.1.attention.self.query.weight": "model-00002-of-00002.safetensors",
|
438 |
+
"router_encoder.encoder.layer.1.attention.self.value.bias": "model-00002-of-00002.safetensors",
|
439 |
+
"router_encoder.encoder.layer.1.attention.self.value.weight": "model-00002-of-00002.safetensors",
|
440 |
+
"router_encoder.encoder.layer.1.intermediate.dense.bias": "model-00002-of-00002.safetensors",
|
441 |
+
"router_encoder.encoder.layer.1.intermediate.dense.weight": "model-00002-of-00002.safetensors",
|
442 |
+
"router_encoder.encoder.layer.1.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
443 |
+
"router_encoder.encoder.layer.1.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
444 |
+
"router_encoder.encoder.layer.1.output.dense.bias": "model-00002-of-00002.safetensors",
|
445 |
+
"router_encoder.encoder.layer.1.output.dense.weight": "model-00002-of-00002.safetensors",
|
446 |
+
"router_encoder.encoder.layer.2.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
447 |
+
"router_encoder.encoder.layer.2.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
448 |
+
"router_encoder.encoder.layer.2.attention.output.dense.bias": "model-00002-of-00002.safetensors",
|
449 |
+
"router_encoder.encoder.layer.2.attention.output.dense.weight": "model-00002-of-00002.safetensors",
|
450 |
+
"router_encoder.encoder.layer.2.attention.self.key.bias": "model-00002-of-00002.safetensors",
|
451 |
+
"router_encoder.encoder.layer.2.attention.self.key.weight": "model-00002-of-00002.safetensors",
|
452 |
+
"router_encoder.encoder.layer.2.attention.self.query.bias": "model-00002-of-00002.safetensors",
|
453 |
+
"router_encoder.encoder.layer.2.attention.self.query.weight": "model-00002-of-00002.safetensors",
|
454 |
+
"router_encoder.encoder.layer.2.attention.self.value.bias": "model-00002-of-00002.safetensors",
|
455 |
+
"router_encoder.encoder.layer.2.attention.self.value.weight": "model-00002-of-00002.safetensors",
|
456 |
+
"router_encoder.encoder.layer.2.intermediate.dense.bias": "model-00002-of-00002.safetensors",
|
457 |
+
"router_encoder.encoder.layer.2.intermediate.dense.weight": "model-00002-of-00002.safetensors",
|
458 |
+
"router_encoder.encoder.layer.2.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
459 |
+
"router_encoder.encoder.layer.2.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
460 |
+
"router_encoder.encoder.layer.2.output.dense.bias": "model-00002-of-00002.safetensors",
|
461 |
+
"router_encoder.encoder.layer.2.output.dense.weight": "model-00002-of-00002.safetensors",
|
462 |
+
"router_encoder.encoder.layer.3.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
463 |
+
"router_encoder.encoder.layer.3.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
464 |
+
"router_encoder.encoder.layer.3.attention.output.dense.bias": "model-00002-of-00002.safetensors",
|
465 |
+
"router_encoder.encoder.layer.3.attention.output.dense.weight": "model-00002-of-00002.safetensors",
|
466 |
+
"router_encoder.encoder.layer.3.attention.self.key.bias": "model-00002-of-00002.safetensors",
|
467 |
+
"router_encoder.encoder.layer.3.attention.self.key.weight": "model-00002-of-00002.safetensors",
|
468 |
+
"router_encoder.encoder.layer.3.attention.self.query.bias": "model-00002-of-00002.safetensors",
|
469 |
+
"router_encoder.encoder.layer.3.attention.self.query.weight": "model-00002-of-00002.safetensors",
|
470 |
+
"router_encoder.encoder.layer.3.attention.self.value.bias": "model-00002-of-00002.safetensors",
|
471 |
+
"router_encoder.encoder.layer.3.attention.self.value.weight": "model-00002-of-00002.safetensors",
|
472 |
+
"router_encoder.encoder.layer.3.intermediate.dense.bias": "model-00002-of-00002.safetensors",
|
473 |
+
"router_encoder.encoder.layer.3.intermediate.dense.weight": "model-00002-of-00002.safetensors",
|
474 |
+
"router_encoder.encoder.layer.3.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
475 |
+
"router_encoder.encoder.layer.3.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
476 |
+
"router_encoder.encoder.layer.3.output.dense.bias": "model-00002-of-00002.safetensors",
|
477 |
+
"router_encoder.encoder.layer.3.output.dense.weight": "model-00002-of-00002.safetensors",
|
478 |
+
"router_encoder.encoder.layer.4.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
479 |
+
"router_encoder.encoder.layer.4.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
480 |
+
"router_encoder.encoder.layer.4.attention.output.dense.bias": "model-00002-of-00002.safetensors",
|
481 |
+
"router_encoder.encoder.layer.4.attention.output.dense.weight": "model-00002-of-00002.safetensors",
|
482 |
+
"router_encoder.encoder.layer.4.attention.self.key.bias": "model-00002-of-00002.safetensors",
|
483 |
+
"router_encoder.encoder.layer.4.attention.self.key.weight": "model-00002-of-00002.safetensors",
|
484 |
+
"router_encoder.encoder.layer.4.attention.self.query.bias": "model-00002-of-00002.safetensors",
|
485 |
+
"router_encoder.encoder.layer.4.attention.self.query.weight": "model-00002-of-00002.safetensors",
|
486 |
+
"router_encoder.encoder.layer.4.attention.self.value.bias": "model-00002-of-00002.safetensors",
|
487 |
+
"router_encoder.encoder.layer.4.attention.self.value.weight": "model-00002-of-00002.safetensors",
|
488 |
+
"router_encoder.encoder.layer.4.intermediate.dense.bias": "model-00002-of-00002.safetensors",
|
489 |
+
"router_encoder.encoder.layer.4.intermediate.dense.weight": "model-00002-of-00002.safetensors",
|
490 |
+
"router_encoder.encoder.layer.4.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
491 |
+
"router_encoder.encoder.layer.4.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
492 |
+
"router_encoder.encoder.layer.4.output.dense.bias": "model-00002-of-00002.safetensors",
|
493 |
+
"router_encoder.encoder.layer.4.output.dense.weight": "model-00002-of-00002.safetensors",
|
494 |
+
"router_encoder.encoder.layer.5.attention.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
495 |
+
"router_encoder.encoder.layer.5.attention.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
496 |
+
"router_encoder.encoder.layer.5.attention.output.dense.bias": "model-00002-of-00002.safetensors",
|
497 |
+
"router_encoder.encoder.layer.5.attention.output.dense.weight": "model-00002-of-00002.safetensors",
|
498 |
+
"router_encoder.encoder.layer.5.attention.self.key.bias": "model-00002-of-00002.safetensors",
|
499 |
+
"router_encoder.encoder.layer.5.attention.self.key.weight": "model-00002-of-00002.safetensors",
|
500 |
+
"router_encoder.encoder.layer.5.attention.self.query.bias": "model-00002-of-00002.safetensors",
|
501 |
+
"router_encoder.encoder.layer.5.attention.self.query.weight": "model-00002-of-00002.safetensors",
|
502 |
+
"router_encoder.encoder.layer.5.attention.self.value.bias": "model-00002-of-00002.safetensors",
|
503 |
+
"router_encoder.encoder.layer.5.attention.self.value.weight": "model-00002-of-00002.safetensors",
|
504 |
+
"router_encoder.encoder.layer.5.intermediate.dense.bias": "model-00002-of-00002.safetensors",
|
505 |
+
"router_encoder.encoder.layer.5.intermediate.dense.weight": "model-00002-of-00002.safetensors",
|
506 |
+
"router_encoder.encoder.layer.5.output.LayerNorm.bias": "model-00002-of-00002.safetensors",
|
507 |
+
"router_encoder.encoder.layer.5.output.LayerNorm.weight": "model-00002-of-00002.safetensors",
|
508 |
+
"router_encoder.encoder.layer.5.output.dense.bias": "model-00002-of-00002.safetensors",
|
509 |
+
"router_encoder.encoder.layer.5.output.dense.weight": "model-00002-of-00002.safetensors",
|
510 |
+
"router_encoder.pooler.dense.bias": "model-00002-of-00002.safetensors",
|
511 |
+
"router_encoder.pooler.dense.weight": "model-00002-of-00002.safetensors"
|
512 |
+
}
|
513 |
+
}
|
modeling_mola_lm.py
ADDED
@@ -0,0 +1,598 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from typing import Dict, List, Optional, Union
|
7 |
+
from transformers import (
|
8 |
+
AutoConfig, AutoTokenizer, AutoModelForCausalLM,
|
9 |
+
PretrainedConfig, PreTrainedModel, GenerationMixin
|
10 |
+
)
|
11 |
+
from transformers.models.auto import CONFIG_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING
|
12 |
+
from peft import PeftModel, LoraConfig, get_peft_model
|
13 |
+
|
14 |
+
# Task labels for the bundled LoRA adapters; each label is the name of a
# subdirectory under loras/ containing one adapter.
EXPERTS_LIST = [str(index) for index in range(9)]
|
25 |
+
|
26 |
+
|
27 |
+
class MoLAConfig(PretrainedConfig):
    """Configuration for the MoLA-LM (Mixture of LoRA Adapters) model.

    Holds the base model identifier, the list of task labels (one per
    LoRA adapter), the router configuration, and per-adapter LoRA
    configs. ``num_loras`` is derived from ``task_labels``.
    """

    model_type = "mola_lm"

    def __init__(
        self,
        base_model_name_or_path: str = "Qwen/Qwen3-4B-Thinking-2507",
        # Fixed annotations: defaults are None, so these are Optional.
        task_labels: Optional[List[str]] = None,
        router_config: Optional[Dict] = None,
        lora_configs: Optional[Dict[str, Dict]] = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.base_model_name_or_path = base_model_name_or_path
        # Fall back to the default expert labels when none are provided.
        self.task_labels = task_labels or EXPERTS_LIST
        self.router_config = router_config or {}
        self.lora_configs = lora_configs or {}
        # One LoRA adapter per task label; sizes the router's output head.
        self.num_loras = len(self.task_labels)
|
46 |
+
|
47 |
+
|
48 |
+
class MoLAForCausalLM(PreTrainedModel, GenerationMixin):
|
49 |
+
"""
|
50 |
+
MoLA Language Model for Causal Language Modeling - AutoModel Compatible
|
51 |
+
"""
|
52 |
+
|
53 |
+
config_class = MoLAConfig
|
54 |
+
base_model_prefix = "mola_model" # Avoid recursion by using unique prefix
|
55 |
+
supports_gradient_checkpointing = True
|
56 |
+
|
57 |
+
def __init__(self, config: "MoLAConfig"):
    """Build the MoLA model: base LM, tokenizer, router, and LoRA adapters.

    Args:
        config: A ``MoLAConfig`` carrying the base model id and task
            labels. Its ``_name_or_path`` (set by ``from_pretrained``)
            points at the local checkpoint dir or Hub repo that holds
            the router weights and LoRA adapters.
    """
    super().__init__(config)
    self.config = config

    # Store model path for loading resources (router weights, LoRA dirs).
    # May be None when the config was built by hand rather than loaded.
    self.model_path = getattr(config, '_name_or_path', None)

    # Load base model (use base_model_prefix name). fp16 on GPU,
    # fp32 on CPU; device_map="auto" shards across available GPUs.
    print(f"Loading base model: {self.config.base_model_name_or_path}")
    self.mola_model = AutoModelForCausalLM.from_pretrained(
        self.config.base_model_name_or_path,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None
    )

    # Load tokenizer: prefer the checkpoint's own tokenizer, otherwise
    # fall back to the base model's.
    if self.model_path:
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
    else:
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.base_model_name_or_path)

    # Ensure a pad token exists so batched generation works.
    if self.tokenizer.pad_token is None:
        self.tokenizer.pad_token = self.tokenizer.eos_token

    # Initialize router (sentence encoder + small MLP decoder head).
    self._init_router()

    # Initialize current model state (will be updated by _load_lora_adapters)
    self._current_lora = None
    self._current_adapted_model = self.mola_model

    # Load LoRA configurations and adapters (this will update _current_adapted_model)
    self._load_lora_adapters()

    # Initialize device property (needed for PreTrainedModel compatibility)
    self._device = next(self.mola_model.parameters()).device

    # Load trained router weights if available; without them the router
    # keeps its random initialization.
    self._load_router_weights()

    print("MoLA-LM initialized successfully!")
|
98 |
+
|
99 |
+
def _load_router_weights(self):
    """Load trained router-decoder weights from ``router_weights.pth``.

    Looks in ``self.model_path`` first as a local directory, then as a
    Hub repo id. Encoder-prefixed entries in the checkpoint are skipped
    (the sentence encoder stays frozen at its pretrained values). On any
    failure the router silently keeps its random initialization, which
    degrades routing quality but does not break the model.
    """
    if self.model_path:
        try:
            # Handle both local and Hub paths for router weights
            if os.path.exists(self.model_path):
                # Local path
                router_weights_path = os.path.join(self.model_path, "router_weights.pth")
                if os.path.exists(router_weights_path):
                    checkpoint = torch.load(router_weights_path, map_location='cpu')
                else:
                    print("⚠️ No router weights found locally")
                    return
            else:
                # Hub path - download router weights
                try:
                    from huggingface_hub import hf_hub_download
                    router_weights_path = hf_hub_download(
                        repo_id=self.model_path,
                        filename="router_weights.pth",
                        local_files_only=False
                    )
                    checkpoint = torch.load(router_weights_path, map_location='cpu')
                    print("📥 Downloaded router weights from Hub")
                except Exception as hub_e:
                    print(f"⚠️ Failed to download router weights from Hub: {hub_e}")
                    print("🔄 Router will use random initialization (reduced performance)")
                    return

            # Load router decoder weights; drop frozen-encoder entries.
            router_state_dict = {}
            for key, value in checkpoint.items():
                if not key.startswith('encoder.'):  # Skip encoder weights
                    router_state_dict[key] = value

            if router_state_dict:
                # strict=False tolerates missing/extra keys in the checkpoint.
                self.router_decoder.load_state_dict(router_state_dict, strict=False)
                print("✅ Loaded router weights successfully!")

                # Verify weights loaded by checking if they're not all zeros
                # (a crude sanity check that load_state_dict applied something).
                first_layer = next(iter(self.router_decoder.parameters()))
                if torch.all(first_layer == 0):
                    print("⚠️ Warning: Router weights appear to be zero-initialized")
                else:
                    print("🎯 Router weights verified - non-zero values detected")
            else:
                print("⚠️ No valid router weights found in checkpoint")

        except Exception as e:
            # Best-effort: never fail model construction over router weights.
            print(f"❌ Failed to load router weights: {e}")
            print("🔄 Router will use random initialization (reduced performance)")
|
150 |
+
|
151 |
+
def _init_router(self):
    """Initialize the router: a frozen MiniLM sentence encoder plus a
    trainable 2-layer MLP head that scores ``num_loras`` adapters.

    Raises:
        ImportError: if the required transformers components are missing.
    """
    try:
        from transformers import AutoModel

        print("Initializing router components...")
        # Router components: pretrained sentence-transformer encoder.
        self.router_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.router_encoder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

        # Freeze encoder — only the decoder head below is trained.
        for param in self.router_encoder.parameters():
            param.requires_grad = False

        # Router decoder: encoder_dim -> 96 -> num_loras classification head.
        encoder_dim = self.router_encoder.config.hidden_size
        self.router_decoder = nn.Sequential(
            nn.Linear(encoder_dim, 96),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(96, self.config.num_loras)
        )

        # Move router to GPU when available (kept on CPU otherwise).
        if torch.cuda.is_available():
            self.router_encoder = self.router_encoder.cuda()
            self.router_decoder = self.router_decoder.cuda()

        print("Router initialized successfully!")

    except ImportError as e:
        raise ImportError(f"Required dependencies not found: {e}")
|
183 |
+
|
184 |
+
def _load_lora_adapters(self):
    """Load LoRA adapters using PEFT (single wrapper, multiple adapters).

    The first task label creates the ``PeftModel`` wrapper around the
    base model; every remaining adapter is loaded into that same wrapper
    under its own name, so switching adapters is a cheap
    ``set_adapter`` call. Adapters are resolved from ``self.model_path``
    either as a local directory (``loras/<name>/``) or by downloading
    from the Hub. On total failure the model falls back to the bare
    base model with no adapter active.
    """
    from huggingface_hub import hf_hub_download

    if not self.model_path:
        print("No model path specified, skipping LoRA loading")
        return

    print("Loading LoRA adapters (single wrapper)...")

    # Get the first adapter to create the initial PEFT wrapper
    first_adapter = str(self.config.task_labels[0])
    first_lora_path = None

    try:
        # Handle both local and Hub paths for first adapter
        if os.path.exists(self.model_path):
            # Local path
            first_lora_path = os.path.join(self.model_path, "loras", first_adapter)
            if not os.path.exists(first_lora_path):
                raise FileNotFoundError(f"First adapter directory not found: {first_lora_path}")
        else:
            # Hub path - download first adapter
            try:
                # Download first adapter to get local path; hf_hub_download
                # also fetches sibling files into the same cache directory.
                adapter_file = hf_hub_download(
                    repo_id=self.model_path,
                    filename=f"loras/{first_adapter}/adapter_model.safetensors"
                )
                first_lora_path = os.path.dirname(adapter_file)
                print(f"Downloaded first adapter to: {first_lora_path}")
            except Exception as e:
                raise Exception(f"Failed to download first adapter {first_adapter}: {e}")

        # Create the initial PEFT wrapper with unique adapter name
        peft_model = PeftModel.from_pretrained(
            self.mola_model,
            first_lora_path,
            adapter_name=first_adapter,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )
        print(f"✅ Loaded first LoRA: {first_adapter}")

        # Load remaining adapters into the same wrapper with unique names.
        # Individual failures are logged and skipped, not fatal.
        for task_name in self.config.task_labels[1:]:
            try:
                lora_path = None

                if os.path.exists(self.model_path):
                    # Local path
                    lora_path = os.path.join(self.model_path, "loras", task_name)
                    if not os.path.exists(lora_path):
                        print(f"⚠️ LoRA directory not found: {lora_path}")
                        continue
                else:
                    # Hub path - download adapter
                    try:
                        adapter_file = hf_hub_download(
                            repo_id=self.model_path,
                            filename=f"loras/{task_name}/adapter_model.safetensors"
                        )
                        lora_path = os.path.dirname(adapter_file)
                    except Exception as e:
                        print(f"❌ Failed to download LoRA {task_name}: {e}")
                        continue

                # Load adapter into the same PEFT model with unique name
                peft_model.load_adapter(lora_path, adapter_name=task_name)
                print(f"✅ Loaded LoRA: {task_name}")

            except Exception as e:
                print(f"❌ Failed to load LoRA {task_name}: {e}")

        # Store single PEFT model for all adapters: every task label maps
        # to the one shared wrapper (adapters are selected by name later).
        self.lora_models = {str(name): peft_model for name in self.config.task_labels}
        self._current_lora = first_adapter
        self._current_adapted_model = peft_model

        print(f"Loaded {len(self.config.task_labels)} LoRA adapters into one PEFT model.")
        print(f"Available adapter names: {list(peft_model.peft_config.keys())}")

    except Exception as e:
        # Fallback: run the bare base model with no adapters at all.
        print(f"❌ Failed to initialize LoRA loading: {e}")
        self.lora_models = {}
        self._current_adapted_model = self.mola_model
        self._current_lora = None
|
270 |
+
|
271 |
+
def predict_best_lora(self, text: str) -> str:
    """Route *text* to its most suitable LoRA adapter.

    Embeds the prompt with the frozen sentence encoder, scores the
    pooled embedding with the router decoder head, and returns the task
    label with the highest logit.
    """
    # Inference mode: disables dropout in the decoder head.
    self.router_encoder.eval()
    self.router_decoder.eval()

    # Tokenize the prompt for the router's sentence encoder.
    encoded = self.router_tokenizer(
        [text],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Keep router inputs on the same device as the decoder head.
    target_device = next(self.router_decoder.parameters()).device
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    with torch.no_grad():
        encoder_out = self.router_encoder(**encoded)
        # Mean-pool token embeddings into one sentence vector per input.
        sentence_vec = encoder_out.last_hidden_state.mean(dim=1)
        scores = self.router_decoder(sentence_vec)

    # Argmax over adapter logits picks the winning task label.
    winner = torch.argmax(scores, dim=-1).item()
    return self.config.task_labels[winner]
|
305 |
+
|
306 |
+
def _apply_lora(self, lora_name: str):
|
307 |
+
"""Apply the selected LoRA adapter using set_adapter."""
|
308 |
+
if hasattr(self, '_current_adapted_model') and isinstance(self._current_adapted_model, PeftModel):
|
309 |
+
# Check if the adapter exists in the PEFT model
|
310 |
+
if str(lora_name) in self._current_adapted_model.peft_config:
|
311 |
+
if lora_name != self._current_lora:
|
312 |
+
self._current_adapted_model.set_adapter(str(lora_name))
|
313 |
+
self._current_lora = str(lora_name)
|
314 |
+
# print(f"🎯 Applied LoRA: {lora_name}") # Uncomment for debugging
|
315 |
+
else:
|
316 |
+
print(f"⚠️ LoRA adapter '{lora_name}' not found in PEFT model. Available: {list(self._current_adapted_model.peft_config.keys())}")
|
317 |
+
# Keep current adapter if requested one doesn't exist
|
318 |
+
else:
|
319 |
+
# Fallback to base model if no PEFT model available
|
320 |
+
self._current_adapted_model = self.mola_model
|
321 |
+
self._current_lora = None
|
322 |
+
print(f"⚠️ No PEFT model available, using base model")
|
323 |
+
|
324 |
+
def get_available_loras(self) -> List[str]:
    """Return the adapter names registered on the shared PEFT model.

    Empty list when no PEFT model has been loaded yet.
    """
    if not hasattr(self, '_current_adapted_model'):
        return []
    if not isinstance(self._current_adapted_model, PeftModel):
        return []
    return list(self._current_adapted_model.peft_config.keys())
def test_adapter_uniqueness(self, layer_name: str = "base_model.model.model.layers.33.mlp.down_proj"):
    """
    Regression test to verify that adapters have different weights.

    Switches through every registered adapter, fuses its LoRA delta for one
    layer (ΔW = B @ A * scaling), hashes the result, and reports whether all
    hashes are distinct.

    Args:
        layer_name: The layer to test (default is a common MLP layer)

    Returns:
        Dict[str, str]: Mapping of adapter names to their weight hashes
    """
    import hashlib

    # Precondition: a PEFT model with at least two adapters must be loaded.
    if not hasattr(self, '_current_adapted_model') or not isinstance(self._current_adapted_model, PeftModel):
        print("⚠️ No PEFT model available for testing")
        return {}

    names = self.get_available_loras()
    if len(names) <= 1:
        print(f"⚠️ Need at least 2 adapters for uniqueness test, found {len(names)}")
        return {}

    def fused_sha(adapter_name, layer_name):
        """Compute SHA256 hash of fused LoRA weights for given adapter and layer."""
        # Switch to the adapter
        self._apply_lora(adapter_name)

        # Navigate to the specified layer
        try:
            mod = self._current_adapted_model
            for part in layer_name.split("."):
                if part:
                    mod = getattr(mod, part)

            # Get LoRA components
            if not hasattr(mod, 'lora_A') or not hasattr(mod, 'lora_B'):
                print(f"⚠️ Layer {layer_name} doesn't have LoRA components")
                return "no_lora"

            # Get the adapter name (should be the same as what we set)
            adapter_key = next(iter(mod.lora_A.keys()))
            A = mod.lora_A[adapter_key].weight
            B = mod.lora_B[adapter_key].weight
            s = float(mod.scaling[adapter_key])

            # Compute fused weights: ΔW = (B @ A) * scaling
            dW = (B @ A) * s

            # Convert to bytes and hash (truncated to 16 hex chars — plenty
            # to distinguish adapters, keeps output readable)
            tensor_bytes = dW.detach().to("cpu", dtype=torch.float32).contiguous().numpy().tobytes()
            return hashlib.sha256(tensor_bytes).hexdigest()[:16]

        except Exception as e:
            print(f"❌ Error computing hash for {adapter_name}: {e}")
            return f"error_{adapter_name}"

    print(f"🧪 Testing adapter uniqueness on layer: {layer_name}")
    hashes = {}
    for adapter_name in names:
        hash_val = fused_sha(adapter_name, layer_name)
        hashes[adapter_name] = hash_val
        print(f" {adapter_name}: {hash_val}")

    # Check uniqueness
    unique_hashes = set(hashes.values())
    if len(unique_hashes) == len(names):
        print("✅ All adapters have unique weights!")
    else:
        print(f"❌ Found duplicate weights! {len(names)} adapters but only {len(unique_hashes)} unique hashes")
        # Show which ones are identical
        from collections import defaultdict
        hash_to_adapters = defaultdict(list)
        for adapter, hash_val in hashes.items():
            hash_to_adapters[hash_val].append(adapter)

        for hash_val, adapter_list in hash_to_adapters.items():
            if len(adapter_list) > 1:
                print(f" Identical weights (hash {hash_val}): {adapter_list}")

    return hashes
def generate(self, input_ids=None, attention_mask=None, **kwargs):
    """
    Standard generate method with automatic LoRA selection.
    Works exactly like any other LLM's generate method.

    The prompt is decoded, stripped of chat-template markup, fed to the
    router to pick the best LoRA adapter, and then generation is delegated
    to the currently adapted model. Any routing failure falls back to the
    plain base model.
    """
    # If we have input_ids, predict and apply the best LoRA
    if input_ids is not None and hasattr(self, 'tokenizer'):
        try:
            # Decode the input to get the text for LoRA prediction
            if len(input_ids.shape) > 1:
                # Batch input - use first item
                text_input = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
            else:
                text_input = self.tokenizer.decode(input_ids, skip_special_tokens=True)

            import re

            # Build marker strings piecewise to avoid escape issues in embedded code
            start_pattern = '<' + '|im_start|' + '>user'
            end_pattern = '<' + '|im_end|' + '>'

            # First, try to extract just the user's actual question/prompt
            if start_pattern in text_input and end_pattern in text_input:
                start_idx = text_input.find(start_pattern) + len(start_pattern)
                end_idx = text_input.find(end_pattern, start_idx)
                if end_idx > start_idx:
                    text_input = text_input[start_idx:end_idx].strip()

            # FIX: the previous version did text_input.replace('user', '') /
            # 'system' / 'assistant', which deleted those substrings from the
            # user's own words (e.g. 'username' -> 'name') and corrupted the
            # router input. Strip role names only as part of the
            # '<|im_start|>role' marker, then drop any leftover bare markers.
            text_input = re.sub(r'<\|im_start\|>(system|user|assistant)\n?', '', text_input)
            text_input = text_input.replace('<|im_start|>', '')
            text_input = text_input.replace('<|im_end|>', '')

            # Remove default system-message lines so they don't bias routing
            if 'You are Qwen' in text_input:
                lines = text_input.split('\n')
                lines = [line for line in lines if 'You are' not in line and 'Alibaba' not in line]
                text_input = ' '.join(lines)

            # Final cleanup
            text_input = re.sub(r'\n+', ' ', text_input)  # Replace newlines with spaces
            text_input = re.sub(r'\s+', ' ', text_input)  # Normalize whitespace
            text_input = text_input.strip()

            # Predict and apply best LoRA
            best_lora = self.predict_best_lora(text_input)
            self._apply_lora(best_lora)

        except Exception:
            # If LoRA prediction fails, use base model
            self._current_adapted_model = self.mola_model
            self._current_lora = None

    # Use the currently adapted model for generation
    return self._current_adapted_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        **kwargs
    )
def forward(self, input_ids, attention_mask=None, **kwargs):
    """Delegate the forward pass to the currently active adapted model."""
    active_model = self._current_adapted_model
    return active_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
def __call__(self, *args, **kwargs):
|
485 |
+
"""Make the model callable."""
|
486 |
+
return self._current_adapted_model(*args, **kwargs)
|
487 |
+
|
488 |
+
def get_input_embeddings(self):
    """Return the input-embedding module of the active adapted model."""
    active_model = self._current_adapted_model
    return active_model.get_input_embeddings()
def set_input_embeddings(self, value):
    """Set input embeddings on both the adapted and base models to keep them in sync."""
    # Adapted model first, then base model — same order the original used.
    for target in (self._current_adapted_model, self.mola_model):
        target.set_input_embeddings(value)
def get_output_embeddings(self):
    """Return the output-embedding (LM head) module of the active adapted model."""
    active_model = self._current_adapted_model
    return active_model.get_output_embeddings()
def set_output_embeddings(self, value):
    """Set output embeddings on both the adapted and base models to keep them in sync."""
    # Adapted model first, then base model — same order the original used.
    for target in (self._current_adapted_model, self.mola_model):
        target.set_output_embeddings(value)
def tie_weights(self):
    """Tie input/output embeddings on the active adapted model."""
    active_model = self._current_adapted_model
    active_model.tie_weights()
def resize_token_embeddings(self, new_num_tokens):
    """Resize the token-embedding matrix of the active adapted model."""
    active_model = self._current_adapted_model
    return active_model.resize_token_embeddings(new_num_tokens)
@property
def device(self):
    """Device of the base model, taken from its first parameter."""
    first_param = next(self.mola_model.parameters())
    return first_param.device
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    """Load model from pretrained path (transformers compatibility)."""
    # Load the config, then remember where it came from so the constructor
    # can locate resources (LoRA folders, router weights) under that path.
    cfg = MoLAConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
    cfg._name_or_path = pretrained_model_name_or_path
    return cls(cfg)
def save_pretrained(self, save_directory, **kwargs):
    """Save config, tokenizer, base weights, and router head to *save_directory*.

    Accepts the standard transformers keyword arguments; only max_shard_size
    and safe_serialization are consumed. Failures to persist the base model
    or router weights are reported as warnings rather than raised.
    """
    shard_size = kwargs.get('max_shard_size', "5GB")
    use_safetensors = kwargs.get('safe_serialization', True)

    os.makedirs(save_directory, exist_ok=True)

    # Config always goes out via the standard transformers path.
    self.config.save_pretrained(save_directory)

    # Tokenizer is optional — only saved when one was attached.
    if hasattr(self, 'tokenizer'):
        self.tokenizer.save_pretrained(save_directory)

    # Base model weights, sharded as requested; best-effort.
    try:
        self.mola_model.save_pretrained(
            save_directory,
            max_shard_size=shard_size,
            safe_serialization=use_safetensors
        )
    except Exception as e:
        print(f"Warning: Could not save base model weights: {e}")

    # Router head weights, if present; best-effort.
    try:
        if hasattr(self, 'router_decoder'):
            torch.save(self.router_decoder.state_dict(), os.path.join(save_directory, "router_weights.pth"))
    except Exception as e:
        print(f"Warning: Could not save router weights: {e}")

    print(f"Model saved to {save_directory}")
def get_current_lora(self) -> str:
    """Name of the active LoRA adapter, or 'base_model' when none is applied."""
    current = self._current_lora
    return current if current else "base_model"
def get_available_loras(self) -> List[str]:
    """Get list of available LoRA adapters.

    NOTE(review): this re-definition shadows the earlier get_available_loras,
    which consulted the live PEFT model. To stay consistent with that
    version, prefer the adapters actually registered on the PEFT model and
    fall back to the bookkeeping dict only when no PEFT model is loaded.
    """
    if hasattr(self, '_current_adapted_model') and isinstance(self._current_adapted_model, PeftModel):
        return list(self._current_adapted_model.peft_config.keys())
    return list(self.lora_models.keys())
# For transformers AutoModel registration
def _load_mola_model(model_path, **kwargs):
    """Helper function to load MoLA model.

    Thin module-level wrapper around MoLAForCausalLM.from_pretrained so a
    plain callable can be handed to loader hooks.
    """
    return MoLAForCausalLM.from_pretrained(model_path, **kwargs)
# Register with transformers AutoModel system so AutoModelForCausalLM can
# resolve the custom "mola_lm" model_type. Runs at import time; failures are
# non-fatal because the class can still be used directly.
try:
    CONFIG_MAPPING.register("mola_lm", MoLAConfig)
    MODEL_FOR_CAUSAL_LM_MAPPING.register(MoLAConfig, MoLAForCausalLM)
    print("✅ Successfully registered MoLA-LM with AutoModel!")
except Exception as e:
    print(f"⚠️ AutoModel registration failed: {e}")
    # Try alternative registration for backwards compatibility
    # (older transformers versions expose register() on the Auto classes)
    try:
        from transformers import AutoConfig, AutoModelForCausalLM
        AutoConfig.register("mola_lm", MoLAConfig)
        AutoModelForCausalLM.register(MoLAConfig, MoLAForCausalLM)
        print("✅ Successfully registered MoLA-LM with legacy method!")
    except Exception as e2:
        print(f"⚠️ Legacy registration also failed: {e2}")
        print("Model can still be loaded directly with MoLAForCausalLM.from_pretrained()")
router_weights.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd4844ec0ad964c75599d93e5a20089f7ea41bc5e31a551d99b49665ba6ab7a8
|
3 |
+
size 153763
|
special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>",
|
5 |
+
"<|object_ref_start|>",
|
6 |
+
"<|object_ref_end|>",
|
7 |
+
"<|box_start|>",
|
8 |
+
"<|box_end|>",
|
9 |
+
"<|quad_start|>",
|
10 |
+
"<|quad_end|>",
|
11 |
+
"<|vision_start|>",
|
12 |
+
"<|vision_end|>",
|
13 |
+
"<|vision_pad|>",
|
14 |
+
"<|image_pad|>",
|
15 |
+
"<|video_pad|>"
|
16 |
+
],
|
17 |
+
"eos_token": {
|
18 |
+
"content": "<|im_end|>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
},
|
24 |
+
"pad_token": {
|
25 |
+
"content": "<|endoftext|>",
|
26 |
+
"lstrip": false,
|
27 |
+
"normalized": false,
|
28 |
+
"rstrip": false,
|
29 |
+
"single_word": false
|
30 |
+
}
|
31 |
+
}
|
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
|
3 |
+
size 11422654
|
tokenizer_config.json
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_prefix_space": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"151643": {
|
6 |
+
"content": "<|endoftext|>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"151644": {
|
14 |
+
"content": "<|im_start|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"151645": {
|
22 |
+
"content": "<|im_end|>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
},
|
29 |
+
"151646": {
|
30 |
+
"content": "<|object_ref_start|>",
|
31 |
+
"lstrip": false,
|
32 |
+
"normalized": false,
|
33 |
+
"rstrip": false,
|
34 |
+
"single_word": false,
|
35 |
+
"special": true
|
36 |
+
},
|
37 |
+
"151647": {
|
38 |
+
"content": "<|object_ref_end|>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false,
|
43 |
+
"special": true
|
44 |
+
},
|
45 |
+
"151648": {
|
46 |
+
"content": "<|box_start|>",
|
47 |
+
"lstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"rstrip": false,
|
50 |
+
"single_word": false,
|
51 |
+
"special": true
|
52 |
+
},
|
53 |
+
"151649": {
|
54 |
+
"content": "<|box_end|>",
|
55 |
+
"lstrip": false,
|
56 |
+
"normalized": false,
|
57 |
+
"rstrip": false,
|
58 |
+
"single_word": false,
|
59 |
+
"special": true
|
60 |
+
},
|
61 |
+
"151650": {
|
62 |
+
"content": "<|quad_start|>",
|
63 |
+
"lstrip": false,
|
64 |
+
"normalized": false,
|
65 |
+
"rstrip": false,
|
66 |
+
"single_word": false,
|
67 |
+
"special": true
|
68 |
+
},
|
69 |
+
"151651": {
|
70 |
+
"content": "<|quad_end|>",
|
71 |
+
"lstrip": false,
|
72 |
+
"normalized": false,
|
73 |
+
"rstrip": false,
|
74 |
+
"single_word": false,
|
75 |
+
"special": true
|
76 |
+
},
|
77 |
+
"151652": {
|
78 |
+
"content": "<|vision_start|>",
|
79 |
+
"lstrip": false,
|
80 |
+
"normalized": false,
|
81 |
+
"rstrip": false,
|
82 |
+
"single_word": false,
|
83 |
+
"special": true
|
84 |
+
},
|
85 |
+
"151653": {
|
86 |
+
"content": "<|vision_end|>",
|
87 |
+
"lstrip": false,
|
88 |
+
"normalized": false,
|
89 |
+
"rstrip": false,
|
90 |
+
"single_word": false,
|
91 |
+
"special": true
|
92 |
+
},
|
93 |
+
"151654": {
|
94 |
+
"content": "<|vision_pad|>",
|
95 |
+
"lstrip": false,
|
96 |
+
"normalized": false,
|
97 |
+
"rstrip": false,
|
98 |
+
"single_word": false,
|
99 |
+
"special": true
|
100 |
+
},
|
101 |
+
"151655": {
|
102 |
+
"content": "<|image_pad|>",
|
103 |
+
"lstrip": false,
|
104 |
+
"normalized": false,
|
105 |
+
"rstrip": false,
|
106 |
+
"single_word": false,
|
107 |
+
"special": true
|
108 |
+
},
|
109 |
+
"151656": {
|
110 |
+
"content": "<|video_pad|>",
|
111 |
+
"lstrip": false,
|
112 |
+
"normalized": false,
|
113 |
+
"rstrip": false,
|
114 |
+
"single_word": false,
|
115 |
+
"special": true
|
116 |
+
},
|
117 |
+
"151657": {
|
118 |
+
"content": "<tool_call>",
|
119 |
+
"lstrip": false,
|
120 |
+
"normalized": false,
|
121 |
+
"rstrip": false,
|
122 |
+
"single_word": false,
|
123 |
+
"special": false
|
124 |
+
},
|
125 |
+
"151658": {
|
126 |
+
"content": "</tool_call>",
|
127 |
+
"lstrip": false,
|
128 |
+
"normalized": false,
|
129 |
+
"rstrip": false,
|
130 |
+
"single_word": false,
|
131 |
+
"special": false
|
132 |
+
},
|
133 |
+
"151659": {
|
134 |
+
"content": "<|fim_prefix|>",
|
135 |
+
"lstrip": false,
|
136 |
+
"normalized": false,
|
137 |
+
"rstrip": false,
|
138 |
+
"single_word": false,
|
139 |
+
"special": false
|
140 |
+
},
|
141 |
+
"151660": {
|
142 |
+
"content": "<|fim_middle|>",
|
143 |
+
"lstrip": false,
|
144 |
+
"normalized": false,
|
145 |
+
"rstrip": false,
|
146 |
+
"single_word": false,
|
147 |
+
"special": false
|
148 |
+
},
|
149 |
+
"151661": {
|
150 |
+
"content": "<|fim_suffix|>",
|
151 |
+
"lstrip": false,
|
152 |
+
"normalized": false,
|
153 |
+
"rstrip": false,
|
154 |
+
"single_word": false,
|
155 |
+
"special": false
|
156 |
+
},
|
157 |
+
"151662": {
|
158 |
+
"content": "<|fim_pad|>",
|
159 |
+
"lstrip": false,
|
160 |
+
"normalized": false,
|
161 |
+
"rstrip": false,
|
162 |
+
"single_word": false,
|
163 |
+
"special": false
|
164 |
+
},
|
165 |
+
"151663": {
|
166 |
+
"content": "<|repo_name|>",
|
167 |
+
"lstrip": false,
|
168 |
+
"normalized": false,
|
169 |
+
"rstrip": false,
|
170 |
+
"single_word": false,
|
171 |
+
"special": false
|
172 |
+
},
|
173 |
+
"151664": {
|
174 |
+
"content": "<|file_sep|>",
|
175 |
+
"lstrip": false,
|
176 |
+
"normalized": false,
|
177 |
+
"rstrip": false,
|
178 |
+
"single_word": false,
|
179 |
+
"special": false
|
180 |
+
},
|
181 |
+
"151665": {
|
182 |
+
"content": "<tool_response>",
|
183 |
+
"lstrip": false,
|
184 |
+
"normalized": false,
|
185 |
+
"rstrip": false,
|
186 |
+
"single_word": false,
|
187 |
+
"special": false
|
188 |
+
},
|
189 |
+
"151666": {
|
190 |
+
"content": "</tool_response>",
|
191 |
+
"lstrip": false,
|
192 |
+
"normalized": false,
|
193 |
+
"rstrip": false,
|
194 |
+
"single_word": false,
|
195 |
+
"special": false
|
196 |
+
},
|
197 |
+
"151667": {
|
198 |
+
"content": "<think>",
|
199 |
+
"lstrip": false,
|
200 |
+
"normalized": false,
|
201 |
+
"rstrip": false,
|
202 |
+
"single_word": false,
|
203 |
+
"special": false
|
204 |
+
},
|
205 |
+
"151668": {
|
206 |
+
"content": "</think>",
|
207 |
+
"lstrip": false,
|
208 |
+
"normalized": false,
|
209 |
+
"rstrip": false,
|
210 |
+
"single_word": false,
|
211 |
+
"special": false
|
212 |
+
}
|
213 |
+
},
|
214 |
+
"additional_special_tokens": [
|
215 |
+
"<|im_start|>",
|
216 |
+
"<|im_end|>",
|
217 |
+
"<|object_ref_start|>",
|
218 |
+
"<|object_ref_end|>",
|
219 |
+
"<|box_start|>",
|
220 |
+
"<|box_end|>",
|
221 |
+
"<|quad_start|>",
|
222 |
+
"<|quad_end|>",
|
223 |
+
"<|vision_start|>",
|
224 |
+
"<|vision_end|>",
|
225 |
+
"<|vision_pad|>",
|
226 |
+
"<|image_pad|>",
|
227 |
+
"<|video_pad|>"
|
228 |
+
],
|
229 |
+
"bos_token": null,
|
230 |
+
"clean_up_tokenization_spaces": false,
|
231 |
+
"eos_token": "<|im_end|>",
|
232 |
+
"errors": "replace",
|
233 |
+
"extra_special_tokens": {},
|
234 |
+
"model_max_length": 262144,
|
235 |
+
"pad_token": "<|endoftext|>",
|
236 |
+
"split_special_tokens": false,
|
237 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
238 |
+
"unk_token": null
|
239 |
+
}
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|