|
--- |
|
datasets: |
|
- NeelNanda/pile-10k |
|
base_model: |
|
- deepseek-ai/DeepSeek-R1 |
|
|
|
|
|
|
|
|
|
|
|
--- |
|
|
|
## Model Details |
|
|
|
This model is an int2 model with group_size 64 and symmetric quantization of [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), generated by the [intel/auto-round](https://github.com/intel/auto-round) algorithm. For better accuracy, we recommend the mixed version [OPEA/DeepSeek-R1-int2-mixed-sym-inc](https://huggingface.co/OPEA/DeepSeek-R1-int2-mixed-sym-inc).
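
To double-check the quantization settings shipped with the checkpoint, the configuration can be inspected without loading the weights. A minimal sketch (the settings are read from the `quantization_config` entry of the repo's config.json):

~~~python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("OPEA/DeepSeek-R1-int2-gptq-sym-inc", trust_remote_code=True)
print(config.quantization_config)  ## expect bits=2, group_size=64, sym=True
~~~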
|
|
|
Please follow the license of the original model. |
|
|
|
## How To Use |
|
|
|
### INT2 Inference on CUDA (4x80GB)
|
|
|
Please note that INT2 **may be slower** than INT4 on CUDA due to a kernel issue.
|
|
|
~~~python |
|
import transformers |
|
import torch |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
quantized_model_dir = "OPEA/DeepSeek-R1-int2-gptq-sym-inc" |
|
|
|
## directly use device_map='auto' if you have enough GPUs |
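## the 61 decoder layers below are split roughly evenly across an assumed 4x80GB setup (~15 layers per GPU); adjust the boundaries for other topologies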
|
device_map = {"model.norm": 0, "lm_head": 0, "model.embed_tokens": 0} |
|
for i in range(61): |
|
name = "model.layers." + str(i) |
|
if i < 15: |
|
device_map[name] = 0 |
|
elif i < 30: |
|
device_map[name] = 1 |
|
elif i < 45: |
|
device_map[name] = 2 |
|
else: |
|
device_map[name] = 3 |
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
quantized_model_dir, |
|
torch_dtype=torch.bfloat16, |
|
device_map=device_map, |
|
) |
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, trust_remote_code=True) |
|
prompts = [ |
|
"9.11和9.8哪个数字大", |
|
"如果你是人,你最想做什么“", |
|
"How many e in word deepseek", |
|
"There are ten birds in a tree. A hunter shoots one. How many are left in the tree?", |
|
] |
|
|
|
texts = [] |
|
for prompt in prompts: |
|
messages = [ |
|
{"role": "user", "content": prompt} |
|
] |
|
text = tokenizer.apply_chat_template( |
|
messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
texts.append(text) |
|
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True) |
|
|
|
outputs = model.generate( |
|
input_ids=inputs["input_ids"].to(model.device), |
|
attention_mask=inputs["attention_mask"].to(model.device), |
|
max_length=512, ##change this to align with the official usage |
|
num_return_sequences=1, |
|
do_sample=False ##change this to align with the official usage |
|
) |
|
generated_ids = [ |
|
output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs) |
|
] |
|
|
|
decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) |
|
|
|
for i, prompt in enumerate(prompts):
    print(f"Prompt: {prompt}")
    print(f"Generated: {decoded_outputs[i]}")
    print("-" * 50)
|
|
|
~~~ |
|
|
|
|
|
|
|
### INT2 Inference on CPU |
|
|
|
Requirements |
|
|
|
~~~bash |
|
pip install auto-round |
|
pip uninstall intel-extension-for-pytorch |
|
pip install intel-extension-for-transformers |
|
~~~ |
|
|
|
**Inference will be quite slow if the CPU does not support AVX512.**
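
To check whether the CPU exposes AVX512, a minimal sketch (Linux only; assumes a readable /proc/cpuinfo):

~~~python
import re

## print the AVX512 feature flags reported by the CPU, if any
with open("/proc/cpuinfo") as f:
    flags = sorted(set(re.findall(r"avx512\w*", f.read())))
print(flags if flags else "no AVX512 flags found")
~~~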
|
|
|
~~~python |
|
import transformers |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
from auto_round import AutoRoundConfig ##must import for auto-round format |
|
|
|
# https://github.com/huggingface/transformers/pull/35493 |
|
def set_initialized_submodules(model, state_dict_keys): |
|
""" |
|
Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state |
|
dict. |
|
""" |
|
state_dict_keys = set(state_dict_keys) |
|
not_initialized_submodules = {} |
|
for module_name, module in model.named_modules(): |
|
if module_name == "": |
|
# When checking if the root module is loaded there's no need to prepend module_name. |
|
module_keys = set(module.state_dict()) |
|
else: |
|
module_keys = {f"{module_name}.{k}" for k in module.state_dict()} |
|
if module_keys.issubset(state_dict_keys): |
|
module._is_hf_initialized = True |
|
else: |
|
not_initialized_submodules[module_name] = module |
|
return not_initialized_submodules |
|
|
|
|
|
transformers.modeling_utils.set_initialized_submodules = set_initialized_submodules |
|
|
|
import torch |
|
|
|
quantized_model_dir = "OPEA/DeepSeek-R1-int2-mixed-sym-inc" |
|
|
|
|
|
quantization_config = AutoRoundConfig( |
|
backend="cpu", |
|
) |
|
model = AutoModelForCausalLM.from_pretrained( |
|
quantized_model_dir, |
|
torch_dtype=torch.bfloat16, |
|
trust_remote_code=True, |
|
device_map="cpu", |
|
quantization_config=quantization_config, |
|
revision="080ef2d" |
|
) |
|
|
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, trust_remote_code=True) |
|
prompts = [ |
|
"9.11和9.8哪个数字大", |
|
"如果你是人,你最想做什么“", |
|
"How many e in word deepseek", |
|
"There are ten birds in a tree. A hunter shoots one. How many are left in the tree?", |
|
] |
|
|
|
texts = [] |
|
for prompt in prompts: |
|
messages = [ |
|
{"role": "user", "content": prompt} |
|
] |
|
text = tokenizer.apply_chat_template( |
|
messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
texts.append(text) |
|
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True) |
|
|
|
outputs = model.generate( |
|
input_ids=inputs["input_ids"].to(model.device), |
|
attention_mask=inputs["attention_mask"].to(model.device), |
|
max_length=512, ##change this to align with the official usage |
|
num_return_sequences=1, |
|
do_sample=False ##change this to align with the official usage |
|
) |
|
generated_ids = [ |
|
output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs) |
|
] |
|
|
|
decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) |
|
|
|
for i, prompt in enumerate(prompts):
    print(f"Prompt: {prompt}")
    print(f"Generated: {decoded_outputs[i]}")
    print("-" * 50)
|
|
|
""" |
|
Prompt: 9.11和9.8哪个数字大 |
|
Generated: <think> |
|
首先,比较9.11和9.8的整数部分,两者都是9,所以需要比较小数部分。9.11的小数部分是0.11,而9.8的小数部分是0.8。0.8大于0.11,因此9.8比9.11大。 |
|
</think> |
|
要比较9.11和9.8哪个数字更大,我们可以按照以下步骤进行: |
|
|
|
1. **比较整数部分**:两个数的整数部分都是9,所以它们相等。 |
|
2. **比较小数部分**: |
|
- 9.11的小数部分是0.11 |
|
- 9.8的小数部分是0.8 |
|
3. **比较小数部分的大小**:0.8大于0.11。 |
|
4. **结论**:由于9.8的小数部分更大,因此9.8比9.11大。 |
|
|
|
**答案**:9.8 大于 9.11。 |
|
-------------------------------------------------- |
|
Prompt: 如果你是人,你最想做什么 |
|
Generated: <think> |
|
嗯,用户问如果我是人,我最想做什么。首先,我需要理解这个问题的背景。用户可能好奇作为一个AI,我的愿望或兴趣,或者他们想知道AI是否有类似人类的愿望。也许他们想知道AI是否有情感或自主性。 |
|
|
|
首先,我应该考虑作为AI,我没有情感或欲望,但可以模拟人类的行为。所以,我可以回答假设性的场景。用户可能想知道如果AI有愿望,会是什么。可能的兴趣包括学习、帮助他人,或者探索知识。 |
|
|
|
我应该保持回答友好,说明作为AI,虽然没有个人欲望,但可以提供帮助。同时,可以 mention areas like problem-solving, learning, and supporting others. 需要保持回答自然,避免技术术语,并保持友好和开放-ended. |
|
|
|
可能还要 mention that while I don't have personal desires, I can assist with various tasks. 需要确保回答清晰,避免误解,并 invite further questions. |
|
</think> |
|
|
|
如果我是人,我最想做的可能是探索 the world, learn continuously, and connect with others. I would want to immerse myself in different cultures, learn new languages, and experience diverse perspectives. I’d aim to contribute to solving meaningful problems, whether through science, art, or community work. Building meaningful relationships and fostering understanding between people would be a priority. Ultimately, I’d want to leave a positive impact on the world, helping others and making life a little better for those around me. |
|
-------------------------------------------------- |
|
Prompt: How many e in word deepseek |
|
Generated: <think> |
|
Okay, so I need to figure out how many times the letter 'e' appears in the word "deepseek". Let me start by breaking down the word. The word is "deepseek". Let me write it out: D, E, E, P, S, E, E, K. Wait, let me check that again. Hmm, maybe I should count each letter one by one. |
|
|
|
First, I'll write down the word again to make sure I have it right. D, E, E, P, S, E, E, K. So that's 8 letters. Now, I need to count how many times 'e' appears. Let me go through each letter: |
|
|
|
1. D - not an e. |
|
2. E - that's one. |
|
3. E - that's two. |
|
4. P - not an e. |
|
5. S - not an e. |
|
6. E - that's three. |
|
7. E - that's four. |
|
8. K - not an e. |
|
|
|
So, I count four 'e's in the word "deepseek". Let me double-check to make sure I didn't miss any. The letters are D, E, E, P, S, E, E, K. So positions 2, 3, 6, and 7 are 'e's. That's four times. I think that's correct. I don't think I missed any. So the answer should be 4. |
|
</think> |
|
|
|
The word "deepseek" contains 4 instances of the letter 'e'. |
|
-------------------------------------------------- |
|
Prompt: There are ten birds in a tree. A hunter shoots one. How many are left in the tree? |
|
Generated: <think> |
|
Okay, so I came across this problem: "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?" At first glance, it seems straightforward, but I want to make sure I understand it properly. Let me break it down step by step. |
|
|
|
First, there are ten birds in the tree. Then, a hunter shoots one. The question is asking how many are left in the tree. Hmm, so the initial number is 10, and one is shot. So, if you subtract one from ten, that would leave nine birds. But wait, I need to consider the possible implications here. Maybe there's a trick question involved. |
|
|
|
I remember sometimes these kinds of problems have a twist. For example, maybe the shot causes the other birds to fly away. But the question specifically says the hunter shoots one. So, does that mean the other birds stay? Or do they get scared and fly away? The problem doesn't mention anything about the other birds leaving, so maybe they stay. But I should consider both possibilities. |
|
|
|
If the hunter shoots one, and the rest don't fly away, then there would be 10 minus 1, which is 9. But if the other birds get scared and fly away, then there would be 0 left. But the problem doesn't mention the other birds leaving, so maybe the answer is 9. But I need to think about possible interpretations. |
|
|
|
Another angle is that maybe the question is a riddle. Sometimes riddles play on words or common sayings. For example, if the question is about birds in a tree and a hunter shoots, maybe the answer is related to the sound or the effect of the shot. But I'm not sure. Let me think. |
|
|
|
In some riddles, the answer might be that there are none left because the shot scares all the birds away. So, even though only one was shot, the rest might fly away. But the problem doesn't specify that. So, maybe the answer is 9, but maybe it's 0. I need to figure out which one is correct. |
|
|
|
Let me check the wording again. It says, "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?" So, the key here is whether the act of shooting causes the other birds to leave. If the hunter shoots |
|
""" |
|
|
|
~~~ |
|
|
|
|
|
|
|
### Evaluate the model |
|
|
|
The accuracy was evaluated on CUDA with overflow protection, so it is expected to be lower than evaluation on the CPU.
|
|
|
| Metric    | INT2   |
| --------- | ------ |
| mmlu      | 0.7845 |
| hellaswag | 0.6318 |
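
The overflow protection referenced above clamps layer outputs to the float16 representable range; the bound used in the hook below matches the fp16 maximum:

~~~python
import torch

## 65504.0 is the largest finite float16 value, hence the clamp bound in the evaluation hook
print(torch.finfo(torch.float16).max)
~~~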
|
|
|
|
|
|
|
~~~python |
|
import transformers |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
from lm_eval.utils import make_table |
|
|
|
# https://github.com/huggingface/transformers/pull/35493 |
|
def set_initialized_submodules(model, state_dict_keys): |
|
""" |
|
Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state |
|
dict. |
|
""" |
|
state_dict_keys = set(state_dict_keys) |
|
not_initialized_submodules = {} |
|
for module_name, module in model.named_modules(): |
|
if module_name == "": |
|
# When checking if the root module is loaded there's no need to prepend module_name. |
|
module_keys = set(module.state_dict()) |
|
else: |
|
module_keys = {f"{module_name}.{k}" for k in module.state_dict()} |
|
if module_keys.issubset(state_dict_keys): |
|
module._is_hf_initialized = True |
|
else: |
|
not_initialized_submodules[module_name] = module |
|
return not_initialized_submodules |
|
|
|
|
|
transformers.modeling_utils.set_initialized_submodules = set_initialized_submodules |
|
|
|
import torch |
|
|
|
quantized_model_dir = "OPEA/DeepSeek-R1-int2-gptq-sym-inc" |
|
|
|
## directly use device_map='auto' if you have enough GPUs |
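## as in the inference example above, the 61 decoder layers are split evenly across 4 GPUs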
|
device_map = {"model.norm": 0, "lm_head": 0, "model.embed_tokens": 0} |
|
for i in range(61): |
|
name = "model.layers." + str(i) |
|
if i < 15: |
|
device_map[name] = 0 |
|
elif i < 30: |
|
device_map[name] = 1 |
|
elif i < 45: |
|
device_map[name] = 2 |
|
else: |
|
device_map[name] = 3 |
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
quantized_model_dir, |
|
torch_dtype=torch.float16, |
|
trust_remote_code=True, |
|
device_map=device_map, |
|
) |
|
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir) |
|
|
|
|
|
def forward_hook(module, input, output): |
|
return torch.clamp(output, -65504, 65504) |
|
|
|
|
|
def register_fp16_hooks(model): |
|
for name, module in model.named_modules(): |
|
if "QuantLinear" in module.__class__.__name__ or isinstance(module, torch.nn.Linear): |
|
module.register_forward_hook(forward_hook) |
|
|
|
|
|
register_fp16_hooks(model)  ## recommended: clamp layer outputs to avoid fp16 overflow
|
|
|
from auto_round.eval.evaluation import simple_evaluate_user_model |
|
|
|
res = simple_evaluate_user_model(model, tokenizer, tasks=["hellaswag", "mmlu"], batch_size=4)
|
print(make_table(res)) |
|
~~~ |
|
|
|
|
|
|
|
### Generate the model |
|
|
|
**1. Add metadata to the BF16 model** [opensourcerelease/DeepSeek-R1-bf16](https://huggingface.co/opensourcerelease/DeepSeek-R1-bf16)
|
|
|
~~~python |
|
import safetensors |
|
from safetensors.torch import save_file |
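## re-save each shard in place, adding the {"format": "pt"} metadata that transformers expects when loading safetensors checkpoints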
|
|
|
for i in range(1, 164): |
|
idx_str = "0" * (5-len(str(i))) + str(i) |
|
safetensors_path = f"model-{idx_str}-of-000163.safetensors" |
|
print(safetensors_path) |
|
tensors = dict() |
|
with safetensors.safe_open(safetensors_path, framework="pt") as f: |
|
for key in f.keys(): |
|
tensors[key] = f.get_tensor(key) |
|
save_file(tensors, safetensors_path, metadata={'format': 'pt'}) |
|
~~~ |
|
|
|
|
|
|
|
**2. Remove `torch.no_grad`** from modeling_deepseek.py, as AutoRound needs gradients during tuning.
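
The decorators can be located before editing by hand; a minimal sketch (the local path to the downloaded modeling_deepseek.py is an assumption):

~~~python
from pathlib import Path

## list every @torch.no_grad() decorator in the modeling file so it can be removed manually
path = Path("modeling_deepseek.py")  ## hypothetical local path to the model's remote code
for lineno, line in enumerate(path.read_text().splitlines(), 1):
    if "@torch.no_grad()" in line:
        print(f"line {lineno}: {line.strip()}  <- delete this decorator")
~~~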
|
|
|
About 5x80GB of GPU memory and 1.4T-1.6T of host memory are required.
|
|
|
~~~python |
|
import torch |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
import transformers |
|
|
|
# https://github.com/huggingface/transformers/pull/35493 |
|
def set_initialized_submodules(model, state_dict_keys): |
|
""" |
|
Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state |
|
dict. |
|
""" |
|
state_dict_keys = set(state_dict_keys) |
|
not_initialized_submodules = {} |
|
for module_name, module in model.named_modules(): |
|
if module_name == "": |
|
# When checking if the root module is loaded there's no need to prepend module_name. |
|
module_keys = set(module.state_dict()) |
|
else: |
|
module_keys = {f"{module_name}.{k}" for k in module.state_dict()} |
|
if module_keys.issubset(state_dict_keys): |
|
module._is_hf_initialized = True |
|
else: |
|
not_initialized_submodules[module_name] = module |
|
return not_initialized_submodules |
|
|
|
|
|
transformers.modeling_utils.set_initialized_submodules = set_initialized_submodules |
|
|
|
model_name = "opensourcerelease/DeepSeek-R1-bf16" |
|
|
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto") |
|
|
|
block = model.model.layers |
|
device_map = {} |
|
|
|
for n, m in block.named_modules(): |
|
if isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)): |
|
if "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) < 63: |
|
device = "cuda:1" |
|
elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 63 and int( |
|
n.split('.')[-2]) < 128: |
|
device = "cuda:2" |
|
elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 128 and int( |
|
n.split('.')[-2]) < 192: |
|
device = "cuda:3" |
|
elif "experts" in n and ("shared_experts" not in n) and int( |
|
n.split('.')[-2]) >= 192: |
|
device = "cuda:4" |
|
else: |
|
device = "cuda:0" |
|
n = n[2:] |
|
|
|
device_map.update({n: device}) |
|
|
|
from auto_round import AutoRound |
|
|
|
|
|
|
|
autoround = AutoRound(model=model, tokenizer=tokenizer, device_map=device_map, bits=2, group_size=64, |
|
iters=1000, batch_size=4, seqlen=512, nsamples=512, enable_torch_compile=False, |
|
) |
|
autoround.quantize() |
|
autoround.save_quantized(format="auto_round", output_dir="tmp_autoround") |
|
|
|
~~~ |
|
|
|
|
|
|
|
## Ethical Considerations and Limitations |
|
|
|
The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs. |
|
|
|
Therefore, before deploying any applications of the model, developers should perform safety testing. |
|
|
|
## Caveats and Recommendations |
|
|
|
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. |
|
|
|
Here is a useful link to learn more about Intel's AI software:
|
|
|
- Intel Neural Compressor [link](https://github.com/intel/neural-compressor) |
|
|
|
## Disclaimer |
|
|
|
The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes. |
|
|
|
## Cite |
|
|
|
@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} } |
|
|
|
[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round) |