Meta-Llama-3-70B-AQLM-PV-1Bit-1x16 produces garbage output
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
import torch

# Load the 1-bit AQLM-PV quantized Llama-3-70B checkpoint onto the GPU
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16", trust_remote_code=True, torch_dtype=torch.float16,
).cuda()
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16")

# Tokenize a simple prompt and stream the continuation
inputs = tokenizer(["An increasing sequence: one,"], return_tensors="pt")["input_ids"].cuda()
streamer = TextStreamer(tokenizer)
_ = quantized_model.generate(inputs, streamer=streamer, max_new_tokens=120)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
An increasing sequence:
/usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:1965: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
/usr/local/lib/python3.11/dist-packages/aqlm/inference_kernels/cuda_kernel.py (lines 20, 33, 48, 62, 75, 88): FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch. (Raised once per @torch.library.impl_abstract decorator: aqlm::code1x16_matmat, aqlm::code1x16_matmat_dequant, aqlm::code1x16_matmat_dequant_transposed, aqlm::code2x8_matmat, aqlm::code2x8_matmat_dequant, aqlm::code2x8_matmat_dequant_transposed.)
one,augaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaaugaNewPropNewPropNewPropĠNewPropĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ
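The warnings above suggest passing the attention_mask and a pad_token_id explicitly. Below is a minimal sketch of the same call with both supplied; it reuses quantized_model, tokenizer and streamer from the snippet above, and as the later posts show it silences the warnings but may not by itself fix the degenerate output.

# Keep the full tokenizer encoding so the attention mask is available.
enc = tokenizer(["An increasing sequence: one,"], return_tensors="pt").to("cuda")

_ = quantized_model.generate(
    input_ids=enc["input_ids"],
    attention_mask=enc["attention_mask"],   # silences the attention-mask warning
    pad_token_id=tokenizer.eos_token_id,    # silences the pad-token warning
    streamer=streamer,
    max_new_tokens=120,
)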
from transformers import pipeline, AutoTokenizer

model_name = "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Manually set a chat template (modify based on your model's expected format)
tokenizer.chat_template = "[INST] {user_message} [/INST] "

# Load pipeline
pipe = pipeline("text-generation", model=model_name, trust_remote_code=True, device_map="auto", tokenizer=tokenizer)

# Format message correctly
messages = [{"role": "user", "content": "Who are you?"}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
output = pipe(formatted_prompt, max_new_tokens=100)
print(output)
Loading checkpoint shards: 100% 3/3 [01:03<00:00, 20.22s/it]
Device set to use cuda:0
Setting pad_token_id to eos_token_id:128001 for open-end generation.
/usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:1965: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
/usr/local/lib/python3.11/dist-packages/aqlm/inference_kernels/cuda_kernel.py (lines 20, 33, 48, 62, 75, 88): FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch. (Raised once per @torch.library.impl_abstract decorator: aqlm::code1x16_matmat, aqlm::code1x16_matmat_dequant, aqlm::code1x16_matmat_dequant_transposed, aqlm::code2x8_matmat, aqlm::code2x8_matmat_dequant, aqlm::code2x8_matmat_dequant_transposed.)
[{'generated_text': '[INST] {user_message} [/INST] Pent Weg Weg Weg Weg Weg Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Blink Blink Blink Blink Blink Blink Blink Blink Blink Blink Blink Blink Blink Blinkaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklı'}
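The literal {user_message} in generated_text points at the problem: the string assigned to tokenizer.chat_template is not real Jinja (Jinja uses {{ }}), so apply_chat_template renders it verbatim and the question "Who are you?" never reaches the model. A minimal sketch of two ways around this; the [INST] wrapper below is an illustrative Llama-2-style template, not the one shipped with this model, and meta-llama/Meta-Llama-3-70B-Instruct is gated, so Option 1 assumes you have access. Note too that if this AQLM checkpoint was quantized from the base (non-instruct) Llama 3 model, no chat template will make it behave like an assistant.

# Option 1: reuse the chat template that ships with the instruct tokenizer instead of overriding it.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B-Instruct")

# Option 2: if you do set a template yourself, use real Jinja so the message content is substituted.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "[INST] {{ message['content'] }} [/INST] "
    "{% endfor %}"
)

messages = [{"role": "user", "content": "Who are you?"}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(formatted_prompt)  # now contains the question, not the {user_message} placeholder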
Is there a way to improve the model's answers?
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
import torch

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16", torch_dtype=torch.float16, device_map="cuda"  # you can swap in another model if you prefer
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B-Instruct")

# Prepare the inputs, this time including the attention_mask
inputs = tokenizer(["An increasing sequence: one,"], return_tensors="pt")
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

# Configure the generation options
streamer = TextStreamer(tokenizer)
generation_config = {
    "max_new_tokens": 40,             # generate more new tokens
    "temperature": 0.2,               # lower temperature for more predictable output
    "top_k": 50,                      # top-k sampling cutoff
    "top_p": 0.9,                     # nucleus sampling cutoff
    "do_sample": True,                # enable sampling
    "return_dict_in_generate": True,
    "output_scores": True,
    "num_beams": 1                    # keep num_beams at 1 to avoid issues with the streamer
}

# Generate text with these settings
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    streamer=streamer,
    **generation_config
)

# Print the generated outputs
for output in outputs:
    print(output)
one,veticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticavetica unconveticavetica uncon uncon
From v4.47 onwards, when a model cache is to be returned, generate will return a Cache instance by default (as opposed to the legacy tuple-of-tuples format). If you want to keep returning the legacy format, please set return_legacy_cache=True.
sequences
scores
past_key_values
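With return_dict_in_generate=True, generate returns a ModelOutput, so iterating over it yields the field names printed above rather than text. A minimal sketch of decoding the actual tokens, reusing outputs and tokenizer from the snippet above:

# outputs.sequences holds the prompt plus the generated token ids; decode them to text.
for text in tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True):
    print(text)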
# Prepare the inputs, again including the attention_mask
inputs = tokenizer(["An increasing sequence: one,"], return_tensors="pt")
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

# Configure the generation options
streamer = TextStreamer(tokenizer)
generation_config = {
    "max_new_tokens": 40,             # generate more new tokens
    "temperature": 0.2,               # lower temperature for more predictable output
    "top_k": 20,                      # tighter top-k cutoff
    "top_p": 0.4,                     # tighter nucleus sampling cutoff
    "do_sample": True,                # enable sampling
    "return_dict_in_generate": True,
    "output_scores": True,
    "return_legacy_cache": True,
    "num_beams": 1                    # keep num_beams at 1 to avoid issues with the streamer
}

# Generate text with these settings
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    streamer=streamer,
    **generation_config
)

# Print the generated outputs
for output in outputs:
    print(output)
Setting pad_token_id to eos_token_id:128001 for open-end generation.
<|begin_of_text|>An increasing sequence: one,treveticaveticavetica542542542542542542542542542542542542542542542542542542542542542542542542542542542542542542542542542542542542
sequences
scores
past_key_values
from transformers import pipeline, AutoTokenizer

model_name = "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Manually set a chat template (modify based on your model's expected format)
tokenizer.chat_template = "[INST] {user_message} [/INST] "  # This template might need adjustment

# Load pipeline with the tokenizer
pipe = pipeline("text-generation", model=model_name, trust_remote_code=True, device_map="auto", tokenizer=tokenizer)

# Format message correctly
messages = [{"role": "user", "content": "Who are you?"}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
output = pipe(formatted_prompt, max_new_tokens=100)  # Adjust max_new_tokens as needed
print(output)
[{'generated_text': '[INST] {user_message} [/INST] Pent Weg Weg Weg Weg Weg Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Slots Blink Blink Blink Blink Blink Blink Blink Blink Blink Blink Blink Blink Blink Blinkaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklıaklı'}]
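On the original question of improving the answers: the repeated tokens (Slots, Blink, aklı, 542, vetica) are classic degeneration, and beyond fixing the prompt/template you can also penalize repetition at decode time. A sketch using repetition_penalty and no_repeat_ngram_size with the pipeline from above; the values are assumptions to experiment with, not tested recommendations for this checkpoint, and the extreme 1-bit quantization may still limit output quality.

output = pipe(
    formatted_prompt,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,           # illustrative values, tune for your use case
    top_p=0.9,
    repetition_penalty=1.2,    # discourage re-emitting recent tokens
    no_repeat_ngram_size=3,    # block verbatim 3-gram repeats
)
print(output[0]["generated_text"])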