# Model Description

A simple 4-bit AWQ (GEMM) compression of the Llama-3.3-70B-Instruct model.
## Loading the model with AutoModelForCausalLM

```python
from transformers import AutoModelForCausalLM

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

# device_map="auto" spreads the 4-bit weights across the available GPUs
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
print(model)
```
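For a quick end-to-end check, here is a minimal generation sketch that pairs the model with its tokenizer. It assumes the tokenizer ships in the same repository; the prompt and `max_new_tokens` value are placeholders.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Build a chat-formatted prompt and generate a short completion
messages = [{"role": "user", "content": "Summarize what AWQ quantization does in one sentence."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```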
Loading this model with VLLM via docker
docker run --runtime nvidia --gpus all \
--env "HUGGING_FACE_HUB_TOKEN = .........." \
-p 8000:8000 \
--ipc=host --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
--gpu-memory-utilization 0.9 \
--swap-space 0 \
--max-seq-len-to-capture 512 \
--max-num-seqs 1 \
--api-key "token-abc123" \
--max-model-len 8000 \
--trust-remote-code --enable-chunked-prefill \
--max_num_batched_tokens 1024
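Once the container is running, vLLM exposes an OpenAI-compatible API on port 8000. A minimal client sketch, assuming the `openai` Python package is installed and using the `--api-key` value from the command above:

```python
from openai import OpenAI

# Point the client at the local vLLM server started above
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```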
## A method to merge adapter weights into the base model and quantize
```python
import gc
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from awq import AutoAWQForCausalLM


def clear_gpu_memory():
    """Clear GPU memory and cache"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device: str = "cuda"):
    """Merge adapter with base model and save"""
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map=device,
    )

    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        device_map=device,
    )

    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)
    # Save the tokenizer alongside the merged weights so the quantization step can find it
    AutoTokenizer.from_pretrained(base_model_path).save_pretrained(merged_path)

    # Clear model from GPU memory
    del base_model
    del adapter_model
    del merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")
def quantize_model(merged_path: str, quantized_path: str, device: str = "cuda"):
    """Quantize the merged model with AutoAWQ"""
    print("Starting quantization...")
    # AutoAWQ quantization config: 4-bit weights, group size 128, GEMM kernels.
    # A "modules_to_not_convert" list could be added to keep selected layers in
    # fp16, but it is not needed here since merge_and_unload() leaves no
    # adapter/LoRA modules behind.
    quant_config = {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM",
    }

    # Load and quantize
    print("Loading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(
        merged_path,
        device_map=device,
    )
    tokenizer = AutoTokenizer.from_pretrained(merged_path)

    # Run AWQ calibration and quantization (uses AutoAWQ's default calibration data)
    model.quantize(tokenizer, quant_config=quant_config)

    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)

    # Clear GPU memory again
    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")
def process_model(base_model_path: str, adapter_path: str, output_dir: str):
    """Main processing function"""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")

    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)

        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)

        print("Process completed successfully!")
        return True
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False
if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"

    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR,
    )
```
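To confirm the quantized output actually loads and generates, a small verification sketch can reload the saved folder with AutoAWQ. The path below assumes the `OUTPUT_DIR` used above; the prompt is only a placeholder.

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"

# Reload the quantized checkpoint and run a short generation as a sanity check
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path)

inputs = tokenizer("The capital of France is", return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```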