---
license: llama3.3
base_model:
- meta-llama/Llama-3.3-70B-Instruct
language:
- en
- hi
- it
- de
- fr
- th
- es
- pt
library_name: transformers
tags:
- meta
- pytorch
- llama
---
# MODEL DESCRIPTION
A simple compression of the Llama-3.3-70B-Instruct model to 4-bit weights using the AWQ method (GEMM kernels).
## Loading the model with AutoModelForCausalLM
```python
from transformers import AutoModelForCausalLM

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

# Loading an AWQ checkpoint through transformers requires the `autoawq` package;
# device_map="auto" spreads the 4-bit weights across the available GPUs.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
print(model)
```
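For a quick end-to-end check, the sketch below runs a single chat turn through the quantized checkpoint; the prompt and generation settings are only illustrative:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Format one user turn with the Llama 3.3 chat template.
messages = [{"role": "user", "content": "Explain AWQ quantization in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=128, do_sample=False)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```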
## Loading this model with vLLM via Docker
This starts a vLLM OpenAI-compatible server using the official `vllm/vllm-openai` image (substitute your own Hugging Face Hub token):
```bash
docker run --runtime nvidia --gpus all \
    --env "HUGGING_FACE_HUB_TOKEN=.........." \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
    --gpu-memory-utilization 0.9 \
    --swap-space 0 \
    --max-seq-len-to-capture 512 \
    --max-num-seqs 1 \
    --api-key "token-abc123" \
    --max-model-len 8000 \
    --trust-remote-code \
    --enable-chunked-prefill \
    --max-num-batched-tokens 1024
```
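Once the container is up, vLLM exposes an OpenAI-compatible API on port 8000. A minimal client sketch, assuming the `openai` Python package and the `--api-key` and `--model` values used above:
```python
from openai import OpenAI

# Point the standard OpenAI client at the local vLLM server started above.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",  # must match --model
    messages=[{"role": "user", "content": "Hello! Which model are you?"}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```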
## A method to merge adapter weights into the base model and quantize
```python
import gc
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from awq import AutoAWQForCausalLM


def clear_gpu_memory():
    """Clear GPU memory and cache."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device: str = "cuda"):
    """Merge a LoRA adapter into the base model and save the result."""
    print("Loading base model...")
    # For the 70B model, device_map="auto" can be used instead to shard across GPUs/CPU.
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map=device,
    )

    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        device_map=device,
    )

    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)
    # Save the tokenizer alongside the merged weights so the quantization
    # step can load everything from the same directory.
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    tokenizer.save_pretrained(merged_path)

    # Clear the models from GPU memory
    del base_model
    del adapter_model
    del merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")


def quantize_model(merged_path: str, quantized_path: str, device: str = "cuda"):
    """Quantize the merged model with AutoAWQ (4-bit, GEMM kernels)."""
    print("Starting quantization...")
    quant_config = {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM",
        "modules_to_not_convert": [
            "attention",   # keep attention in fp16
            "rotary_emb",  # keep rotary embeddings in fp16
            "norm",        # keep normalization in fp16
            "adapter",     # keep adapter weights in fp16
            "lora",        # keep any remaining LoRA weights in fp16
        ],
    }

    # Load the merged model and its tokenizer, then run AWQ calibration
    print("Loading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(merged_path, device_map=device)
    tokenizer = AutoTokenizer.from_pretrained(merged_path)

    model.quantize(tokenizer, quant_config=quant_config)

    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)

    # Clear GPU memory again
    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")


def process_model(base_model_path: str, adapter_path: str, output_dir: str):
    """Main processing function: merge the adapter, then quantize."""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")

    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)
        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)
        print("Process completed successfully!")
        return True
    except Exception as e:
        print(f"Error during processing: {e}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False


if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"

    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR,
    )
```
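As a quick sanity check after the script finishes, the quantized folder can be reloaded with AutoAWQ. This is a minimal sketch; the path simply follows the `OUTPUT_DIR` used above:
```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quantized_path = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"

# Reload the AWQ checkpoint produced by process_model() and generate a few tokens.
model = AutoAWQForCausalLM.from_quantized(quantized_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(quantized_path)

inputs = tokenizer("The capital of France is", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```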