---
license: llama3.3
base_model:
- meta-llama/Llama-3.3-70B-Instruct
language:
- en
- hi
- it
- de
- fr
- th
- es
- pt
library_name: transformers
tags:
- meta
- pytorch
- llama
---
# Model Description
A simple 4-bit AWQ (GEMM) quantization of the meta-llama/Llama-3.3-70B-Instruct model.

## Loading the model with AutoModelForCausalLM
```python
import torch
from transformers import AutoModelForCausalLM

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

# Loading an AWQ checkpoint through transformers requires the autoawq package
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

print(model)
```
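
As a quick smoke test after loading, here is a minimal generation sketch; it assumes the repository ships the tokenizer with the standard Llama 3 chat template, and it reuses `model` and `model_name` from the snippet above.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build a chat-formatted prompt and generate a short reply
messages = [{"role": "user", "content": "Summarize AWQ quantization in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```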

## Serving this model with vLLM via Docker
The command below uses the official `vllm/vllm-openai` image; everything after the image name is passed as arguments to the vLLM OpenAI-compatible server.
```
docker run --runtime nvidia --gpus all \
    --env "HUGGING_FACE_HUB_TOKEN=.........." \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
    --gpu-memory-utilization 0.9 \
    --swap-space 0 \
    --max-seq-len-to-capture 512 \
    --max-num-seqs 1 \
    --api-key "token-abc123" \
    --max-model-len 8000 \
    --trust-remote-code \
    --enable-chunked-prefill \
    --max-num-batched-tokens 1024
```
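
Once the container is running, vLLM exposes an OpenAI-compatible API on port 8000. A minimal client sketch, assuming the `openai` Python package and the `--api-key` value from the command above:

```python
from openai import OpenAI

# Point the client at the local vLLM server
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```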

## A method to merge adapter weights into the base model and quantize
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os
from awq import AutoAWQForCausalLM
import gc

def clear_gpu_memory():
    """Clear GPU memory and cache"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device: str = "cuda"):
    """Merge a PEFT adapter into the base model and save the result"""
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map=device
    )

    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        device_map=device
    )

    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)

    # Save the tokenizer alongside the merged weights so the
    # quantization step can reuse it for AWQ calibration
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    tokenizer.save_pretrained(merged_path)

    # Clear model from GPU memory
    del base_model
    del adapter_model
    del merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")

def quantize_model(merged_path: str, quantized_path: str, device: str = "cuda"):
    """Quantize the merged model with AutoAWQ"""
    print("Starting quantization...")
    quant_config = {
        "w_bit": 4,            # 4-bit weights
        "q_group_size": 128,   # quantization group size
        "zero_point": True,    # asymmetric (zero-point) quantization
        "version": "GEMM",     # GEMM kernel layout
        "modules_to_not_convert": [
            "attention",    # keep attention in fp16
            "rotary_emb",   # keep rotary embeddings in fp16
            "norm",         # keep normalization in fp16
            "adapter",      # keep adapter weights in fp16
            "lora"          # keep any remaining LoRA weights in fp16
        ]
    }

    # Load the merged model and its tokenizer
    print("Loading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(
        merged_path,
        device_map=device
    )
    tokenizer = AutoTokenizer.from_pretrained(merged_path)

    # Run AWQ calibration and quantization (uses the library's default calibration data)
    model.quantize(tokenizer, quant_config=quant_config)

    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)

    # Clear GPU memory again
    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")

def process_model(base_model_path: str, adapter_path: str, output_dir: str):
    """Main processing function"""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")
    
    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)
        
        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)
        
        print("Process completed successfully!")
        return True
        
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False

if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"
    
    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR
    )
```
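
## Verifying the quantized output
After the script finishes, the quantized folder can be loaded back for a quick end-to-end check. A minimal sketch, assuming the output paths from the configuration above and that autoawq is installed so transformers can load the AWQ weights:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

quant_path = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"

# Load the quantized checkpoint and its tokenizer
model = AutoModelForCausalLM.from_pretrained(
    quant_path, torch_dtype=torch.float16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)

# Generate a few tokens to confirm the weights load and run
inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```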