uyiosa committed · verified · Commit 6555649 · Parent(s): 9c421f8

Update README.md

Files changed (1): README.md (+138 -1)
README.md CHANGED

base_model:
- meta-llama/Llama-3.3-70B-Instruct
---

# MODEL DESCRIPTION
A 4-bit AWQ quantization (GEMM kernels) of the meta-llama/Llama-3.3-70B-Instruct model.

## Loading the model with AutoModelForCausalLM
```python
from transformers import AutoModelForCausalLM

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

# The AWQ quantization config is read from the checkpoint;
# device_map="auto" spreads the layers across the available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)

print(model)
```
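
A minimal generation sketch building on the load above; the prompt, chat-template call, and decoding settings are illustrative rather than part of the original card:
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build a chat-formatted prompt and generate a short reply.
messages = [{"role": "user", "content": "Explain AWQ quantization in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=128, do_sample=False)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```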

## Loading this model with vLLM via Docker

The command below serves the weights with vLLM's OpenAI-compatible server (the official `vllm/vllm-openai` image):
```
docker run --runtime nvidia --gpus all \
    --env "HUGGING_FACE_HUB_TOKEN=.........." \
    -p 8000:8000 --ipc=host \
    vllm/vllm-openai:latest \
    --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
    --gpu-memory-utilization 0.9 --swap-space 0 \
    --max-seq-len-to-capture 512 --max-num-seqs 1 \
    --api-key "token-abc123" --max-model-len 8000 \
    --trust-remote-code --enable-chunked-prefill \
    --max-num-batched-tokens 1024
```
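
Once the container is running, the server can be queried with any OpenAI-compatible client; a small sketch using the `openai` Python package (the model name and API key simply mirror the flags above):
```python
from openai import OpenAI

# Point the client at the local vLLM server started above.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```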

## A method to merge adapter weights into the base model and quantize
```python
import gc
import os

import torch
from awq import AutoAWQForCausalLM
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

def clear_gpu_memory():
    """Clear GPU memory and cache."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device_map: str = "auto"):
    """Merge a LoRA adapter into the base model and save the result."""
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map=device_map,
    )

    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(base_model, adapter_path)

    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)
    # Save the tokenizer next to the merged weights so the quantization
    # step can load everything from a single directory.
    AutoTokenizer.from_pretrained(base_model_path).save_pretrained(merged_path)

    # Clear the models from GPU memory before quantization
    del base_model, adapter_model, merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")

def quantize_model(merged_path: str, quantized_path: str):
    """Quantize the merged model with AutoAWQ (4-bit, GEMM kernels)."""
    print("Starting quantization...")
    # AutoAWQ quantization config: 4-bit weights, group size 128,
    # zero-point (asymmetric) quantization, GEMM kernel layout.
    # AWQ only quantizes the linear layers; embeddings and norms stay in fp16.
    quant_config = {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM",
    }

    print("Loading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(merged_path, low_cpu_mem_usage=True)
    tokenizer = AutoTokenizer.from_pretrained(merged_path)

    # Calibrate and quantize (uses AutoAWQ's default calibration data)
    model.quantize(tokenizer, quant_config=quant_config)

    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)

    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")

def process_model(base_model_path: str, adapter_path: str, output_dir: str) -> bool:
    """Main processing function: merge, then quantize."""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")

    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)

        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)

        print("Process completed successfully!")
        return True

    except Exception as e:
        print(f"Error during processing: {e}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False

if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"

    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR,
    )
```
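
After the script finishes, the quantized output can be smoke-tested by loading it back with transformers; the directory below simply mirrors the `OUTPUT_DIR` used above:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

quant_dir = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"

tokenizer = AutoTokenizer.from_pretrained(quant_dir)
model = AutoModelForCausalLM.from_pretrained(quant_dir, torch_dtype="auto", device_map="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```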