jsbaicenter
/

Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM

@@ -4,4 +4,141 @@ base_model:
 - meta-llama/Llama-3.3-70B-Instruct
 ---
 # MODEL DESCRIPTION
-Simple compression of llama-3.3-70B-instruct model using AWQ method.

 - meta-llama/Llama-3.3-70B-Instruct
 ---
 # MODEL DESCRIPTION
+A simple compression of the Llama-3.3-70B-Instruct model using the AWQ method.
+## Loading the model with AutoModelForCausalLM
+```python
+from transformers import AutoModelForCausalLM
+model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"
+model = AutoModelForCausalLM.from_pretrained(model_name)
+print(model)
+```
+## Loading this model with vLLM via Docker
+```
+docker run --runtime nvidia --gpus all --env "HUGGING_FACE_HUB_TOKEN = .........."  -p 8000:8000 \
+--ipc=host --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
+--gpu-memory-utilization 0.9 --swap-space 0 \
+--max-seq-len-to-capture 512 --max-num-seqs 1 --api-key "token-abc123" --max-model-len 8000 \
+--trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024
+```
+## Merging adapter weights into the base model and quantizing the result
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+import os
+from awq import AutoAWQForCausalLM
+import gc
+def clear_gpu_memory():
+    """Clear GPU memory and cache"""
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()
+def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device: str = "cuda"):
+    """Merge adapter with base model and save"""
+    print("Loading base model...")
+    base_model = AutoModelForCausalLM.from_pretrained(
+        base_model_path,
+        torch_dtype=torch.float16,
+        device_map=device
+    )
+    print("Loading adapter...")
+    adapter_model = PeftModel.from_pretrained(
+        base_model,
+        adapter_path,
+        device_map=device
+    )
+    print("Merging adapter with base model...")
+    merged_model = adapter_model.merge_and_unload()
+    print("Saving merged model...")
+    merged_model.save_pretrained(merged_path)
+    # Clear model from GPU memory
+    del base_model
+    del adapter_model
+    del merged_model
+    clear_gpu_memory()
+    print("Cleared GPU memory after merge")
+def quantize_model(merged_path: str, quantized_path: str, device: str = "cuda"):
+    """Quantize the merged model"""
+    print("Starting quantization...")
+    quant_config = {
+        "bits": 4,
+        "group_size": 128,
+        "zero_point": True,
+        "modules_to_not_convert": [
+            "attention",    # keep attention in fp16
+            "rotary_emb",  # keep embeddings in fp16
+            "norm",        # keep normalization in fp16
+            "adapter",     # keep adapter weights in fp16
+            "lora"         # keep any remaining LoRA weights in fp16
+        ]
+    }
+    # Load and quantize
+    print("Loading merged model for quantization...")
+    quantizer = AutoAWQForCausalLM.from_pretrained(
+        merged_path,
+        **quant_config,
+        device_map=device
+    )
+    quantized_model = quantizer.quantize(
+        examples=128,
+        verify_loading=True
+    )
+    print("Saving quantized model...")
+    quantized_model.save_pretrained(quantized_path)
+    # Clear GPU memory again
+    del quantizer
+    del quantized_model
+    clear_gpu_memory()
+    print("Cleared GPU memory after quantization")
+def process_model(base_model_path: str, adapter_path: str, output_dir: str):
+    """Main processing function"""
+    os.makedirs(output_dir, exist_ok=True)
+    merged_path = os.path.join(output_dir, "merged_model")
+    quantized_path = os.path.join(output_dir, "quantized_model")
+    try:
+        # Step 1: Merge
+        merge_model(base_model_path, adapter_path, merged_path)
+        # Step 2: Quantize
+        quantize_model(merged_path, quantized_path)
+        print("Process completed successfully!")
+        return True
+    except Exception as e:
+        print(f"Error during processing: {str(e)}")
+        clear_gpu_memory()  # Clear memory if there's an error
+        return False
+if __name__ == "__main__":
+    # Configuration
+    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
+    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
+    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"
+    # Run the process
+    success = process_model(
+        base_model_path=BASE_MODEL_PATH,
+        adapter_path=ADAPTER_PATH,
+        output_dir=OUTPUT_DIR
+    )
+```