MODEL DESCRIPTION

A simple compression of the meta-llama/Llama-3.3-70B-Instruct model using the AWQ (Activation-aware Weight Quantization) method with 4-bit GEMM kernels.

Loading the model with AutoModelForCausalLM

import torch
from transformers import AutoModelForCausalLM

model_name = "jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM"

# The AWQ checkpoint loads directly through transformers (requires the autoawq package)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

print(model)
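
A quick generation sanity check after the model is loaded, as a minimal sketch (the chat prompt and generation settings below are illustrative, not part of the original recipe):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Llama 3.3 is an instruct model, so build the prompt with its chat template
messages = [{"role": "user", "content": "Summarize AWQ quantization in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=128, do_sample=False)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))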

Serving this model with vLLM via Docker

docker run --runtime nvidia --gpus all \
  --env "HUGGING_FACE_HUB_TOKEN=.........." \
  -p 8000:8000 \
  --ipc=host \
  vllm/vllm-openai:latest \
  --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
  --gpu-memory-utilization 0.9 \
  --swap-space 0 \
  --max-seq-len-to-capture 512 \
  --max-num-seqs 1 \
  --api-key "token-abc123" \
  --max-model-len 8000 \
  --trust-remote-code \
  --enable-chunked-prefill \
  --max-num-batched-tokens 1024
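
Once the container is up, vLLM serves an OpenAI-compatible API on port 8000. A minimal client sketch, assuming the openai Python package and the --api-key value from the command above (the prompt is illustrative):

from openai import OpenAI

# Point the OpenAI client at the local vLLM server started above
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",
    messages=[{"role": "user", "content": "Hello! Who are you?"}],
    max_tokens=64,
)
print(response.choices[0].message.content)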

A method to merge LoRA adapter weights into the base model and quantize the result with AWQ

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os
from awq import AutoAWQForCausalLM
import gc

def clear_gpu_memory():
    """Clear GPU memory and cache"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device: str = "auto"):
    """Merge adapter with base model and save"""
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map=device
    )
    
    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        device_map=device
    )
    
    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()
    
    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)
    
    # Clear model from GPU memory
    del base_model
    del adapter_model
    del merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")

def quantize_model(merged_path: str, quantized_path: str):
    """Quantize the merged model with AutoAWQ (4-bit, GEMM kernels)"""
    print("Starting quantization...")
    quant_config = {
        "w_bit": 4,             # 4-bit weights
        "q_group_size": 128,    # quantization group size
        "zero_point": True,     # asymmetric quantization with zero points
        "version": "GEMM"       # GEMM kernels, suited to batched inference
    }
    # Optionally, "modules_to_not_convert" can list module names to keep in FP16
    
    # Load the merged model and its tokenizer (the tokenizer is needed
    # to build the calibration data used by AWQ)
    print("Loading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(
        merged_path,
        low_cpu_mem_usage=True,
        use_cache=False
    )
    tokenizer = AutoTokenizer.from_pretrained(merged_path)
    
    # Run AWQ calibration and quantize the weights in place
    model.quantize(tokenizer, quant_config=quant_config)
    
    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)
    
    # Clear GPU memory again
    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")

def process_model(base_model_path: str, adapter_path: str, output_dir: str):
    """Main processing function"""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")
    
    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)
        
        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)
        
        print("Process completed successfully!")
        return True
        
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False

if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"
    
    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR
    )
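
After the script finishes, the quantized folder can be spot-checked locally before uploading. A minimal sketch, assuming the OUTPUT_DIR used above and an illustrative prompt:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

quantized_path = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"

tokenizer = AutoTokenizer.from_pretrained(quantized_path)
model = AutoModelForCausalLM.from_pretrained(
    quantized_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))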