How to load the model on Kaggle or Colab
Follow the script below: it installs the dependencies, loads DeepSeek-R1-Distill-Qwen-14B in 4-bit, and runs it on a sample of AIME 2025 problems.
!pip install bitsandbytes datasets accelerate  # accelerate is required for device_map="auto"
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Model Name
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"

# Enable 4-bit Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,  # Use FP16 for computation
    bnb_4bit_use_double_quant=True,        # Enable double quantization
    bnb_4bit_quant_type="nf4"              # Most memory-efficient format
)
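If your runtime has more GPU memory to spare than a single T4 provides, an 8-bit configuration is a drop-in alternative that trades some of the memory savings for slightly better numerical fidelity. This is only a sketch alongside the 4-bit setup above, not part of the original script:

# Optional: 8-bit quantization instead of 4-bit (uses roughly twice the VRAM of NF4)
bnb_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True
)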
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load Model with Quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,  # Use 4-bit quantization
    device_map="auto"                # Automatically distributes layers across GPUs
)
# Print Model Device Mapping
print("\n🚀 Model successfully loaded across GPUs! 🚀")
print(model.hf_device_map)
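To confirm that quantization actually shrank the model, you can also ask transformers for the in-memory footprint after loading. This is a quick sanity check I like to add; the exact number will vary with your transformers and bitsandbytes versions:

# Report the quantized model's memory footprint in GB
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model memory footprint: {footprint_gb:.2f} GB")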
# Load the AIME2025 Dataset
dataset = load_dataset("opencompass/AIME2025", split="train")

# Take the first 12 problems for testing (use dataset.shuffle().select(range(12)) for a random sample)
sampled_data = dataset.select(range(12))

# Extract problem and solution columns
problems = sampled_data["question"]
solutions = sampled_data["answer"]
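Before spending GPU time on generation, it is worth printing one problem/answer pair to confirm the column names match what the script expects. A quick check, assuming the dataset exposes question and answer fields as above:

# Peek at the first sampled problem and its reference answer
print("Sample problem:", problems[0])
print("Reference answer:", solutions[0])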
# Define Model Inference Function
def generate_response(model, tokenizer, prompt, max_new_tokens=19000, temperature=0.7):
    """Generate a response from the model for a given prompt."""
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3200).to(model.device)
    # Generate text
    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,  # Lower this if out-of-memory occurs
            temperature=temperature,
            attention_mask=inputs.attention_mask,
            do_sample=True,
            top_k=40,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)
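DeepSeek-R1 distill checkpoints are chat models, so wrapping the raw problem in the tokenizer's chat template before calling generate_response usually produces cleaner reasoning traces than feeding the bare question. A usage sketch, assuming the tokenizer ships a chat template (the distill checkpoints do):

# Optional: format a problem with the model's chat template before generation
messages = [{"role": "user", "content": problems[0]}]
chat_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,               # return a string, not token IDs
    add_generation_prompt=True    # append the assistant turn marker
)
sample_output = generate_response(model, tokenizer, chat_prompt)
print(sample_output)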
# Generate Model Responses and Print Each Response
for i, problem in enumerate(problems):
    print(f"\n🟢 Generating response for Problem {i+1}/{len(problems)}")
    print(f"\n🔹 **Problem {i+1}:**\n{problem}")
    model_output = generate_response(model, tokenizer, problem)
    print(f"\n🤖 **Model Output:**\n{model_output}")
    print("\n" + "-"*80)