How to load the model on Kaggle or Colab
Follow the script below: it installs the dependencies, loads DeepSeek-R1-Distill-Qwen-14B in 4-bit, and runs it on a sample of AIME 2025 problems.
!pip install bitsandbytes datasets accelerate  # accelerate is required for device_map="auto"
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Model Name
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"

# Enable 4-bit Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,  # Use FP16 for computation
    bnb_4bit_use_double_quant=True,        # Enable double quantization
    bnb_4bit_quant_type="nf4"              # Most memory-efficient format
)
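If your runtime has more GPU memory to spare than a single T4 provides, an 8-bit configuration is a drop-in alternative that trades some of the memory savings for slightly better numerical fidelity. This is only a sketch alongside the 4-bit setup above, not part of the original script:

# Optional: 8-bit quantization instead of 4-bit (uses roughly twice the VRAM of NF4)
bnb_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True
)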
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load Model with Quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,  # Use 4-bit quantization
    device_map="auto"                # Automatically distributes layers across GPUs
)
# Print Model Device Mapping
print("\n🚀 Model successfully loaded across GPUs! 🚀")
print(model.hf_device_map)
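To confirm that quantization actually shrank the model, you can also ask transformers for the in-memory footprint after loading. This is a quick sanity check I like to add; the exact number will vary with your transformers and bitsandbytes versions:

# Report the quantized model's memory footprint in GB
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model memory footprint: {footprint_gb:.2f} GB")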
# Load the AIME2025 Dataset
dataset = load_dataset("opencompass/AIME2025", split="train")

# Take the first 12 problems for testing (use dataset.shuffle().select(range(12)) for a random sample)
sampled_data = dataset.select(range(12))

# Extract problem and solution columns
problems = sampled_data["question"]
solutions = sampled_data["answer"]
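Before spending GPU time on generation, it is worth printing one problem/answer pair to confirm the column names match what the script expects. A quick check, assuming the dataset exposes question and answer fields as above:

# Peek at the first sampled problem and its reference answer
print("Sample problem:", problems[0])
print("Reference answer:", solutions[0])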
# Define Model Inference Function
def generate_response(model, tokenizer, prompt, max_new_tokens=19000, temperature=0.7):
    """Generate a response from the model for a given prompt."""
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3200).to(model.device)
    # Generate text
    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,  # Lower this if out-of-memory occurs
            temperature=temperature,
            attention_mask=inputs.attention_mask,
            do_sample=True,
            top_k=40,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)
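DeepSeek-R1 distill checkpoints are chat models, so wrapping the raw problem in the tokenizer's chat template before calling generate_response usually produces cleaner reasoning traces than feeding the bare question. A usage sketch, assuming the tokenizer ships a chat template (the distill checkpoints do):

# Optional: format a problem with the model's chat template before generation
messages = [{"role": "user", "content": problems[0]}]
chat_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,               # return a string, not token IDs
    add_generation_prompt=True    # append the assistant turn marker
)
sample_output = generate_response(model, tokenizer, chat_prompt)
print(sample_output)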
# Generate Model Responses and Print Each Response
for i, problem in enumerate(problems):
    print(f"\n🟢 Generating response for Problem {i+1}/{len(problems)}")
    print(f"\n🔹 **Problem {i+1}:**\n{problem}")
    model_output = generate_response(model, tokenizer, problem)
    print(f"\n🤖 **Model Output:**\n{model_output}")
    print("\n" + "-"*80)