# Install required packages first:
# pip install torch transformers datasets accelerate safetensors
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
# -----------------------------
# 1️⃣ Create a small custom dataset
# -----------------------------
print("📥 Creating small dataset for training...")
train_texts = [
    "Hello, my name is Ankit.",
    "I love programming in Python.",
    "Transformers library makes NLP easy.",
    "PyTorch is great for deep learning.",
    "I am learning to fine-tune GPT models."
]
test_texts = [
    "Hello, I am training a small GPT.",
    "Deep learning is fun!",
    "Python is my favorite programming language."
]
# Convert to Hugging Face Dataset
train_data = Dataset.from_dict({"text": train_texts})
test_data = Dataset.from_dict({"text": test_texts})
# -----------------------------
# 2️⃣ Load tokenizer
# -----------------------------
print("📝 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default; reuse EOS for padding
# Tokenize dataset
def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=32)
train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)
train_data.set_format('torch', columns=['input_ids', 'attention_mask'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask'])
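# (Optional) Quick sanity check of the tokenized data; this print is illustrative
# and was not part of the original script. Each example is padded/truncated to 32 tokens.
print("Sample input_ids shape:", train_data[0]["input_ids"].shape)  # torch.Size([32])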
# -----------------------------
# 3️⃣ Load model
# -----------------------------
print("🤖 Loading model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# -----------------------------
# 4️⃣ Data collator
# -----------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # causal language modeling, not masked LM
)
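# (Optional) Illustrative sanity check, not in the original script: with mlm=False the
# collator builds causal-LM labels by copying input_ids (padding positions are set to
# -100 so they are ignored by the loss).
sample_batch = data_collator([train_data[i] for i in range(2)])
print("Collated batch keys:", list(sample_batch.keys()))  # input_ids, attention_mask, labels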
# -----------------------------
# 5️⃣ Training arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./mini_gpt_safetensor",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # use mixed precision only when a GPU is available
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",  # disable external logging integrations (e.g. wandb)
    optim="adamw_torch",
    save_safetensors=True  # save checkpoints in safetensors format
)
# -----------------------------
# 6️⃣ Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator
)
# -----------------------------
# 7️⃣ Train model
# -----------------------------
print("🏋️ Training model...")
trainer.train()
# -----------------------------
# 8️⃣ Save model in safetensor format
# -----------------------------
print("💾 Saving model in safetensors format...")
trainer.save_model("./mini_gpt_safetensor")
print("✅ Training complete and model saved!")