# Install required packages first:
# pip install torch transformers datasets accelerate safetensors

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

# -----------------------------
# 1️⃣ Create a small custom dataset
# -----------------------------
print("📥 Creating small dataset for training...")

train_texts = [
    "Hello, my name is Ankit.",
    "I love programming in Python.",
    "Transformers library makes NLP easy.",
    "PyTorch is great for deep learning.",
    "I am learning to fine-tune GPT models."
]

test_texts = [
    "Hello, I am training a small GPT.",
    "Deep learning is fun!",
    "Python is my favorite programming language."
]

# Convert to Hugging Face Dataset
train_data = Dataset.from_dict({"text": train_texts})
test_data = Dataset.from_dict({"text": test_texts})

# -----------------------------
# 2️⃣ Load tokenizer
# -----------------------------
print("📝 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, so reuse EOS for padding

# Tokenize dataset
def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=32)

train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)

train_data.set_format('torch', columns=['input_ids', 'attention_mask'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask'])

# -----------------------------
# 3️⃣ Load model
# -----------------------------
print("🤖 Loading model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# -----------------------------
# 4️⃣ Data collator
# -----------------------------
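# With mlm=False the collator builds causal-LM labels: labels are a copy of
# input_ids (the model shifts them internally) and pad positions are set to -100
# so they are ignored by the loss. Note that because pad_token == eos_token here,
# EOS positions are masked out of the loss as well.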
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# -----------------------------
# 5️⃣ Training arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./mini_gpt_safetensor",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is available
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",  # disable logging integrations (W&B, TensorBoard, ...)
    optim="adamw_torch",
    save_safetensors=True  # saves in safetensors format
)

# -----------------------------
# 6️⃣ Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator
)

# -----------------------------
# 7️⃣ Train model
# -----------------------------
print("🏋️ Training model...")
trainer.train()

# -----------------------------
# 8️⃣ Save model in safetensor format
# -----------------------------
print("💾 Saving model in safetensors format...")
trainer.save_model("./mini_gpt_safetensor")
tokenizer.save_pretrained("./mini_gpt_safetensor")  # save the tokenizer too, so the checkpoint loads standalone

print("✅ Training complete and model saved!")