import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

print("📥 Creating small dataset for training...")

train_texts = [
    "Hello, my name is Ankit.",
    "I love programming in Python.",
    "Transformers library makes NLP easy.",
    "PyTorch is great for deep learning.",
    "I am learning to fine-tune GPT models."
]

test_texts = [
    "Hello, I am training a small GPT.",
    "Deep learning is fun!",
    "Python is my favorite programming language."
]

train_data = Dataset.from_dict({"text": train_texts})
test_data = Dataset.from_dict({"text": test_texts})

print("📝 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 has no pad token by default, so reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=32)

train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)

train_data.set_format('torch', columns=['input_ids', 'attention_mask'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask'])

print("🤖 Loading model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# mlm=False selects causal language modeling: the collator copies input_ids into labels
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

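# Optional sanity check (illustrative addition, not part of the training flow):
# the collator pads each batch and builds causal-LM labels from input_ids,
# setting padding positions to -100 so they are ignored by the loss.
# "sample_batch" is a hypothetical name used only for this check.
sample_batch = data_collator([train_data[0], train_data[1]])
print("Sample batch labels shape:", sample_batch["labels"].shape)
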
training_args = TrainingArguments(
    output_dir="./mini_gpt_safetensor",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",
    optim="adamw_torch",
    save_safetensors=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator
)

print("🏋️ Training model...")
trainer.train()

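# Optional (illustrative addition): the Trainer above receives an eval_dataset
# but never evaluates it; this reports the loss on the held-out texts.
eval_metrics = trainer.evaluate()
print(f"📊 Eval loss: {eval_metrics['eval_loss']:.4f}")
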
print("💾 Saving model in safetensors format...")
trainer.save_model("./mini_gpt_safetensor")

print("✅ Training complete and model saved!")
