# Install required packages first:
#   pip install torch transformers datasets accelerate safetensors

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

# -----------------------------
# 1️⃣ Create a small custom dataset
# -----------------------------
print("📥 Creating small dataset for training...")

train_texts = [
    "Hello, my name is Ankit.",
    "I love programming in Python.",
    "Transformers library makes NLP easy.",
    "PyTorch is great for deep learning.",
    "I am learning to fine-tune GPT models.",
]

test_texts = [
    "Hello, I am training a small GPT.",
    "Deep learning is fun!",
    "Python is my favorite programming language.",
]

# Convert the raw strings into Hugging Face Datasets
train_data = Dataset.from_dict({"text": train_texts})
test_data = Dataset.from_dict({"text": test_texts})

# -----------------------------
# 2️⃣ Load tokenizer
# -----------------------------
print("📝 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 style models ship without a pad token; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the datasets; labels are created later by the data collator.
def tokenize(batch):
    return tokenizer(
        batch["text"], truncation=True, padding="max_length", max_length=32
    )

train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)
train_data.set_format("torch", columns=["input_ids", "attention_mask"])
test_data.set_format("torch", columns=["input_ids", "attention_mask"])

# -----------------------------
# 3️⃣ Load model
# -----------------------------
print("🤖 Loading model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# -----------------------------
# 4️⃣ Data collator
# -----------------------------
# mlm=False -> causal LM: labels are a copy of input_ids, with pad
# positions set to -100 so the loss ignores them. Note that because
# pad_token == eos_token here, EOS positions are masked out as well.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# -----------------------------
# 5️⃣ Training arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./mini_gpt_safetensor",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=1,              # this toy run is only ~9 steps long, so log every step
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",             # the string "none" disables W&B/TensorBoard reporting
    optim="adamw_torch",
    save_safetensors=True,        # save checkpoints in safetensors format
)

# -----------------------------
# 6️⃣ Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
)

# -----------------------------
# 7️⃣ Train and evaluate model
# -----------------------------
print("🏋️ Training model...")
trainer.train()
print("📊 Evaluating on the test set...")
print(trainer.evaluate())

# -----------------------------
# 8️⃣ Save model in safetensors format
# -----------------------------
print("💾 Saving model in safetensors format...")
trainer.save_model("./mini_gpt_safetensor")
tokenizer.save_pretrained("./mini_gpt_safetensor")  # keep tokenizer files with the weights

print("✅ Training complete and model saved!")
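
# -----------------------------
# 9️⃣ (Optional) Verify the safetensors checkpoint
# -----------------------------
# A minimal sketch of how you could confirm the weights really landed in
# safetensors format. It assumes the file is named "model.safetensors",
# which is what recent versions of transformers write by default when
# save_safetensors=True; older versions may use a different layout.
from safetensors.torch import load_file

state_dict = load_file("./mini_gpt_safetensor/model.safetensors")
print(f"🔍 Checkpoint holds {len(state_dict)} tensors, e.g. {next(iter(state_dict))}")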
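
# -----------------------------
# 🔟 (Optional) Reload the model and generate a sample
# -----------------------------
# A quick sanity check, not part of the training loop above: reload the
# fine-tuned weights and generate a short continuation. Expect low-quality
# output, since the model only saw five training sentences.
model = AutoModelForCausalLM.from_pretrained("./mini_gpt_safetensor")
inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=20,
    pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad-token warning
)
print("📜", tokenizer.decode(outputs[0], skip_special_tokens=True))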