"""Fine-tune distilgpt2 on a prompt/response CSV with the HF Trainer.

Reads ``processed_dataset.csv`` (columns: ``prompt``, ``response``),
formats each row in OpenAssistant style, trains for 3 epochs, and saves
the model + tokenizer to a timestamped directory.
"""

import os
from datetime import datetime

import pandas as pd
import torch
from dotenv import load_dotenv
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# ✅ Load environment variables (HF_TOKEN for Hub authentication)
load_dotenv()
hf_token = os.getenv("HF_TOKEN")

# ✅ Load CSV dataset
print("📂 Loading processed_dataset.csv...")
data = pd.read_csv("processed_dataset.csv")

# ✅ Check for required columns
if "prompt" not in data.columns or "response" not in data.columns:
    raise ValueError("CSV must contain 'prompt' and 'response' columns.")

# ✅ Format each row into OpenAssistant-style input text
data["input_text"] = data.apply(
    lambda row: f"<|prompter|> {row['prompt']} <|endoftext|><|assistant|> {row['response']}",
    axis=1
)


# ✅ PyTorch Dataset class
class ChatDataset(Dataset):
    """Tokenizes each formatted row to a fixed length for causal-LM training.

    Labels mirror ``input_ids`` except at padding positions, which are set
    to -100 so the loss ignores them. This matters here because the pad
    token is the EOS token: without masking, the model would be trained to
    emit padding/EOS for most of every sequence.
    """

    def __init__(self, df, tokenizer, max_length=768):
        self.tokenizer = tokenizer
        self.inputs = df["input_text"].tolist()
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        text = self.inputs[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        # Fix: mask out padding in the labels (-100 is the ignore index of
        # the causal-LM cross-entropy loss) and return attention_mask so
        # the model does not attend to padding tokens.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


# ✅ Load tokenizer and model
print("📦 Loading model: distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", token=hf_token)
model = AutoModelForCausalLM.from_pretrained("distilgpt2", token=hf_token)

# ✅ Fix pad token issue: GPT-2 ships with no pad token, so reuse EOS.
# No new tokens are added, so the resize below is a safety no-op.
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# ✅ Prepare dataset (tokenization happens lazily in __getitem__, after
# the pad token has been configured above)
train_dataset = ChatDataset(data, tokenizer)
print(f"✅ Loaded {len(train_dataset)} training samples.")

# ✅ Training configuration
training_args = TrainingArguments(
    output_dir="model_output",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="logs",
    save_total_limit=2,
    report_to="none",
)

# ✅ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# ✅ Train the model
print("🚀 Starting training...")
trainer.train()

# ✅ Save the model and tokenizer to a timestamped directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"trained_model_{timestamp}"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Training complete. Model saved to {output_dir}")