# my-local-chatbot/train.py
import os
import pandas as pd
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from datetime import datetime
# ✅ Load environment variables
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
# ✅ Load CSV dataset
print("📂 Loading processed_dataset.csv...")
data = pd.read_csv("processed_dataset.csv")
# ✅ Check for required columns
if "prompt" not in data.columns or "response" not in data.columns:
    raise ValueError("CSV must contain 'prompt' and 'response' columns.")
# ✅ Format each row into OpenAssistant-style input text
data["input_text"] = data.apply(
lambda row: f"<|prompter|> {row['prompt']} <|endoftext|><|assistant|> {row['response']}", axis=1
)
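# Example of the resulting training text (hypothetical prompt/response, shown only for illustration):
#   <|prompter|> How do I reset my password? <|endoftext|><|assistant|> Go to Settings > Account and choose "Reset password".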
# ✅ PyTorch Dataset class
class ChatDataset(Dataset):
    """Wraps the formatted prompt/response strings for causal-LM fine-tuning."""

    def __init__(self, df, tokenizer, max_length=768):
        self.tokenizer = tokenizer
        self.inputs = df["input_text"].tolist()
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        text = self.inputs[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        # Use the input as its own label, but mark padding positions with -100
        # so the loss is only computed on real tokens.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
# ✅ Load tokenizer and model
print("📦 Loading model: distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", token=hf_token)
model = AutoModelForCausalLM.from_pretrained("distilgpt2", token=hf_token)
# ✅ Fix pad token issue: distilgpt2 has no pad token, so reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token
# Register the role markers used in input_text as special tokens so they are not
# split into sub-words, then resize the embeddings to cover the added tokens.
tokenizer.add_special_tokens({"additional_special_tokens": ["<|prompter|>", "<|assistant|>"]})
model.resize_token_embeddings(len(tokenizer))
# ✅ Prepare dataset
train_dataset = ChatDataset(data, tokenizer)
print(f"✅ Loaded {len(train_dataset)} training samples.")
# ✅ Training configuration
training_args = TrainingArguments(
    output_dir="model_output",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="logs",
    save_total_limit=2,
    report_to="none",
)
# ✅ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
# ✅ Train the model
print("🚀 Starting training...")
trainer.train()
# ✅ Save the model
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"trained_model_{timestamp}"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Training complete. Model saved to {output_dir}")