|
import os
|
|
import pandas as pd
|
|
from dotenv import load_dotenv
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
|
|
import torch
|
|
from torch.utils.data import Dataset
|
|
from datetime import datetime
|
|
|
|
|
|
# --- Environment & data preparation -----------------------------------------
# Pull HF_TOKEN out of a local .env file; it authenticates model downloads.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")

print("📂 Loading processed_dataset.csv...")
data = pd.read_csv("processed_dataset.csv")

# The fine-tuning data must provide both sides of the conversation.
if "prompt" not in data.columns or "response" not in data.columns:
    raise ValueError("CSV must contain 'prompt' and 'response' columns.")

# Build one training string per row using OpenAssistant-style role tags.
# Vectorized pandas string concatenation replaces the original per-row
# `DataFrame.apply(lambda ...)` — same result (f-strings str()-coerce values,
# as does astype(str)) without a Python-level loop over every row.
data["input_text"] = (
    "<|prompter|> "
    + data["prompt"].astype(str)
    + " <|endoftext|><|assistant|> "
    + data["response"].astype(str)
)
class ChatDataset(Dataset):
    """Causal-LM fine-tuning dataset over pre-formatted chat strings.

    Each item tokenizes one ``input_text`` row to a fixed length and returns
    ``input_ids``, ``attention_mask``, and ``labels``. Labels mirror the
    input ids (the model learns the full prompt+response sequence), but
    padded positions are set to -100 so CrossEntropyLoss ignores them.
    """

    def __init__(self, df, tokenizer, max_length=768):
        # df must carry an "input_text" column with the formatted chat text.
        self.tokenizer = tokenizer
        self.inputs = df["input_text"].tolist()
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        text = self.inputs[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        # Bug fix: previously labels == input_ids everywhere, so loss was also
        # computed over padding. With pad_token set to eos_token (as this
        # script does), that trains the model to emit EOS after every sequence
        # length seen, skewing the loss. Mask pads with -100 and also return
        # the attention mask so the model can ignore padded positions.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
# --- Model & tokenizer -------------------------------------------------------
model_name = "distilgpt2"
print("📦 Loading model: distilgpt2")
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)

# GPT-2 tokenizers ship without a pad token; reuse EOS for padding, then
# resize the embedding table in case the tokenizer's vocabulary size changed.
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# Wrap the prepared DataFrame in the Dataset the Trainer will consume.
train_dataset = ChatDataset(data, tokenizer)
print(f"✅ Loaded {len(train_dataset)} training samples.")
# --- Trainer configuration ---------------------------------------------------
training_args = TrainingArguments(
    # Output & logging locations
    output_dir="model_output",
    logging_dir="logs",
    report_to="none",
    # Optimization schedule
    num_train_epochs=3,
    per_device_train_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    # Checkpointing / logging cadence
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
# --- Training & export -------------------------------------------------------
print("🚀 Starting training...")
trainer.train()

# Save model and tokenizer under a timestamped directory so successive runs
# never overwrite each other.
run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"trained_model_{run_stamp}"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"✅ Training complete. Model saved to {output_dir}")