import os
import pandas as pd
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from datetime import datetime

# ✅ Load environment variables
load_dotenv()
hf_token = os.getenv("HF_TOKEN")

# ✅ Load CSV dataset
print("📂 Loading processed_dataset.csv...")
data = pd.read_csv("processed_dataset.csv")

# ✅ Check for required columns and drop incomplete rows
if "prompt" not in data.columns or "response" not in data.columns:
    raise ValueError("CSV must contain 'prompt' and 'response' columns.")
data = data.dropna(subset=["prompt", "response"])  # NaN rows would train on the literal string "nan"

# ✅ Format each row into OpenAssistant-style input text
# (a trailing <|endoftext|> closes the reply so the model learns where to stop)
data["input_text"] = data.apply(
    lambda row: f"<|prompter|> {row['prompt']} <|endoftext|><|assistant|> {row['response']} <|endoftext|>",
    axis=1,
)
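
# For reference, each formatted row looks like this (values illustrative):
#   <|prompter|> What is Python? <|endoftext|><|assistant|> Python is a language. <|endoftext|>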

# ✅ PyTorch Dataset class
class ChatDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=768):
        self.tokenizer = tokenizer
        self.inputs = df["input_text"].tolist()
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        text = self.inputs[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        # Mask padding positions so they don't contribute to the loss
        # (pad token == EOS here, so unmasked padding would be trained on)
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# ✅ Load tokenizer and model
print("📦 Loading model: distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", token=hf_token)
model = AutoModelForCausalLM.from_pretrained("distilgpt2", token=hf_token)

# ✅ Register the chat markers as special tokens and fix the missing pad token
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<|prompter|>", "<|assistant|>"]}
)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS
model.resize_token_embeddings(len(tokenizer))  # resize for the newly added tokens

# ✅ Prepare dataset
train_dataset = ChatDataset(data, tokenizer)
print(f"✅ Loaded {len(train_dataset)} training samples.")

# ✅ Training configuration
training_args = TrainingArguments(
    output_dir="model_output",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="logs",
    save_total_limit=2,
    report_to="none"
)

# ✅ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# ✅ Train the model
print("🚀 Starting training...")
trainer.train()

# ✅ Save the model
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"trained_model_{timestamp}"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Training complete. Model saved to {output_dir}")