# Fine-tuning Sandbox

Code authored by: Shawhin Talebi <br>
Blog link: https://medium.com/towards-data-science/fine-tuning-large-language-models-llms-23473d763b91

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
# PEFT的全称是Parameter-Efficient Fine-Tuning，是transform开发的一个参数高效微调的库
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np




### dataset

In [2]:
# # how dataset was generated

# # load imdb data
# imdb_dataset = load_dataset("imdb")

# # define subsample size
# N = 1000 
# # generate indexes for random subsample
# rand_idx = np.random.randint(24999, size=N) 

# # extract train and test data
# x_train = imdb_dataset['train'][rand_idx]['text']
# y_train = imdb_dataset['train'][rand_idx]['label']

# x_test = imdb_dataset['test'][rand_idx]['text']
# y_test = imdb_dataset['test'][rand_idx]['label']

# # create new dataset
# dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
#                              'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

In [3]:
# 加载数据集 训练 验证 测试
dataset = load_dataset('shawhin/imdb-truncated')
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [4]:
# 得出训练集标签的平均值
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.5

### model

In [5]:
model_checkpoint = 'distilbert-base-uncased'

# 类别的映射关系
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# 加载预训练的权重 num_labels指明是二分类任务 model_checkpoint 预训练模型的名称
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# display architecture
model = model.cuda()

### 预处理数据

In [7]:
# 创建分词器
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# 判断是否有填充标记 通过 resize_token_embeddings 方法调整模型的 token embeddings，以包含新添加的 pad token。
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [8]:
# 创建分词器函数
def tokenize_function(examples):
    # 提取文本
    text = examples["text"]

    # 设置 tokenizer 的截断位置为左侧。这意味着如果文本超过指定的 max_length，则在左侧截断。这是为了确保重要的文本内容被保留下来。
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        # 返回numpy 类型
        return_tensors="np",
        # 是否进行文本截断
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [9]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [10]:
# 创建数据收集器
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### evaluation

In [11]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

Using the latest cached version of the module from C:\Users\Administrator\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Fri Mar 15 09:54:33 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


In [12]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # 计算预测结果和真实标签 返回准确率
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Apply untrained model to text

In [13]:
# define list of examples
text_list = ["I'm sorry.", "You areedespicable person", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # 将文本转化为可以理解的编码 并返回pytorch张量
    inputs = tokenizer.encode(text, return_tensors="pt")
    # 计算对数
    logits = model(inputs.cuda()).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
I'm sorry. - Negative
You areedespicable person - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


### Train model

In [14]:
peft_config = LoraConfig(task_type="SEQ_CLS", # 序列分类任务
                        r = 4, # 递归深度
                        lora_alpha = 32, # alpha 值表示 LORA 模块的影响更大。
                        lora_dropout = 0.01,
                        target_modules = ['q_lin'])

In [15]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False)

In [16]:
# 对模型进行配置
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [17]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [18]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01, #  权重衰减，一种正则化技术，用于控制模型参数的大小。
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True, # 是否在训练结束加载最佳模型
)

### 

In [19]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics, 
)

# train model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33m1321416285[0m ([33mxuuuu[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.438809,{'accuracy': 0.855}
2,0.427600,0.648398,{'accuracy': 0.859}
3,0.427600,0.637398,{'accuracy': 0.877}
4,0.218100,0.689158,{'accuracy': 0.889}
5,0.218100,0.774748,{'accuracy': 0.897}
6,0.073100,0.846054,{'accuracy': 0.887}
7,0.073100,0.9461,{'accuracy': 0.894}
8,0.015500,0.941895,{'accuracy': 0.901}
9,0.015500,0.994161,{'accuracy': 0.898}
10,0.006700,0.999837,{'accuracy': 0.897}


Trainer is attempting to log a value of "{'accuracy': 0.855}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-250 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Trainer is attempting to log a value of "{'accuracy': 0.859}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Trainer is attempting to log a value of "{'accuracy': 0.877}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dr

TrainOutput(global_step=2500, training_loss=0.14819346437454223, metrics={'train_runtime': 174.6372, 'train_samples_per_second': 57.262, 'train_steps_per_second': 14.315, 'total_flos': 1112883852759936.0, 'train_loss': 0.14819346437454223, 'epoch': 10.0})

### Generate prediction

In [20]:
model.to('cuda') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
I'm sorry. - Negative
You areedespicable person - Positive
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative


### Optional: push model to hub

In [21]:
# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login() # ensure token gives write access

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
hf_name = 'shawhin' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want

In [23]:
model.push_to_hub(model_id) # save model

HfHubHTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/shawhin/distilbert-base-uncased-lora-text-classification.git/info/lfs/objects/batch (Request ID: Root=1-65f44b6d-3a7059390bd0f46b3618a6e6;b93e4a6f-c6a2-4179-8d62-ec4b3235048e)

Authorization error.

In [None]:
trainer.push_to_hub(model_id) # save trainer

### Optional: load peft model

In [None]:
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)