import os
import time
from pathlib import Path

import modal
from modal import App, Image, Volume

VOL_MOUNT_PATH = Path("/vol")

# CUDA base-image tag. Currently unused: the image below is built from
# debian_slim, so these values are only kept for a possible CUDA-based build.
cuda_version = "12.4.0"
flavor = "devel"
_os = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{_os}"
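
# If the training image ever needs the full CUDA toolchain, the tag above could
# be plugged into an NVIDIA base image instead of debian_slim. A minimal sketch,
# not what this script currently does:
#
#   image = Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")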

image = Image.debian_slim(python_version="3.10")

image = image.pip_install(
    "accelerate",
    "transformers",
    "torch",
    "datasets",
    "tensorboard",
    "trl",
    "xformers",
    "bitsandbytes",
    "peft",
    "protobuf==3.20.*",
    "onnxruntime",
    "onnx",
    "setfit",
    "nltk",
    "firebase_admin",
    "openai",
    "evaluate",
    "sentencepiece",
    "pandas",
    "scikit-learn",
    "huggingface_hub",
)

app = App(name="finetune-run", image=image)
output_vol = Volume.from_name("finetune-volume", create_if_missing=True)
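# Note: the volume is mounted at VOL_MOUNT_PATH (/vol) inside run_finetune below.
# Anything written elsewhere (e.g. the default output_dir "./") stays on the
# container's ephemeral disk and is gone when the run ends.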


@app.function(gpu="any")
def gpu_function():
    """Quick sanity check that a GPU and CUDA-enabled torch are visible."""
    import subprocess

    import torch

    subprocess.run(["nvidia-smi"])
    print("Torch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("CUDA device count:", torch.cuda.device_count())


GPU_CONFIG = modal.gpu.A100(count=1, size="80GB")


@app.function(
    gpu=GPU_CONFIG,
    timeout=7200,
    volumes={VOL_MOUNT_PATH: output_vol},
    secrets=[modal.Secret.from_dotenv()],
)
def run_finetune(data):
    import subprocess

    import torch

    # Same GPU diagnostics as gpu_function above.
    subprocess.run(["nvidia-smi"])
    print("Torch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("CUDA device count:", torch.cuda.device_count())

    import pandas as pd

    # Persist the JSON payload sent from the local entrypoint, then load it.
    with open("./features_ms_deberta_v3.json", "w") as f:
        f.write(data)
    df = pd.read_json("./features_ms_deberta_v3.json", lines=False)

    # Replicate the small input file 50x (simple upsampling of the dataset).
    df = pd.concat([df] * 50, ignore_index=True)

    import random
    import re
    from collections import Counter
    from itertools import chain

    import numpy as np
    from datasets import Dataset
    from evaluate import load as load_metric
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.model_selection import train_test_split
    from transformers import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoTokenizer,
        DataCollatorWithPadding,
        DebertaV2ForSequenceClassification,
        DebertaV2Model,
        DebertaV2Tokenizer,
        EarlyStoppingCallback,
        EvalPrediction,
        Trainer,
        TrainingArguments,
        pipeline,
    )

    HF_ORGANIZATION = "rafaelsandroni"
    token = os.getenv("HF_TOKEN")  # provided through the dotenv secret on this function

    task = "zero-shot-classification"

    # Candidate base model, overridden by the smaller checkpoint on the next line.
    model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    model_name = "tasksource/deberta-small-long-nli"

    output_dir = "./"  # checkpoints land on local container disk, not the mounted volume

    torch.cuda.empty_cache()
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def create_input_sequence(sample):
        """Tokenize a premise/hypothesis pair and attach its label."""
        text = sample["premise"]
        hypothesis = sample["hypothesis"]
        label = sample["class"]

        encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding="max_length")
        encoded_sequence["labels"] = label
        # Keep the decoded text around for inspection/debugging.
        encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)

        return encoded_sequence
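
    # DataCollatorWithPadding is imported above but unused; padding="max_length"
    # pads every example to the model's full sequence length up front. A possible
    # alternative (an assumption, not what this script does) is to truncate only
    # and let a collator pad each batch dynamically:
    #
    #   data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    #   # ...then pass data_collator=data_collator to the Trainer below.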

    # Map the raw target onto the NLI label ids used by the base model:
    # 0 for rows marked "PASS", 2 otherwise (entailment vs. contradiction in the
    # usual NLI label order).
    df["class"] = df["target"].apply(lambda t: 0 if t == "PASS" else 2)

    print(df.head())
    print(df.shape)

    train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

    train_shuffle_df = train_data
    test_shuffle_df = test_data

    train = Dataset.from_pandas(train_shuffle_df)
    test = Dataset.from_pandas(test_shuffle_df)

    train_dataset = train.map(
        create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"]
    )
    test_dataset = test.map(
        create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"]
    )

    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(preds, axis=1)

        # Fraction of examples predicted as class 2 ("not PASS").
        ratio = np.mean(preds == 2)

        result = {}

        metric_f1 = load_metric("f1")
        metric_precision = load_metric("precision")
        metric_recall = load_metric("recall")
        metric_acc = load_metric("accuracy")

        result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
        result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average="macro")["precision"]
        result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average="macro")["recall"]
        result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"]
        result["ratio"] = ratio

        return result
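
    # Note: the four evaluate metrics are reloaded on every evaluation call.
    # Loading them once outside compute_metrics (or combining them with
    # evaluate.combine, if that API is available in the installed version)
    # would avoid the repeated setup cost.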

    training_args = TrainingArguments(
        output_dir=output_dir,
        logging_dir=output_dir + "/logs",
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=4,  # a non-zero warmup_steps takes precedence over warmup_ratio
        weight_decay=0.01,
        gradient_accumulation_steps=2,
        learning_rate=2e-05,
        warmup_ratio=0.06,
        label_smoothing_factor=0.1,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=10,
        eval_steps=10,
        logging_first_step=True,
        do_eval=True,
        hub_model_id="rafaelsandroni/ms-deberta-v2-xlarge-mnli-finetuned-pt",
        load_best_model_at_end=True,  # required for the EarlyStoppingCallback below
    )

    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
    )

    trainer.train()

    trainer.evaluate()

    # Push the fine-tuned model to the Hub with a timestamped commit message.
    t = time.strftime("%Y%m%d%H%M%S")
    v = 2
    commit = f"dev-v{v}-{t}"
    trainer.push_to_hub(commit, token=token)
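
    # Sketch of how the pushed checkpoint could be used afterwards (assumptions:
    # the repo id matches hub_model_id above, and the candidate label names are
    # only illustrative):
    #
    #   clf = pipeline(
    #       task,  # "zero-shot-classification", defined earlier
    #       model="rafaelsandroni/ms-deberta-v2-xlarge-mnli-finetuned-pt",
    #   )
    #   clf("some premise text", candidate_labels=["PASS", "NOT PASS"])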


@app.local_entrypoint()
def run():
    t0 = time.time()

    # Read the local feature file and send its contents to the remote GPU function.
    with open("./features_ms_deberta_v3.json") as f:
        data = f.read()
    run_finetune.remote(data)

    print("Full time spent:", time.time() - t0)