"""Fine-tune t5-small for name extraction using LoRA adapters.

Pipeline: load a CSV dataset, tokenize text/label pairs, attach LoRA
adapters to the base model, train, save the adapter, then merge the
adapter back into the base model and save a standalone checkpoint.
"""

import csv  # noqa: F401  (unused here; kept because the file may grow)

import torch  # noqa: F401  (unused directly; transformers uses it)
from datasets import concatenate_datasets, load_dataset  # noqa: F401
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import (
    AutoModelForSeq2SeqLM,  # noqa: F401  (unused; model is loaded via T5 class)
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,  # noqa: F401  (unused; AutoTokenizer is used instead)
)

# NOTE(review): train and test point at the SAME file, so the "test" split is
# a copy of the training data — there is no held-out evaluation. Confirm this
# is intentional (the test split is in fact never used below).
file_dict = {
    "train": "name_dataset.csv",
    "test": "name_dataset.csv",
}

# Explicit column_names supply the headers; skiprows=1 drops the file's own
# header row so it is not ingested as a data example.
dataset = load_dataset(
    "csv",
    data_files=file_dict,
    delimiter=",",
    column_names=["text", "label"],
    skiprows=1,
)
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

model_id = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)


def tokenize_function(example):
    """Tokenize inputs and targets; target token ids become the 'labels' field.

    Args:
        example: a batch dict with "text" (model input) and "label" (target
            string) columns, as produced by the CSV loader above.

    Returns:
        The tokenized inputs with an added "labels" key holding the target
        input_ids.
    """
    model_inputs = tokenizer(example["text"], truncation=True)
    targets = tokenizer(example["label"], truncation=True)
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs


tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Drop the raw string columns; the collator only pads the tokenized fields.
tokenized_datasets = tokenized_datasets.remove_columns("text")
tokenized_datasets = tokenized_datasets.remove_columns("label")

model = T5ForConditionalGeneration.from_pretrained(model_id)

# LoRA: train low-rank adapters on the attention query/value projections
# only; the base model weights stay frozen.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# -100 is the index ignored by the loss, so label padding never contributes.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

output_dir = "lora-t5"
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,  # relatively high LR; only the small adapters train
    num_train_epochs=100,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    # report_to="tensorboard",
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)
# Disable the generation KV cache for the training run (commonly done to
# avoid warnings / interference with training-time features).
model.config.use_cache = False
trainer.train()

# Save only the (small) LoRA adapter weights plus the tokenizer.
peft_model_id = "name-peft"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

# Reload the clean base model, apply the trained adapter, and merge it in so
# the final checkpoint is a plain T5 model with no peft dependency.
base_model = T5ForConditionalGeneration.from_pretrained(model_id)
peft_model = PeftModel.from_pretrained(base_model, "name-peft")
peft_model = peft_model.merge_and_unload()
peft_model.save_pretrained("name-extraction")
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained("name-extraction")