In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import pandas as pd

# Convert dataset to Pandas DataFrame
df = dataset["train"].to_pandas()

# Define features and labels
X_text = df["category"] + " " + df["objective"]  # Combine text columns
y = df["label"]

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)  # Keep top 5000 words
X_tfidf = vectorizer.fit_transform(X_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

# Convert back to Pandas DataFrame
resampled_df = pd.DataFrame(X_resampled.toarray(), columns=vectorizer.get_feature_names_out())
resampled_df["label"] = y_resampled

# Convert back to Hugging Face Dataset
resampled_dataset = Dataset.from_pandas(resampled_df)
dataset["train"] = resampled_dataset

# Check new label distribution
print("New Label Distribution:", np.unique(y_resampled, return_counts=True))


New Label Distribution: (array([0, 1]), array([9558, 9558]))


In [5]:
from IPython.display import display
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer
)
from peft import get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

dataset = load_dataset("csv", data_files="/content/mobile_marketing_dataset .csv")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", add_prefix_space=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def assign_labels(example):
    example["label"] = 1 if example["revenue"] > 500 else 0
    return example

dataset = dataset.map(assign_labels)

def tokenize_function(examples):
    texts = [examples["category"], examples["objective"]]
    combined_text = " [SEP] ".join(texts)

    tokenized_inputs = tokenizer(
        combined_text,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokenized_inputs["labels"] = examples["label"]
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function)

print("Label Distribution:", np.unique(np.array(tokenized_dataset["train"]["labels"]), return_counts=True))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}



id2label = {0: "Low Revenue", 1: "High Revenue"}
label2id = {"Low Revenue": 0, "High Revenue": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=["q_lin", "v_lin"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

batch_size = 16

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=4,
    num_train_epochs=10,
    learning_rate=1e-3
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Label Distribution: (array([0, 1]), array([ 442, 9558]))


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 665,858 || all params: 67,620,868 || trainable%: 0.9847


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mreddysekhar[0m ([33mreddysekhar-lancers-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.19,0.181103,{'accuracy': 0.9558}
2,0.1973,0.180947,{'accuracy': 0.9558}
3,0.1758,0.181908,{'accuracy': 0.9558}
4,0.189,0.190711,{'accuracy': 0.9558}
5,0.1864,0.188646,{'accuracy': 0.9558}
6,0.19,0.183267,{'accuracy': 0.9558}
7,0.18,0.184369,{'accuracy': 0.9558}
8,0.1863,0.18429,{'accuracy': 0.9558}
9,0.1806,0.180573,{'accuracy': 0.9558}
10,0.1765,0.181132,{'accuracy': 0.9558}


Trainer is attempting to log a value of "{'accuracy': 0.9558}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9558}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9558}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9558}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9558}" of type <class 'dict'> for key "eval/accuracy" as a scalar. T

TrainOutput(global_step=6250, training_loss=0.18548879638671875, metrics={'train_runtime': 4952.1965, 'train_samples_per_second': 20.193, 'train_steps_per_second': 1.262, 'total_flos': 1.34512914432e+16, 'train_loss': 0.18548879638671875, 'epoch': 10.0})

In [12]:
model.to("cpu")

# Define a list of example texts for testing
text_list = [
    "It was good.",
    "Not a fan, don't recommend.",
    "Better than the first one.",
    "This is not worth watching even once.",
    "This one is a pass."
]

print("Trained model predictions:")
print("--------------------------")

for text in text_list:
    inputs = tokenizer(text, return_tensors="pt").to("cpu")  # Ensure correct input format

    with torch.no_grad():  # Disable gradient tracking for inference
        logits = model(**inputs).logits
        predictions = logits.argmax(dim=1)  # Get the predicted label index

    print(f"{text} - {id2label[predictions.item()]}")  # Convert tensor to int and map to label

from huggingface_hub import notebook_login
notebook_login()

Trained model predictions:
--------------------------
It was good. - High Revenue
Not a fan, don't recommend. - High Revenue
Better than the first one. - High Revenue
This is not worth watching even once. - High Revenue
This one is a pass. - High Revenue


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
hf_name = 'Reddy-Sekhar'
model_checkpoint = 'distilbert-base-uncased'
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification"
# %%
model.push_to_hub(model_id)
# %%
trainer.push_to_hub(model_id)
# %%
config = peft_config.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


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

In [25]:
model.push_to_hub(model_id)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Reddy-Sekhar/distilbert-base-uncased-lora-text-classification/commit/9b80a1c9ae7a38af572ad006019fbfd3efdd1078', commit_message='Upload model', commit_description='', oid='9b80a1c9ae7a38af572ad006019fbfd3efdd1078', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Reddy-Sekhar/distilbert-base-uncased-lora-text-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='Reddy-Sekhar/distilbert-base-uncased-lora-text-classification'), pr_revision=None, pr_num=None)

In [40]:
from huggingface_hub import HfApi, Repository
import os
import shutil

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…