{ "cells": [
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "883dad38-fa21-4372-9946-b11dec49e88c", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# PyTorch and TensorBoard\n",
"%pip install torch==2.6.0 tensorboard\n",
"\n",
"# Hugging Face stack\n",
"%pip install transformers==4.55.0 datasets==4.0.0 accelerate==1.10.0 evaluate trl==0.21.0 peft protobuf sentencepiece==0.2.0\n",
"\n",
"# flash-attn targets GPUs that support the BF16 data type and flash attention\n",
"# (such as NVIDIA L4 or NVIDIA A100); comment the next line out on other hardware.\n",
"%pip install flash-attn --no-build-isolation\n",
"%pip install mlflow tiktoken\n",
"dbutils.library.restartPython()"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "a25cb48a-6ce3-43d6-85ae-1c6b8c8e107c", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# EC2 instance type reference: https://aws.amazon.com/ec2/instance-types/"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9ba2c509-2e7f-4d54-8d24-a7f85ad25607", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"import torch \n",
"import transformers\n",
"import accelerate\n",
"import trl\n",
"import sentencepiece\n",
"import datasets \n",
"\n",
"# Print the installed version of each library, one line per library\n",
"for label, module in (\n",
"    (\"Torch\", torch),\n",
"    (\"Transformers\", transformers),\n",
"    (\"Accelerate\", accelerate),\n",
"    (\"TRL\", trl),\n",
"    (\"Sentencepiece\", sentencepiece),\n",
"    (\"Datasets\", datasets),\n",
"):\n",
"    print(f\"{label} version: \", module.__version__)"
] },
{
"cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "ffb21bb7-53e5-4512-bc2c-84c24786645a", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"from huggingface_hub import login\n",
"from datasets import load_dataset, Dataset, DatasetDict\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"from transformers import pipeline\n",
"from random import randint\n",
"import re\n",
"import tiktoken\n",
"from pathlib import Path\n",
"import json\n",
"from pyspark.sql import functions as F\n",
"from pyspark.sql.types import IntegerType\n",
"from sklearn.metrics import classification_report\n",
"from trl import SFTConfig\n",
"import torch \n",
"from trl import SFTTrainer\n"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "6c74f119-7de5-4166-9958-ca3375e26cde", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"\n",
"def count_tokens(text, model=\"gpt2\"):\n",
"    \"\"\"Count the number of tokens in a text.\n",
"\n",
"    Args:\n",
"        text: String to tokenize. ``None`` (for example a null Spark column\n",
"            value) is passed through as ``None`` instead of raising.\n",
"        model: Model name resolved through ``tiktoken.encoding_for_model``.\n",
"\n",
"    Returns:\n",
"        The token count as an int, or ``None`` when ``text`` is ``None``.\n",
"    \"\"\"\n",
"    if text is None:\n",
"        return None\n",
"    encoding = tiktoken.encoding_for_model(model)\n",
"    # disallowed_special=() makes special-token markers that appear in free text\n",
"    # (e.g. '<|endoftext|>') count as ordinary text instead of raising ValueError\n",
"    return len(encoding.encode(text, disallowed_special=()))\n",
"\n",
"# Spark UDF wrapper around count_tokens (integer result column)\n",
"count_tokens_udf = F.udf(count_tokens, returnType=IntegerType())"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "563a43ec-ca3e-4af9-9056-95419ab9b15c", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"login()"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 },
"inputWidgets": {}, "nuid": "58936d51-3e88-46e3-846e-899c92a3b962", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# does not work with python version == 3.11.11. It works with Python version: 3.12.3\n",
"base_model = \"google/gemma-3-270m-it\"\n",
"\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"# flash_attention_2 is only requested when a CUDA device is present;\n",
"# the CPU fallback uses eager attention so the load does not fail there\n",
"attn_implementation = 'flash_attention_2' if device == 'cuda' else 'eager'\n",
"model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16,\n",
"                                             device_map=device, attn_implementation=attn_implementation)\n",
"\n",
"# training speedup\n",
"model.config.use_cache = False\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(base_model)\n"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "30590933-87fa-48dd-8eee-cb77936be4bd", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# # check the pad token id\n",
"# tokenizer.pad_token_id"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "bb409321-f8cf-492b-86f8-b60413bfe04d", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"print(f\"Device: {model.device}\")\n",
"print(f\"DType: {model.dtype}\")"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "f8b71fa7-2e11-405f-b7f8-f9a55c53ea5e", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# system prompt\n",
"system_prompt = \"\"\"You are a policy classifier for Vrbo listings, reviews, and host-traveler exchanges. Decide if a listing violates Vrbo’s Shared Space Policy. Vrbo does not allow rentals where guests share internal living areas (bedroom, bathroom, kitchen, living room, interior hallway) with the host or other unrelated guests.\n",
"\n",
"Allowed: Shared external spaces (yard, driveway, patio, pool, etc.).\n",
"\n",
"Requirements for a compliant listing\n",
"\t•\tPrivate, secure entrance (guest-controlled lock)\n",
"\t•\tPrivate bathroom\n",
"\t•\tNo shared internal living areas with host/other guests\n",
"\n",
"Classification Options\n",
"\t•\tyes = Explicit or strong implication of shared internal space.\n",
"\t•\tno = Private internal spaces, or only shared external spaces.\n",
"\t•\tclarification = Possible but unclear sharing, or contradictory info.\n",
"\n",
"Output Format\n",
"yes | no | clarification\"\"\"\n",
"\n",
"pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)\n",
"\n",
"\n",
"example_prompt = \"\"\"Welcome to the heart of the bay area. You'll find yourself conveniently located to downtown San Mateo and all the transportation. This is a shared walkway, 1 bed guest suite. The unit has 1 queen sized bed with ample pillows, , mini kitchen, fast wifi, 2 x 43\\\" TV, Netflix, coffee, fast wifi. Self-check in. Mini Kitchen includes: Refrigerator, cook-top, microwave, Keurig / Keurig pods, utensils, cookware. Bathroom includes, towels, blow dryer, iron / ironing board, hand soap, shampoo, conditioner, body wash. Living area has pull out queen sofa bed with mattress topper, blanket, pillows, and sheets.\"\"\"\n",
"\n",
"example = [{\"role\": \"system\", \"content\": system_prompt}\n",
"           , {\"role\": \"user\", \"content\": example_prompt}]\n",
"\n",
"prompt = pipe.tokenizer.apply_chat_template(example, tokenize=False, add_generation_prompt=True)\n",
"\n",
"output = pipe(example, max_new_tokens=500, disable_compile=True\n",
"              , truncation=True, return_full_text=False)\n",
"\n",
"output"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "5aa2b213-9cec-48ee-aa99-fd8b7fe99cad", "showTitle": true, "tableResultSettingsMap": {}, "title": "Data" } }, "outputs": [], "source": [
"SELECTED_COLUMNS = ['vrbo_property_id', 'text', 'decision', 'reasoning', 'taxonomy', 'label_excerpt', 'data_source']\n",
"PATH=\"s3://apiary-analytics-927134741764-us-east-1-mxt-ml/hyemam/shared_spaces/discovery/model_training/v3/listing_description_labels/\"\n",
"LAB_REVIEW_PATH =\"s3://apiary-analytics-927134741764-us-east-1-mxt-ml/hyemam/shared_spaces/discovery/model_training/v3/reviews_labelled/\"\n",
"MSG_INQUIRY_PATH=\"s3://apiary-analytics-927134741764-us-east-1-mxt-ml/hyemam/shared_spaces/discovery/model_training/v3/message_inquiries_labelled/\"\n",
"\n",
"\n",
"def load_labelled_source(path, data_source):\n",
"    \"\"\"Read one labelled parquet set and tag every row with its data source.\n",
"\n",
"    Args:\n",
"        path: Parquet location to read.\n",
"        data_source: Literal written into the `data_source` column.\n",
"\n",
"    Returns:\n",
"        Spark DataFrame restricted to SELECTED_COLUMNS.\n",
"    \"\"\"\n",
"    return (spark.read.parquet(path)\n",
"            .withColumn('data_source', F.lit(data_source))\n",
"            .select(*SELECTED_COLUMNS))\n",
"\n",
"\n",
"listing_description_labels = load_labelled_source(PATH, 'listing_description')\n",
"reviews_labelled = load_labelled_source(LAB_REVIEW_PATH, 'post_stay_reviews')\n",
"message_inquiries_labelled = load_labelled_source(MSG_INQUIRY_PATH, 'msg_inquiry')\n",
"\n",
"# `decision` is lower-cased and trimmed BEFORE the 'unsure' filter so that variants\n",
"# such as 'Unsure' or ' unsure ' are removed too.\n",
"# 'clarification' rows are kept; they receive label 0 below.\n",
"data = (listing_description_labels.union(reviews_labelled).union(message_inquiries_labelled)\n",
"        .withColumn(\"decision\", F.lower(F.trim('decision')))\n",
"        .filter(\"decision != 'unsure'\")\n",
"        .withColumn(\"num_token\", count_tokens_udf(F.col(\"text\")))\n",
"        .filter(~(F.col(\"num_token\") > 512))\n",
"        .withColumn(\"label\", F.when(F.col(\"decision\") == 'yes', 1).otherwise(0)))\n",
"\n",
"total_count = data.count()\n",
"\n",
"\n",
"data.groupBy('data_source').count()\\\n",
"    .withColumn(\"total_count\", F.lit(total_count))\\\n",
"    .withColumn(\"percentage\", F.round(F.col(\"count\") * 100/ F.col(\"total_count\"), 1)).display()"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "005dffa7-f209-40a5-87cc-182f10e6281d", "showTitle": false, "tableResultSettingsMap": { "0": { "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1755815873534}", "filterBlob": null, "queryPlanFiltersBlob": null, "tableResultIndex": 0 } }, "title": "" } }, "outputs": [], "source": [
"data.groupBy('decision', 'data_source').count().display()"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "005d63d3-c861-4089-b4cd-0943c986273c", "showTitle": true, "tableResultSettingsMap": {}, "title": "Train and test dataset" } }, "outputs": [], "source": [
"# train and test split\n",
"train_dataset, test_dataset = data.randomSplit([0.8, 0.2], seed=42)"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid":
"888dca21-a579-4733-890a-5bf97e93f5d0", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# training dataset\n",
"train_count = train_dataset.count()\n",
"print('Train label distribution:')\n",
"train_dataset.groupBy('label', 'decision').count()\\\n",
"    .withColumn(\"total_count\", F.lit(train_count))\\\n",
"    .withColumn(\"percentage\", F.round(F.col(\"count\") * 100/ F.col(\"total_count\"), 1))\\\n",
"    .display()\n",
"\n",
"\n",
"# test dataset\n",
"test_count = test_dataset.count()\n",
"print('Test label distribution:', '\\n')\n",
"test_dataset.groupBy('label', 'decision').count()\\\n",
"    .withColumn(\"total_count\", F.lit(test_count))\\\n",
"    .withColumn(\"percentage\", F.round(F.col(\"count\") * 100/ F.col(\"total_count\"), 1))\\\n",
"    .display()"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "5d4faa36-5e20-4516-847a-79ef0ae7e293", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"def prep_for_training(dataframe):\n",
"    \"\"\"Build the chat-style training columns from a labelled pandas DataFrame.\n",
"\n",
"    Args:\n",
"        dataframe: pandas DataFrame with `text` and `decision` columns.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the columns `user`, `assistant`, `system_prompt`\n",
"        and `token` (summed token count of the three text fields).\n",
"        The input frame is not modified.\n",
"    \"\"\"\n",
"    # rename() returns a new frame, so the caller's DataFrame stays untouched and\n",
"    # the column assignments below do not write into a slice of it\n",
"    prepared = dataframe[['text', 'decision']].rename(\n",
"        columns={'text': 'user', 'decision': 'assistant'})\n",
"    # this is done because the model takes only strings\n",
"    prepared['assistant'] = prepared['assistant'].astype(str)\n",
"    prepared['system_prompt'] = system_prompt\n",
"    # the system prompt is identical for every row: count its tokens once\n",
"    system_tokens = count_tokens(system_prompt)\n",
"    prepared['token'] = [count_tokens(user) + count_tokens(assistant) + system_tokens\n",
"                         for user, assistant in zip(prepared['user'], prepared['assistant'])]\n",
"\n",
"    return prepared\n",
"\n",
"def create_conversation(sample):\n",
"    \"\"\"Turn one prepared row into a system / user / assistant `messages` record.\"\"\"\n",
"    return {\"messages\": [{\"role\": \"system\", \"content\": sample['system_prompt']},\n",
"                         {\"role\": \"user\", \"content\": sample['user']},\n",
"                         {\"role\": \"assistant\", \"content\": sample['assistant']}]\n",
"            }\n",
"\n"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": {
"application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "5dde7ce9-e4c7-4f7b-8223-64fd713fed83", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# prep training dataset\n",
"train_pandas = prep_for_training(train_dataset.toPandas())\n",
"test_pandas = prep_for_training(test_dataset.toPandas())\n",
"\n",
"\n",
"dataset_dict = DatasetDict({\n",
"    \"train\": Dataset.from_pandas(train_pandas),\n",
"    \"test\": Dataset.from_pandas(test_pandas)\n",
"})\n",
"\n",
"\n",
"# Convert dataset to conversational format\n",
"data_dataset = dataset_dict.map(create_conversation, batched=False\n",
"                                ,remove_columns=dataset_dict['train'].column_names)\n",
"data_dataset"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "33ac7be7-0b37-4e35-a549-1cb20cfbe42c", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# example\n",
"data_dataset['train'][0]"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "159e6589-25e9-48d7-8a85-0ed728f244a6", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "739a37da-2fdd-48c0-8ae9-a636b238da2d", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"from peft import LoraConfig\n",
"\n",
"\n",
"peft_config = LoraConfig(\n",
"    r=16,\n",
"    lora_alpha=32,\n",
"    lora_dropout=0.05,\n",
"    bias=\"none\",\n",
"    target_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"o_proj\",\"gate_proj\",\"up_proj\",\"down_proj\"],  # adjust for your model\n",
"    task_type=\"CAUSAL_LM\"\n",
")"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "b23b0b67-50b8-4d32-aa98-bb841f194ff5", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"import os\n",
"# NOTE(review): the model is already loaded (possibly on the GPU) before this cell runs;\n",
"# confirm the allocator setting still takes effect here, otherwise set it before the model load.\n",
"os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\""
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9c1c597c-c70d-4dbd-9b2b-abdb57aa8368", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"# largest per-sample token count as estimated by count_tokens (tiktoken)\n",
"max(train_pandas['token'].max(), test_pandas['token'].max())"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "5d6e7b02-348b-4f68-befb-9fa2f5300a51", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "720c8115-ee7d-4f91-8e45-1419fd32a63c", "showTitle": true, "tableResultSettingsMap": {}, "title": "Fine Tune" } }, "outputs": [], "source": [
"\n",
"# Hub repository that receives the checkpoints and the final model\n",
"model_id = 'henokyemam/gemma-3-270m-it-sft-ssp-august21'\n",
"\n",
"# torch dtype of the loaded model; drives the mixed-precision flags below\n",
"torch_dtype = model.dtype\n",
"\n",
"\n",
"def chat_token_length(sample):\n",
"    \"\"\"Number of tokens `tokenizer` produces for one conversation after the chat template.\"\"\"\n",
"    return len(tokenizer.apply_chat_template(sample['messages'], tokenize=True))\n",
"\n",
"\n",
"# max sequence length: measured with the training tokenizer (chat template included)\n",
"# rather than the tiktoken estimate, and cast to a plain int so the config stays\n",
"# JSON-serialisable\n",
"max_length = int(max(chat_token_length(sample)\n",
"                     for split in data_dataset.values() for sample in split))\n",
"\n",
"# checkpoint directory\n",
"checkpoint_dir = '/Workspace/Users/hyemam@expediagroup.com/Trust_and_Safety/Shared_Spaces/LLMExperiments/TestingNewOpenSourceModels/Gemma-270M-it'\n",
"\n",
"\n",
"\n",
"args = SFTConfig(output_dir = checkpoint_dir  # directory where checkpoints are written\n",
"                 , hub_model_id = model_id  # Hub repository used when pushing\n",
"                 , max_length = max_length  # max sequence length in tokens\n",
"                 , packing = False  # do not group multiple samples into a single sequence\n",
"                 , num_train_epochs = 3\n",
"                 , per_device_train_batch_size=2  # batch size per device during training\n",
"                 , gradient_checkpointing=False  # gradient checkpointing disabled\n",
"                 , optim='adamw_torch_fused'\n",
"                 , logging_steps=1  # log every step\n",
"                 , save_strategy='epoch'  # save checkpoint every epoch\n",
"                 , eval_strategy='epoch'  # evaluate checkpoint every epoch\n",
"                 , learning_rate=5e-5\n",
"                 , fp16=torch_dtype == torch.float16\n",
"                 , bf16=torch_dtype == torch.bfloat16\n",
"                 , lr_scheduler_type = 'cosine'  # cosine learning rate schedule\n",
"                 , push_to_hub=True\n",
"                 , report_to='mlflow'\n",
"                 , dataset_kwargs = {'add_special_tokens': False  # template with special tokens\n",
"                                     , 'append_concat_token': True  # add EOS token as separator token between examples\n",
"                                     }\n",
"                 )"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "561fe3d0-fbfd-4936-b228-fd7d0a6b9f74", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"\n",
"\n",
"# create trainer object\n",
"trainer = SFTTrainer(model=model, args=args\n",
"                     , train_dataset=data_dataset['train']\n",
"                     , eval_dataset=data_dataset['test']\n",
"                     , processing_class=tokenizer\n",
"                     , peft_config=peft_config)"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "1f9fe80e-ba40-47d5-bc02-1adca300309f", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"\n",
"# start training, the model will be automatically saved to the hub and the output directory\n",
"trainer.train()"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "79985f91-df8a-4d39-92f5-3a791dc303bb", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [
"\n",
"# The target repository comes from `hub_model_id` in the SFTConfig above; the first\n",
"# positional argument of Trainer.push_to_hub is the commit message, not a repository id.\n",
"trainer.push_to_hub(commit_message='End of training', token=True)\n",
"# tokenizer.push_to_hub(model_id, token=True)"
] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "8143a69e-75b5-4b0a-8041-553262821a64", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [] }
],
"metadata": { "application/vnd.databricks.v1+notebook": { "computePreferences": null, "dashboards": [], "environmentMetadata": { "base_environment": "", "environment_version": "2" }, "inputWidgetPreferences": null, "language": "python", "notebookMetadata": { "experimentId": "3704726302351730", "pythonIndentUnit": 4 }, "notebookName": "fine_tuning_shared_space_01", "widgets": {} }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }