# 0. Install libraries (one-time) – CUDA 12.4 toolchain + FFmpeg
# system FFmpeg gives TorchCodec the libavutil.so.56 / 57 / 58 / 59 it probes for
!sudo apt-get update -y && sudo apt-get install -y ffmpeg # FFmpeg ≥ 4.4
# PyTorch, TorchVision, TorchAudio **all** built for CUDA 12.4
!pip install -Uq torch==2.6.0+cu124 torchvision==0.21.0 torchaudio==2.6.0 \
    --index-url https://download.pytorch.org/whl/cu124  # same index URL for every wheel
# TorchCodec build that is ABI-matched to Torch 2.6
!pip install -Uq torchcodec==0.2.1+cu124 \
    --index-url https://download.pytorch.org/whl/cu124  # torchcodec 0.2.x pairs with Torch 2.6
# everything else unchanged
!pip install -Uq scikit-learn transformers huggingface_hub tensorboard pillow matplotlib
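# Optional sanity check (an added sketch): confirm the CUDA 12.4 wheels are
# the ones actually imported before training starts.
# import torch, torchcodec
# print(torch.__version__, torchcodec.__version__, torch.cuda.is_available())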
# 1. Login to Hugging Face Hub
from huggingface_hub import login
login() # paste your token when prompted
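# Non-interactive alternative (an added sketch, assuming an HF_TOKEN env var):
# import os
# login(token=os.environ["HF_TOKEN"])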
# 2. Download & unpack HMDB-51
from huggingface_hub import hf_hub_download
import zipfile, pathlib, random, os
data_zip = hf_hub_download(
    repo_id="jili5044/hmdb51",   # public mirror of HMDB-51
    filename="hmdb51.zip",
    repo_type="dataset"
)
with zipfile.ZipFile(data_zip) as z:
    z.extractall(".")            # creates ./hmdb51/
dataset_root = pathlib.Path("hmdb51")
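# Quick check on the unpacked tree (an added sketch): HMDB-51 has 51 action
# classes, one directory of .avi clips per class.
n_classes = sum(1 for d in dataset_root.iterdir() if d.is_dir())
n_videos = sum(1 for _ in dataset_root.glob("**/*.avi"))
print(f"{n_classes} classes, {n_videos} videos")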
# 3. Build stratified TRAIN / VAL / TEST splits
from sklearn.model_selection import train_test_split
video_paths = list(dataset_root.glob("**/*.avi"))
labels = [p.parent.name for p in video_paths] # parent dir = class
train_paths, temp_paths, y_train, y_temp = train_test_split(
    video_paths, labels, test_size=0.30,
    stratify=labels, random_state=42
)
val_paths, test_paths, y_val, y_test = train_test_split(
    temp_paths, y_temp, test_size=0.50,
    stratify=y_temp, random_state=42
)
class_names = sorted(set(labels))
label2id = {lbl: i for i, lbl in enumerate(class_names)}
id2label = {i: lbl for lbl, i in label2id.items()}
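# Split sanity check (an added sketch): stratification should keep every
# class present in each split, at roughly 70/15/15 proportions.
from collections import Counter
for name, ys in (("train", y_train), ("val", y_val), ("test", y_test)):
    print(f"{name}: {len(ys)} clips across {len(Counter(ys))} classes")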
# 4. Dataset & DataLoader
import torch
from torch.utils.data import Dataset, DataLoader
from torchcodec.decoders import VideoDecoder
from torchcodec.samplers import clips_at_random_indices
import torchvision.transforms.v2 as T
class HMDBDataset(Dataset):
    def __init__(self, paths): self.paths = paths
    def __len__(self): return len(self.paths)
    def __getitem__(self, idx):
        path = self.paths[idx]
        label = label2id[path.parent.name]
        return path, label
# video transforms
train_tf = T.Compose([
    T.RandomResizedCrop((256, 256)),
    T.RandomHorizontalFlip(),
])
eval_tf = T.CenterCrop((256, 256))
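# The v2 transforms operate on whole (T, C, H, W) clips, resizing/cropping
# the last two dims; a quick added check on a dummy clip:
# dummy = torch.zeros(16, 3, 240, 320, dtype=torch.uint8)
# print(train_tf(dummy).shape)  # torch.Size([16, 3, 256, 256])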
# `collate` function
def collate(samples, frames_per_clip, tf):
    vids, labels = [], []
    for path, lbl in samples:
        clip = clips_at_random_indices(
            VideoDecoder(str(path)),
            num_clips=1,
            num_frames_per_clip=frames_per_clip,
            num_indices_between_frames=3,
        ).data.squeeze(0)        # (T, C, H, W): drop the leading num_clips=1 dim
        clip = tf(clip)          # apply transforms (shape preserved)
        vids.append(clip)
        labels.append(lbl)
    vids = torch.stack(vids, dim=0)  # (B, T, C, H, W), 5 dims
    return vids, torch.tensor(labels)
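# Smoke test for the collate path (an added sketch; 16 matches the fpc16
# checkpoint below, but step 6 reads the real frames_per_clip from the config):
# vids, lbls = collate([HMDBDataset(train_paths)[0]], 16, train_tf)
# print(vids.shape, lbls.shape)  # torch.Size([1, 16, 3, 256, 256]) torch.Size([1])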
batch_size, num_workers = 4, 8
# 5. Load model & processor
from transformers import (
    VJEPA2ForVideoClassification,
    VJEPA2VideoProcessor
)
ckpt = "facebook/vjepa2-vitl-fpc16-256-ssv2"
processor = VJEPA2VideoProcessor.from_pretrained(ckpt)
model = VJEPA2ForVideoClassification.from_pretrained(
    ckpt,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,   # re-initialises the classifier head for the 51 HMDB classes
    torch_dtype=torch.float32
).to("cuda")
# freeze backbone, train only classifier
for p in model.vjepa2.parameters():
    p.requires_grad = False
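# Added sanity check: after freezing, only the classifier head should remain
# trainable.
n_train = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {n_train:,} / {n_total:,}")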
# 6. DataLoaders (needs model.config.frames_per_clip)
frames_per_clip = model.config.frames_per_clip
train_loader = DataLoader(
    HMDBDataset(train_paths), batch_size=batch_size, shuffle=True,
    collate_fn=lambda s: collate(s, frames_per_clip, train_tf),
    num_workers=num_workers, pin_memory=True
)
val_loader = DataLoader(
    HMDBDataset(val_paths), batch_size=batch_size, shuffle=False,
    collate_fn=lambda s: collate(s, frames_per_clip, eval_tf),
    num_workers=num_workers, pin_memory=True
)
test_loader = DataLoader(
    HMDBDataset(test_paths), batch_size=batch_size, shuffle=False,
    collate_fn=lambda s: collate(s, frames_per_clip, eval_tf),
    num_workers=num_workers, pin_memory=True
)
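# Optional added check: pull one batch to confirm tensor shapes before the
# full training run.
# vids, lbls = next(iter(train_loader))
# print(vids.shape, lbls.shape)  # (4, frames_per_clip, 3, 256, 256), (4,)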
# 7. Training & evaluation helpers
def accuracy(loader):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for vids, lbls in loader:
            inp = processor(vids, return_tensors="pt").to(model.device)
            lbls = lbls.to(model.device)
            pred = model(**inp).logits.argmax(-1)
            correct += (pred == lbls).sum().item()
            total += lbls.size(0)
    return correct / total
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5
)
epochs, accum = 5, 4
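# With batch_size=4 and accum=4, each optimizer step aggregates gradients
# from an effective batch of 16 clips.
# Optional extension (an added sketch, not part of the original recipe):
# cosine LR decay over the expected number of optimizer steps.
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
#     optimizer, T_max=epochs * len(train_loader) // accum)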
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/vjepa2_hmdb51")
# 8. Training loop
for epoch in range(1, epochs + 1):
    model.train(); running = 0
    optimizer.zero_grad()
    for step, (vids, lbls) in enumerate(train_loader, 1):
        batch = processor(vids, return_tensors="pt").to(model.device)
        lbls = lbls.to(model.device)
        loss = model(**batch, labels=lbls).loss / accum
        loss.backward(); running += loss.item()
        if step % accum == 0:
            optimizer.step(); optimizer.zero_grad()
            print(f"Epoch {epoch} Step {step}: loss {running:.4f}")
            writer.add_scalar("train/loss", running,
                              (epoch - 1) * len(train_loader) + step)
            running = 0
    # step once more if the epoch ended mid-accumulation window, so those
    # gradients are not silently discarded by the next zero_grad()
    if step % accum != 0:
        optimizer.step(); optimizer.zero_grad()
    val_acc = accuracy(val_loader)
    print(f"Epoch {epoch}: val_acc {val_acc:.4f}")
    writer.add_scalar("val/acc", val_acc, epoch)
test_acc = accuracy(test_loader)
print(f"Test accuracy: {test_acc:.4f}")
writer.add_scalar("test/acc", test_acc, epochs)
# 9. Push model, processor, and logs to the Hub
repo = "SujitShelar/vjepa2-vitl-fpc16-256-hmdb51"
model.push_to_hub(repo)
processor.push_to_hub(repo)
from huggingface_hub import upload_folder
upload_folder(repo_id=repo, folder_path="runs", path_in_repo="runs")
writer.close()
print("upload complete")