# 0. Install libraries (one-time) – CUDA 12.4 toolchain + FFmpeg
# system FFmpeg gives TorchCodec the libavutil.so.56 / 57 / 58 / 59 it probes for
!sudo apt-get update -y && sudo apt-get install -y ffmpeg   # FFmpeg ≥ 4.4

# PyTorch, TorchVision, TorchAudio **all** built for CUDA 12.4
!pip install -Uq torch==2.6.0+cu124 torchvision==0.21.0 torchaudio==2.6.0 \
    --index-url https://download.pytorch.org/whl/cu124   # same index for every wheel

# TorchCodec build that is ABI-matched to Torch 2.6
!pip install -Uq torchcodec==0.2.1+cu124 \
    --index-url https://download.pytorch.org/whl/cu124   # 0.2.x is the Torch 2.6-compatible line

# everything else unchanged
!pip install -Uq scikit-learn transformers huggingface_hub tensorboard pillow matplotlib

# 1. Login to Hugging Face Hub
from huggingface_hub import login
login()   # paste your token when prompted

# 2. Download & unpack HMDB-51
from huggingface_hub import hf_hub_download
import zipfile, pathlib

data_zip = hf_hub_download(
    repo_id="jili5044/hmdb51",   # public mirror of HMDB-51
    filename="hmdb51.zip",
    repo_type="dataset",
)
with zipfile.ZipFile(data_zip) as z:
    z.extractall(".")            # creates ./hmdb51/

dataset_root = pathlib.Path("hmdb51")

# 3. Build stratified TRAIN / VAL / TEST splits
from sklearn.model_selection import train_test_split

video_paths = list(dataset_root.glob("**/*.avi"))
labels      = [p.parent.name for p in video_paths]   # parent dir = class

# 70 % train, then split the remaining 30 % evenly into val / test
train_paths, temp_paths, y_train, y_temp = train_test_split(
    video_paths, labels, test_size=0.30, stratify=labels, random_state=42
)
val_paths, test_paths, y_val, y_test = train_test_split(
    temp_paths, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

class_names = sorted(set(labels))
label2id    = {lbl: i for i, lbl in enumerate(class_names)}
id2label    = {i: lbl for lbl, i in label2id.items()}

# 4. Dataset & DataLoader
import torch
from torch.utils.data import Dataset, DataLoader
from torchcodec.decoders import VideoDecoder
from torchcodec.samplers import clips_at_random_indices
import torchvision.transforms.v2 as T

class HMDBDataset(Dataset):
    """Returns (path, label) pairs; decoding happens in the collate function."""
    def __init__(self, paths):
        self.paths = paths

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path  = self.paths[idx]
        label = label2id[path.parent.name]
        return path, label

# video transforms
train_tf = T.Compose([
    T.RandomResizedCrop((256, 256)),
    T.RandomHorizontalFlip(),
])
eval_tf = T.CenterCrop((256, 256))   # pads with zeros if a frame is smaller than 256

# `collate` decodes one short clip per video, so the DataLoader workers do the decoding
def collate(samples, frames_per_clip, tf):
    vids, labels = [], []
    for path, lbl in samples:
        clip = clips_at_random_indices(
            VideoDecoder(str(path)),
            num_clips=1,
            num_frames_per_clip=frames_per_clip,
            num_indices_between_frames=3,
        ).data.squeeze(0)          # (T, C, H, W) — drop the leading clip dim
        vids.append(tf(clip))      # transforms keep the same shape
        labels.append(lbl)
    vids = torch.stack(vids, dim=0)   # (B, T, C, H, W)
    return vids, torch.tensor(labels)

batch_size, num_workers = 4, 8

# 5. Load model & processor
from transformers import (
    VJEPA2ForVideoClassification,
    VJEPA2VideoProcessor,
)

ckpt      = "facebook/vjepa2-vitl-fpc16-256-ssv2"
processor = VJEPA2VideoProcessor.from_pretrained(ckpt)
model     = VJEPA2ForVideoClassification.from_pretrained(
    ckpt,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,   # fresh classification head for the 51 HMDB classes
    torch_dtype=torch.float32,
).to("cuda")

# freeze backbone, train only the classifier head
for p in model.vjepa2.parameters():
    p.requires_grad = False
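# (Optional) Sanity-check the decode → transform → collate path on one video
# before wiring up the DataLoaders. A minimal sketch, not part of training;
# the expected shape in the comment assumes the fpc16 checkpoint (16 frames per clip).
_vids, _lbls = collate(
    [(train_paths[0], label2id[train_paths[0].parent.name])],
    model.config.frames_per_clip,
    train_tf,
)
print(_vids.shape, _lbls)   # e.g. torch.Size([1, 16, 3, 256, 256])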
# 6. DataLoaders (need model.config.frames_per_clip)
frames_per_clip = model.config.frames_per_clip

train_loader = DataLoader(
    HMDBDataset(train_paths), batch_size=batch_size, shuffle=True,
    collate_fn=lambda s: collate(s, frames_per_clip, train_tf),
    num_workers=num_workers, pin_memory=True,
)
val_loader = DataLoader(
    HMDBDataset(val_paths), batch_size=batch_size, shuffle=False,
    collate_fn=lambda s: collate(s, frames_per_clip, eval_tf),
    num_workers=num_workers, pin_memory=True,
)
test_loader = DataLoader(
    HMDBDataset(test_paths), batch_size=batch_size, shuffle=False,
    collate_fn=lambda s: collate(s, frames_per_clip, eval_tf),
    num_workers=num_workers, pin_memory=True,
)

# 7. Training & evaluation helpers
def accuracy(loader):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for vids, lbls in loader:
            inp  = processor(vids, return_tensors="pt").to(model.device)
            lbls = lbls.to(model.device)
            pred = model(**inp).logits.argmax(-1)
            correct += (pred == lbls).sum().item()
            total   += lbls.size(0)
    return correct / total

optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5
)
epochs, accum = 5, 4   # `accum` = gradient-accumulation steps

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/vjepa2_hmdb51")

# 8. Training loop — step the optimizer every `accum` batches
for epoch in range(1, epochs + 1):
    model.train()
    running = 0.0
    optimizer.zero_grad()
    for step, (vids, lbls) in enumerate(train_loader, 1):
        batch = processor(vids, return_tensors="pt").to(model.device)
        lbls  = lbls.to(model.device)
        loss  = model(**batch, labels=lbls).loss / accum   # scale for accumulation
        loss.backward()
        running += loss.item()
        if step % accum == 0:
            optimizer.step()
            optimizer.zero_grad()
            print(f"Epoch {epoch} Step {step}: loss {running:.4f}")
            writer.add_scalar("train/loss", running,
                              (epoch - 1) * len(train_loader) + step)   # global step
            running = 0.0
    val_acc = accuracy(val_loader)
    print(f"Epoch {epoch}: val_acc {val_acc:.4f}")
    writer.add_scalar("val/acc", val_acc, epoch)

test_acc = accuracy(test_loader)
print(f"Test accuracy: {test_acc:.4f}")
writer.add_scalar("test/acc", test_acc, epochs)

# 9. Push model, processor, and logs to the Hub
repo = "SujitShelar/vjepa2-vitl-fpc16-256-hmdb51"
model.push_to_hub(repo)
processor.push_to_hub(repo)

from huggingface_hub import upload_folder
writer.close()   # flush TensorBoard event files before uploading them
upload_folder(repo_id=repo, folder_path="runs", path_in_repo="runs")
print("upload complete")
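# (Optional) End-to-end check — a minimal sketch that assumes the push above
# succeeded: reload the fine-tuned checkpoint from the Hub and classify one
# held-out test video with the same collate / eval transforms used above.
ft_model = VJEPA2ForVideoClassification.from_pretrained(repo).to("cuda").eval()
ft_proc  = VJEPA2VideoProcessor.from_pretrained(repo)

vids, lbls = collate(
    [(test_paths[0], label2id[test_paths[0].parent.name])],
    frames_per_clip,
    eval_tf,
)
with torch.no_grad():
    logits = ft_model(**ft_proc(vids, return_tensors="pt").to("cuda")).logits
pred = ft_model.config.id2label[logits.argmax(-1).item()]
print(f"predicted: {pred} | actual: {test_paths[0].parent.name}")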