File size: 6,368 Bytes
06f2786 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
# 0. Install libraries (one-time) – CUDA 12.4 toolchain + FFmpeg
# System FFmpeg provides the libavutil.so.56 / 57 / 58 / 59 shared libraries that TorchCodec probes for.
!sudo apt-get update -y && sudo apt-get install -y ffmpeg # FFmpeg ≥ 4.4
# PyTorch, TorchVision, TorchAudio **all** built for CUDA 12.4
!pip install -Uq torch==2.6.0+cu124 torchvision==0.21.0 torchaudio==2.6.0 \
--index-url https://download.pytorch.org/whl/cu124 # same extra-index for every wheel
# TorchCodec build that is ABI-matched to Torch 2.6 (installed from the same cu124 wheel index)
!pip install -Uq torchcodec==0.2.1+cu124 \
--index-url https://download.pytorch.org/whl/cu124 # 0.2 Torch 2.6 compat
# Remaining dependencies; no custom index is given here, so pip's default index is used.
!pip install -Uq scikit-learn transformers huggingface_hub tensorboard pillow matplotlib
# 1. Login to Hugging Face Hub
from huggingface_hub import login
# Interactive authentication with the Hugging Face Hub; the push calls in
# step 9 presumably rely on this credential (NOTE(review): confirm).
login() # paste your token when prompted
# 2. Download & unpack HMDB-51
from huggingface_hub import hf_hub_download
import zipfile, pathlib, random, os
# Fetch the HMDB-51 archive from the Hub dataset repo (a public mirror).
data_zip = hf_hub_download(
    repo_id="jili5044/hmdb51",
    filename="hmdb51.zip",
    repo_type="dataset",
)

# Directory the archive is expected to unpack into (./hmdb51/).
dataset_root = pathlib.Path("hmdb51")

# Unpack into the current working directory.
with zipfile.ZipFile(data_zip) as archive:
    archive.extractall(".")
# 3. Build stratified TRAIN / VAL / TEST splits
from sklearn.model_selection import train_test_split
# Collect every .avi file under the dataset root. Path.glob yields entries in
# filesystem order, which is arbitrary, so the list is sorted: without a
# stable input order, random_state=42 alone does not make the splits
# reproducible across machines.
video_paths = sorted(dataset_root.glob("**/*.avi"))
if not video_paths:
    # Fail early with an actionable message instead of an opaque error from
    # train_test_split on an empty input.
    raise ValueError(
        f"no .avi files found under {dataset_root}; check the archive layout"
    )

# The class name of each video is the name of its parent directory.
labels = [p.parent.name for p in video_paths]

# 70 % train, 30 % held out; stratified so class proportions are preserved.
train_paths, temp_paths, y_train, y_temp = train_test_split(
    video_paths, labels, test_size=0.30,
    stratify=labels, random_state=42,
)
# Split the held-out 30 % in half: 15 % validation, 15 % test.
val_paths, test_paths, y_val, y_test = train_test_split(
    temp_paths, y_temp, test_size=0.50,
    stratify=y_temp, random_state=42,
)

# Bidirectional class-name <-> integer-id mappings, ids assigned alphabetically.
class_names = sorted(set(labels))
label2id = {lbl: i for i, lbl in enumerate(class_names)}
id2label = {i: lbl for lbl, i in label2id.items()}
# 4. Dataset & DataLoader
import torch
from torch.utils.data import Dataset, DataLoader
from torchcodec.decoders import VideoDecoder
from torchcodec.samplers import clips_at_random_indices
import torchvision.transforms.v2 as T
class HMDBDataset(Dataset):
    """Map-style dataset yielding ``(video_path, class_id)`` pairs.

    No video decoding happens here; each item is just the path and the integer
    id looked up from the name of the path's parent directory.

    Args:
        paths: Indexable collection of path objects exposing ``.parent.name``
            (e.g. ``pathlib.Path``).
        label_map: Optional mapping from class name to integer id. When
            omitted, the module-level ``label2id`` mapping is looked up at
            item-access time, which matches the previous behaviour.
    """

    def __init__(self, paths, label_map=None):
        self.paths = paths
        self.label_map = label_map

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        # Resolve the mapping lazily so the global fallback is only needed
        # when no explicit mapping was supplied.
        mapping = label2id if self.label_map is None else self.label_map
        return path, mapping[path.parent.name]
# Video transforms.
# Training: random resized crop to 256x256 followed by a random horizontal flip.
train_tf = T.Compose(
    [
        T.RandomResizedCrop(size=(256, 256)),
        T.RandomHorizontalFlip(),
    ]
)
# Validation / test: center crop to 256x256 only.
eval_tf = T.CenterCrop(size=(256, 256))
# `collate` function
def collate(samples, frames_per_clip, tf):
    """Decode one randomly placed clip per sample and batch the results.

    Args:
        samples: Iterable of ``(path, label)`` pairs as produced by the dataset.
        frames_per_clip: Number of frames to sample for each clip.
        tf: Callable applied to each decoded clip before batching.

    Returns:
        A ``(videos, labels)`` tuple: the transformed clips stacked along a new
        leading dimension, and the labels as a tensor.

    Note:
        Clip positions are drawn with ``clips_at_random_indices`` on every
        call, including when this function is used for evaluation loaders.
    """
    clips, targets = [], []
    for video_path, target in samples:
        decoder = VideoDecoder(str(video_path))
        sampled = clips_at_random_indices(
            decoder,
            num_clips=1,
            num_frames_per_clip=frames_per_clip,
            num_indices_between_frames=3,
        )
        # Drop the leading num_clips == 1 dimension; per the original author's
        # note the result is (T, C, H, W).
        frames = sampled.data.squeeze(0)
        clips.append(tf(frames))
        targets.append(target)
    # Original author's note: stacked batch is (B, T, C, H, W).
    return torch.stack(clips, dim=0), torch.tensor(targets)
# Loader settings shared by the train / val / test DataLoaders.
batch_size = 4
num_workers = 8
# 5. Load model & processor
from transformers import (
VJEPA2ForVideoClassification,
VJEPA2VideoProcessor
)
# Checkpoint used for both the video processor and the classifier weights.
ckpt = "facebook/vjepa2-vitl-fpc16-256-ssv2"

processor = VJEPA2VideoProcessor.from_pretrained(ckpt)

# The label maps are replaced with the HMDB-51 ones; ignore_mismatched_sizes
# lets loading proceed when checkpoint tensor shapes differ from the new config.
model = VJEPA2ForVideoClassification.from_pretrained(
    ckpt,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
    torch_dtype=torch.float32,
)
model = model.to("cuda")

# Freeze the backbone so that only the remaining (classifier) parameters train.
for backbone_param in model.vjepa2.parameters():
    backbone_param.requires_grad = False
# 6. DataLoaders (needs model.config.frames_per_clip)
from functools import partial

frames_per_clip = model.config.frames_per_clip


def _make_loader(paths, tf, shuffle):
    """Build a DataLoader over ``paths`` whose batches are collated with ``tf``.

    ``functools.partial`` is used instead of a lambda for ``collate_fn``:
    with ``num_workers > 0`` the collate function must be picklable on
    platforms that start workers with ``spawn``, and lambdas are not.
    """
    return DataLoader(
        HMDBDataset(paths),
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=partial(collate, frames_per_clip=frames_per_clip, tf=tf),
        num_workers=num_workers,
        pin_memory=True,
    )


# Only the training loader shuffles and uses the augmenting transform.
train_loader = _make_loader(train_paths, train_tf, shuffle=True)
val_loader = _make_loader(val_paths, eval_tf, shuffle=False)
test_loader = _make_loader(test_paths, eval_tf, shuffle=False)
# 7. Training & evaluation helpers
def accuracy(loader):
    """Return the fraction of samples in ``loader`` the model classifies correctly.

    Switches the module-level ``model`` to eval mode (it is not switched back
    here) and runs inference without gradient tracking.

    Args:
        loader: Iterable yielding ``(videos, labels)`` batches.

    Returns:
        ``correct / total`` as a float.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for clips, targets in loader:
            inputs = processor(clips, return_tensors="pt").to(model.device)
            targets = targets.to(model.device)
            predictions = model(**inputs).logits.argmax(-1)
            n_correct += (predictions == targets).sum().item()
            n_seen += targets.size(0)
    return n_correct / n_seen
from torch.utils.tensorboard import SummaryWriter

# Optimise only the parameters that are still trainable.
optimizer = torch.optim.Adam(
    [p for p in model.parameters() if p.requires_grad],
    lr=1e-5,
)

# Number of epochs, and number of micro-batches accumulated per optimizer step.
epochs = 5
accum = 4

# TensorBoard event files are written under runs/vjepa2_hmdb51.
writer = SummaryWriter("runs/vjepa2_hmdb51")
# 8. Training loop
# Gradients are accumulated over `accum` micro-batches before each optimizer
# step. Validation accuracy is logged after every epoch and test accuracy once
# at the end.
num_batches = len(train_loader)
for epoch in range(1, epochs + 1):
    # accuracy() leaves the model in eval mode, so re-enable train mode here.
    model.train()
    running, pending = 0.0, 0
    optimizer.zero_grad()
    for step, (vids, lbls) in enumerate(train_loader, 1):
        batch = processor(vids, return_tensors="pt").to(model.device)
        lbls = lbls.to(model.device)
        # Scale so the accumulated gradient is the mean over the window.
        loss = model(**batch, labels=lbls).loss / accum
        loss.backward()
        running += loss.item()
        pending += 1
        # Also step on the last batch of the epoch: otherwise, when the number
        # of batches is not a multiple of `accum`, the trailing gradients
        # would be wiped by the next epoch's zero_grad() without being applied.
        if pending == accum or step == num_batches:
            optimizer.step()
            optimizer.zero_grad()
            # Mean loss over the micro-batches in this window (equals
            # `running` for a full window of `accum` batches).
            window_loss = running * accum / pending
            print(f"Epoch {epoch} Step {step}: loss {window_loss:.4f}")
            # `epoch` is 1-based, so offset by completed epochs only; the
            # global step then starts at the first logged batch, not one
            # epoch late.
            writer.add_scalar(
                "train/loss", window_loss, (epoch - 1) * num_batches + step
            )
            running, pending = 0.0, 0
    val_acc = accuracy(val_loader)
    print(f"Epoch {epoch}: val_acc {val_acc:.4f}")
    writer.add_scalar("val/acc", val_acc, epoch)

# Final held-out evaluation after all epochs.
test_acc = accuracy(test_loader)
print(f"Test accuracy: {test_acc:.4f}")
writer.add_scalar("test/acc", test_acc, epochs)
# 9. Push model, processor, and logs to the Hub
from huggingface_hub import upload_folder

repo = "SujitShelar/vjepa2-vitl-fpc16-256-hmdb51"

# Close the TensorBoard writer BEFORE uploading: SummaryWriter buffers events
# and only guarantees they are on disk after a flush/close, so uploading the
# "runs" folder first can publish truncated logs.
writer.close()

model.push_to_hub(repo)
processor.push_to_hub(repo)
upload_folder(repo_id=repo, folder_path="runs", path_in_repo="runs")
print("upload complete")
|