# Setup: ffmpeg for video decoding, the CUDA 12.4 builds of PyTorch,
# TorchVision, TorchAudio, and TorchCodec, plus the training/eval stack.
!sudo apt-get update -y && sudo apt-get install -y ffmpeg

!pip install -Uq torch==2.6.0+cu124 torchvision==0.21.0 torchaudio==2.6.0 \
    --index-url https://download.pytorch.org/whl/cu124

!pip install -Uq torchcodec==0.2.1+cu124 \
    --index-url https://download.pytorch.org/whl/cu124

!pip install -Uq scikit-learn transformers huggingface_hub tensorboard pillow matplotlib

# Authenticate with the Hugging Face Hub; a token is needed to download the
# dataset and to push the fine-tuned model at the end.
from huggingface_hub import login

login()

# Download the HMDB51 archive from the Hub and extract it into the working
# directory.
from huggingface_hub import hf_hub_download
import zipfile, pathlib, random, os

data_zip = hf_hub_download(
    repo_id="jili5044/hmdb51",
    filename="hmdb51.zip",
    repo_type="dataset",
)
with zipfile.ZipFile(data_zip) as z:
    z.extractall(".")

dataset_root = pathlib.Path("hmdb51")
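
# Quick sanity check, a minimal sketch that assumes the archive extracts to
# one subfolder per action class containing .avi clips:
n_clips = sum(1 for _ in dataset_root.glob("**/*.avi"))
n_classes = sum(1 for d in dataset_root.iterdir() if d.is_dir())
print(f"{n_classes} classes, {n_clips} clips")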

# Stratified 70/15/15 train/val/test split over the per-class clip paths.
from sklearn.model_selection import train_test_split

video_paths = list(dataset_root.glob("**/*.avi"))
labels = [p.parent.name for p in video_paths]

train_paths, temp_paths, y_train, y_temp = train_test_split(
    video_paths, labels, test_size=0.30,
    stratify=labels, random_state=42
)
val_paths, test_paths, y_val, y_test = train_test_split(
    temp_paths, y_temp, test_size=0.50,
    stratify=y_temp, random_state=42
)

# Label <-> id mappings for the classification head.
class_names = sorted(set(labels))
label2id = {lbl: i for i, lbl in enumerate(class_names)}
id2label = {i: lbl for lbl, i in label2id.items()}
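
# Optional: verify the split sizes and class count.
print(f"train={len(train_paths)} val={len(val_paths)} "
      f"test={len(test_paths)} classes={len(class_names)}")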

# The Dataset returns (path, label) pairs; decoding happens in the collate
# step, so each DataLoader worker decodes its own clips.
import torch
from torch.utils.data import Dataset, DataLoader
from torchcodec.decoders import VideoDecoder
from torchcodec.samplers import clips_at_random_indices
import torchvision.transforms.v2 as T

class HMDBDataset(Dataset):
    def __init__(self, paths):
        self.paths = paths

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        label = label2id[path.parent.name]
        return path, label

# Spatial augmentation for training; center crop only for evaluation
# (CenterCrop zero-pads frames that are smaller than 256x256).
train_tf = T.Compose([
    T.RandomResizedCrop((256, 256)),
    T.RandomHorizontalFlip(),
])
eval_tf = T.CenterCrop((256, 256))

# Collate: sample one random clip per video (taking every 3rd frame), apply
# the spatial transform, and stack into a (B, T, C, H, W) uint8 batch.
def collate(samples, frames_per_clip, tf):
    vids, labels = [], []
    for path, lbl in samples:
        clip = clips_at_random_indices(
            VideoDecoder(str(path)),
            num_clips=1,
            num_frames_per_clip=frames_per_clip,
            num_indices_between_frames=3,
        ).data.squeeze(0)  # (1, T, C, H, W) -> (T, C, H, W)
        clip = tf(clip)
        vids.append(clip)
        labels.append(lbl)

    vids = torch.stack(vids, dim=0)
    return vids, torch.tensor(labels)
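
# Smoke-test the decode/transform path on one clip. A minimal sketch: 16
# frames per clip is assumed here to match the fpc16 checkpoint loaded below.
sample = (train_paths[0], label2id[train_paths[0].parent.name])
vids, lbls = collate([sample], frames_per_clip=16, tf=train_tf)
print(vids.shape, vids.dtype, lbls)  # expect torch.Size([1, 16, 3, 256, 256]), uint8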

batch_size, num_workers = 4, 8

# Load the V-JEPA 2 video classifier and its processor. The pretrained SSv2
# head is replaced by a fresh HMDB51 head, hence ignore_mismatched_sizes=True.
from transformers import (
    VJEPA2ForVideoClassification,
    VJEPA2VideoProcessor
)

ckpt = "facebook/vjepa2-vitl-fpc16-256-ssv2"
processor = VJEPA2VideoProcessor.from_pretrained(ckpt)
model = VJEPA2ForVideoClassification.from_pretrained(
    ckpt,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
    torch_dtype=torch.float32
).to("cuda")

# Freeze the backbone so only the classification head is trained.
for p in model.vjepa2.parameters():
    p.requires_grad = False
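
# Confirm the freeze: only the classification head should remain trainable.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {trainable:,} / {total:,}")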

# The clip length comes from the checkpoint config (16 frames for this fpc16
# model). A lambda collate_fn works with the default fork start method on
# Linux/Colab; use functools.partial instead if workers must be spawned.
frames_per_clip = model.config.frames_per_clip

train_loader = DataLoader(
    HMDBDataset(train_paths), batch_size=batch_size, shuffle=True,
    collate_fn=lambda s: collate(s, frames_per_clip, train_tf),
    num_workers=num_workers, pin_memory=True
)
val_loader = DataLoader(
    HMDBDataset(val_paths), batch_size=batch_size, shuffle=False,
    collate_fn=lambda s: collate(s, frames_per_clip, eval_tf),
    num_workers=num_workers, pin_memory=True
)
test_loader = DataLoader(
    HMDBDataset(test_paths), batch_size=batch_size, shuffle=False,
    collate_fn=lambda s: collate(s, frames_per_clip, eval_tf),
    num_workers=num_workers, pin_memory=True
)
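
# Optional check: pull one batch through the worker pipeline and inspect what
# the processor produces from it.
vids, lbls = next(iter(val_loader))
inp = processor(vids, return_tensors="pt")
print(vids.shape, {k: v.shape for k, v in inp.items()})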

# Top-1 accuracy over a loader; the processor handles rescaling/normalization.
def accuracy(loader):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for vids, lbls in loader:
            inp = processor(vids, return_tensors="pt").to(model.device)
            lbls = lbls.to(model.device)
            pred = model(**inp).logits.argmax(-1)
            correct += (pred == lbls).sum().item()
            total += lbls.size(0)
    return correct / total

# Optimize only the unfrozen head parameters; with batch_size=4 and accum=4
# the effective batch size is 16.
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5
)
epochs, accum = 5, 4

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/vjepa2_hmdb51")
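
# Optional: monitor training live from inside the notebook (standard
# Jupyter/Colab TensorBoard magics).
%load_ext tensorboard
%tensorboard --logdir runs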

# Training loop with gradient accumulation: each optimizer step aggregates
# `accum` micro-batches, so `running` is the mean loss over that window.
for epoch in range(1, epochs + 1):
    model.train()
    running = 0.0
    optimizer.zero_grad()
    for step, (vids, lbls) in enumerate(train_loader, 1):
        batch = processor(vids, return_tensors="pt").to(model.device)
        lbls = lbls.to(model.device)
        loss = model(**batch, labels=lbls).loss / accum  # scale for accumulation
        loss.backward()
        running += loss.item()
        if step % accum == 0:
            optimizer.step()
            optimizer.zero_grad()
            print(f"Epoch {epoch} Step {step}: loss {running:.4f}")
            writer.add_scalar("train/loss", running, epoch * len(train_loader) + step)
            running = 0.0
    # Flush gradients left over from a final, partial accumulation window.
    if step % accum != 0:
        optimizer.step()
        optimizer.zero_grad()
    val_acc = accuracy(val_loader)
    print(f"Epoch {epoch}: val_acc {val_acc:.4f}")
    writer.add_scalar("val/acc", val_acc, epoch)

test_acc = accuracy(test_loader)
print(f"Test accuracy: {test_acc:.4f}")
writer.add_scalar("test/acc", test_acc, epochs)

# Push the fine-tuned model and processor to the Hub, then upload the
# TensorBoard logs alongside them.
repo = "SujitShelar/vjepa2-vitl-fpc16-256-hmdb51"
model.push_to_hub(repo)
processor.push_to_hub(repo)

from huggingface_hub import upload_folder
upload_folder(repo_id=repo, folder_path="runs", path_in_repo="runs")

writer.close()
print("upload complete")
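
# Reload the pushed checkpoint and classify one held-out clip. A minimal
# sketch reusing the collate helper above; it assumes the push succeeded and
# that `repo` is readable with the current token.
clf = VJEPA2ForVideoClassification.from_pretrained(repo).to("cuda").eval()
path = test_paths[0]
vids, _ = collate([(path, label2id[path.parent.name])], frames_per_clip, eval_tf)
with torch.no_grad():
    logits = clf(**processor(vids, return_tensors="pt").to(clf.device)).logits
print("predicted:", id2label[logits.argmax(-1).item()], "actual:", path.parent.name)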