onnx-community
/

vjepa2-vitl-fpc32-256-diving48-ONNX

+---
+license: mit
+---
+## Usage
+### ONNXRuntime
+<details>
+<summary>
+First, define the <em>read_gif_frames</em> helper function (click to expand):
+</summary>
+```py
+import numpy as np
+from PIL import Image, ImageSequence
+import requests
+from io import BytesIO
+import os
def read_gif_frames(
    path_or_url, shortest_edge=None, center_crop=None, timeout=30.0
):
    """Decode every frame of a GIF into a single uint8 array.

    Args:
        path_or_url: An ``http://`` / ``https://`` URL or a local file path
            (``str``, ``bytes`` or ``os.PathLike``).
        shortest_edge: If given, each frame is resized so that its shorter
            side equals this many pixels, preserving the aspect ratio.
        center_crop: If given, a square of this side length is cropped from
            the center of each (possibly resized) frame.
        timeout: Seconds handed to ``requests.get`` when downloading; only
            used for URLs.

    Returns:
        The frames stacked with ``np.stack``; shape
        ``[num_frames, 3, height, width]``, dtype ``uint8``.

    Raises:
        ValueError: If the input is neither a URL nor an existing file, or
            if the opened image is not in GIF format.
        requests.HTTPError: If the download returns an error status.
    """
    # Accept pathlib.Path and friends; plain strings pass through unchanged.
    source = os.fsdecode(path_or_url)

    # Load GIF from URL or local path
    if source.startswith(("http://", "https://")):
        response = requests.get(source, timeout=timeout)
        # Fail on 4xx/5xx here rather than letting PIL choke on an error page.
        response.raise_for_status()
        gif = Image.open(BytesIO(response.content))
    elif os.path.exists(source):
        gif = Image.open(source)
    else:
        raise ValueError("Invalid URL or file path")

    # The context manager releases the underlying file once decoding is done,
    # including when one of the checks below raises.
    with gif:
        # Ensure it's a GIF
        if gif.format != "GIF":
            raise ValueError("Not a GIF file")

        # Extract frames and convert to RGB
        frames = []
        for frame in ImageSequence.Iterator(gif):
            rgb_frame = frame.convert("RGB")  # Force 3 channels

            # Resize if specified
            if shortest_edge is not None:
                w, h = rgb_frame.size
                if h < w:
                    new_h = shortest_edge
                    new_w = int(w * shortest_edge / h)
                else:
                    new_w = shortest_edge
                    new_h = int(h * shortest_edge / w)
                rgb_frame = rgb_frame.resize((new_w, new_h), Image.LANCZOS)

            # Center crop if specified
            if center_crop is not None:
                w, h = rgb_frame.size
                left = (w - center_crop) // 2
                top = (h - center_crop) // 2
                rgb_frame = rgb_frame.crop(
                    (left, top, left + center_crop, top + center_crop)
                )

            frame_np = np.array(rgb_frame, dtype=np.uint8)
            frames.append(np.transpose(frame_np, (2, 0, 1)))  # HWC -> CHW

    return np.stack(frames)  # Shape: [num_frames, 3, height, width]
+```
+</details>
+You can then run the model as follows:
+```py
+import onnxruntime as ort
+from huggingface_hub import hf_hub_download
+from transformers import AutoConfig
model_id = "onnx-community/vjepa2-vitl-fpc32-256-diving48-ONNX"

# Fetch the model config (used for the id -> label mapping) and the ONNX file.
config = AutoConfig.from_pretrained(model_id)
path = hf_hub_download(repo_id=model_id, filename="onnx/model.onnx")
ort_session = ort.InferenceSession(path)

# Decode the clip: shorter side resized to 292 px, then a 256 x 256 center crop.
video = read_gif_frames(
    "http://www.svcl.ucsd.edu/projects/resound/imgs/19.gif",
    shortest_edge=292,
    center_crop=256,
)

# Per-channel mean / std, shaped (3, 1, 1) so they broadcast over every frame.
mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)

# Scale to [0, 1], normalize, then prepend a batch axis and cast to float32.
normalized = (video / 255 - mean) / std
inputs = {"pixel_values_videos": normalized[np.newaxis, ...].astype(np.float32)}

# Run the model and keep its first output.
logits = ort_session.run(None, input_feed=inputs)[0]
scores = logits[0]  # the single item in the batch

# Indices of the top_k highest scores, best first.
top_k = 5
indices = np.argsort(scores)[::-1][:top_k]

# Softmax; subtracting the max first keeps the exponentials from overflowing.
exp_logits = np.exp(scores - np.max(scores))
softmax_probs = exp_logits / np.sum(exp_logits)

print(f"Top {top_k} predicted class names:")
for idx in indices:
    text_label = config.id2label[idx]
    print(f" - {text_label}: {softmax_probs[idx]:.2f}")
+```
+Example output:
+```
+Top 5 predicted class names:
+ - ['Forward', '15som', 'NoTwis', 'PIKE']: 0.69
+ - ['Reverse', 'Dive', 'NoTwis', 'PIKE']: 0.22
+ - ['Inward', '15som', 'NoTwis', 'PIKE']: 0.06
+ - ['Reverse', '15som', '05Twis', 'FREE']: 0.01
+ - ['Forward', '25som', 'NoTwis', 'PIKE']: 0.00
+```