diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 1f48c32e9df46945f0ac7142bfda001a1beb4747..b669260acff5b42bccc03a6c58a4d5c36101c6a1 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,74 @@
----
-license: cc-by-nc-4.0
----
+---
+language: en
+license: cc-by-nc-4.0
+tags:
+- soccer
+- video-qa
+- question-answering
+- vision-language
+- multimodal
+- sports-analysis
+library_name: transformers
+pipeline_tag: video-text-to-text
+---
+
+# Soccer QA 4B - Soccer Video Question Answering Model
+
+**⚠️ RESEARCH USE ONLY - NON-COMMERCIAL LICENSE**
+
+Soccer QA 4B is a unified video question-answering model specifically designed for soccer video understanding.
+
+## Model Description
+
+This model can answer questions about soccer videos by analyzing visual content and generating natural language responses.
+
+**Example:**
+- **Input**: Video + "What unfolded during the game in the video?"
+- **Output**: "During the game, there was a foul committed by a player from the yellow-jerseyed team, leading to a yellow card being issued..."
+
+## Architecture
+- **Vision Encoder**: DWT-VJEPA2-based video encoder (vit_giant, 1408 dim)
+- **Text Model**: LLaMA 3.2-3B with LoRA fine-tuning
+- **Vision-Text Bridge**: Learned projection layer (1408 → 2048 → 3072)
+- **Specialization**: Fine-tuned on soccer video QA data
+
+## Usage (helper modules are included in this repository)
+
+```python
+from soccer_qa_inference import SoccerQA
+
+model = SoccerQA("/path/to/model")
+answer = model.ask("video.mp4", "Was this a Foul?", max_tokens=45)
+print(answer)
+```
+
+## Model Details
+- **Parameters**: ~4B total
+- **Input**: Video files (16 frames, 256x256) + text questions
+- **Output**: Natural language answers
+- **Domain**: Soccer/football video analysis
+- **Context**: Handles complex game situations, player actions, fouls, etc.
+
+## Training Data
+- Soccer video clips with question-answer pairs
+- Covers various game situations: fouls, shots, saves, player actions
+- Annotated with detailed descriptions of game events
+
+## Limitations
+- Research use only, no commercial applications
+- Optimized specifically for soccer content
+- May not generalize well to other sports or video domains
+- Requires high-quality video input for best results
+
+## License
+CC-BY-NC-4.0 - Research use only, no commercial applications permitted.
+
+## Citation
+```bibtex
+@misc{soccer-qa-4b-2025,
+  title={Soccer QA 4B: Video Question Answering for Soccer Analysis},
+  author={Varun Kodathala, Sports Vision},
+  year={2025},
+  note={Research model for soccer video understanding}
+}
+```
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a51866840beedee6d50940d53a2af7d07d4b4b8f
--- /dev/null
+++ b/config.json
@@ -0,0 +1,26 @@
+{
+  "model_type": "soccer_qa_4b",
+  "architectures": [
+    "SoccerQA4BModel"
+  ],
+  "vision_dim": 1408,
+  "projection_dim": 2048,
+  "text_dim": 3072,
+  "img_size": 256,
+  "num_frames": 16,
+  "max_length": 256,
+  "temperature": 0.7,
+  "imagenet_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "imagenet_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "hidden_size": 3072,
+  "vocab_size": 128257,
+  "model_description": "Soccer video question answering model"
+}
\ No newline at end of file
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..32e06cad1fb762d4ed84cd6d5fa44c1b2a9d48a0
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f268918412af5ab623937ca776bf5c91eb26f04c3ff7e4cc257598aeda61b7cc
+size 18512562808
diff --git a/soccer_qa_inference.py b/soccer_qa_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..afb5c2ffbf1997c255a95f40ae61f1395f41274c
--- /dev/null
+++ b/soccer_qa_inference.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+"""
+Soccer QA Inference - Single Class, Clean API
+
+Usage in Colab:
+    from soccer_qa_inference import SoccerQA
+    model = SoccerQA("soccer-qa-3b-unified")
+    answer = model.ask("video.mp4", "What happened?", max_tokens=128)
+"""
+
+import os
+import json
+import torch
+import torch.nn as nn
+import numpy as np
+from safetensors.torch import load_file
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from decord import VideoReader
+
+# Import your existing modules
+import src.datasets.utils.video.transforms as video_transforms
+import src.datasets.utils.video.volume_transforms as volume_transforms
+from src.models.vision_transformer import vit_giant_rope
+
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+def get_video(fname, num_frames=16):
+    """Decode ``num_frames`` frames spread evenly over the video at ``fname``.
+
+    Frame positions are ``np.linspace(0, len(reader) - 1, num_frames)``
+    truncated to ``int64``; the selected frames are returned as a NumPy array.
+    """
+    reader = VideoReader(fname)
+    last_frame = len(reader) - 1
+    sample_positions = np.linspace(0, last_frame, num=num_frames).astype(np.int64)
+    return reader.get_batch(sample_positions).asnumpy()
+
+def build_video_transform(img_size):
+    """Return the evaluation-time preprocessing pipeline for one clip.
+
+    The clip is resized so its short side is ``int(256 / 224 * img_size)``,
+    center-cropped to ``img_size`` x ``img_size``, converted to a tensor and
+    normalized with the ImageNet mean/std constants defined in this module.
+    """
+    resize_to = int(256.0 / 224 * img_size)
+    steps = [
+        video_transforms.Resize(resize_to, interpolation="bilinear"),
+        video_transforms.CenterCrop(size=(img_size, img_size)),
+        volume_transforms.ClipToTensor(),
+        video_transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    ]
+    return video_transforms.Compose(steps)
+
+class SoccerQA:
+    """Single-class inference API for soccer video question answering.
+
+    Builds a frozen vision encoder, a projection MLP and a causal language
+    model, loads all three from one merged ``model.safetensors`` file and
+    exposes :meth:`ask` and :meth:`batch_ask`.
+    """
+
+    # Number of vision tokens per clip that the reshape logic in
+    # ``_get_video_embeddings`` splits the encoder output by.
+    _TOKENS_PER_CLIP = 2048
+
+    def __init__(self, model_dir="/home/varunkodathala/jepa_llm/soccer_pretrain/soccer-qa-3b-unified"):
+        """Initialize Soccer QA model
+
+        Args:
+            model_dir: Path to merged model directory. It must contain
+                ``config.json``, the tokenizer files and ``model.safetensors``.
+                NOTE(review): the default is a developer-local path; callers
+                should normally pass their own directory.
+        """
+        self.model_dir = model_dir
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        print(f"🚀 Loading Soccer QA from {model_dir}...")
+
+        # Load config and tokenizer (the tokenizer length is needed to size
+        # the text model's embedding table below).
+        self._load_config()
+        self._load_tokenizer()
+
+        # Build models
+        self._build_vision_model()
+        self._build_text_model()
+        self._build_projection()
+
+        # Load all weights
+        self._load_weights()
+
+        # Build video transforms
+        self.video_transform = build_video_transform(self.img_size)
+
+        print("✅ Soccer QA ready!")
+
+    def _load_config(self):
+        """Read ``config.json`` from ``model_dir`` and cache the dimensions used here."""
+        config_path = os.path.join(self.model_dir, "config.json")
+        with open(config_path, 'r') as f:
+            self.config = json.load(f)
+
+        self.vision_dim = self.config["vision_dim"]  # 1408
+        self.projection_dim = self.config["projection_dim"]  # 2048
+        self.text_dim = self.config["text_dim"]  # 3072
+        self.img_size = self.config["img_size"]  # 256
+        self.num_frames = self.config["num_frames"]  # 16
+
+    def _load_tokenizer(self):
+        """Load the tokenizer stored in ``model_dir`` (includes the <video> token)."""
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
+        if self.tokenizer.pad_token is None:
+            # generate() needs a pad token id; fall back to EOS.
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    def _build_vision_model(self):
+        """Instantiate the vision transformer in eval mode with frozen parameters."""
+        self.vision_model = vit_giant_rope(
+            img_size=(self.img_size, self.img_size),
+            num_frames=self.num_frames
+        )
+        self.vision_model.to(self.device).eval()
+
+        # Freeze vision model
+        for param in self.vision_model.parameters():
+            param.requires_grad = False
+
+    def _build_text_model(self):
+        """Instantiate the base language model; merged weights are loaded later."""
+        self.text_model = AutoModelForCausalLM.from_pretrained(
+            "meta-llama/Llama-3.2-3B",
+            torch_dtype=torch.float32,
+            device_map=self.device,
+            trust_remote_code=True
+        )
+
+        # Resize for <video> token to match saved model
+        self.text_model.resize_token_embeddings(len(self.tokenizer))
+        self.text_model.eval()
+
+    def _build_projection(self):
+        """Build the vision-to-text projection MLP in eval mode."""
+        self.vision_projection = nn.Sequential(
+            nn.Linear(self.vision_dim, self.projection_dim),  # 1408 -> 2048
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(self.projection_dim, self.text_dim),     # 2048 -> 3072
+            nn.LayerNorm(self.text_dim)
+        ).to(self.device)
+        # This class only performs inference. Without eval() the Dropout layer
+        # stays active (no_grad does not disable it) and randomly zeroes
+        # projected features on every call.
+        self.vision_projection.eval()
+
+    @staticmethod
+    def _select_prefixed(state_dict, prefix):
+        """Return the entries whose key starts with ``prefix``, with that prefix removed.
+
+        Only the leading occurrence is stripped, so a key that happens to
+        contain the prefix again further along is left intact.
+        """
+        cut = len(prefix)
+        return {key[cut:]: value for key, value in state_dict.items() if key.startswith(prefix)}
+
+    def _load_weights(self):
+        """Load vision, projection and text weights from ``model.safetensors``."""
+        model_path = os.path.join(self.model_dir, "model.safetensors")
+        print(f"Loading weights from: {model_path}")
+        state_dict = load_file(model_path, device=str(self.device))
+
+        # Load vision encoder weights (non-strict: mismatches are only reported)
+        vision_state = self._select_prefixed(state_dict, "vision_encoder.")
+        msg = self.vision_model.load_state_dict(vision_state, strict=False)
+        print(f"Vision model loaded: {msg}")
+
+        # Load projection weights (strict: every key must match)
+        projection_state = self._select_prefixed(state_dict, "vision_projection.")
+        self.vision_projection.load_state_dict(projection_state)
+        print("Projection layer loaded")
+
+        # Apply merged weights directly to text model
+        text_state = self._select_prefixed(state_dict, "text_model.")
+        missing_keys, unexpected_keys = self.text_model.load_state_dict(text_state, strict=False)
+        if missing_keys:
+            print(f"Missing keys in text model: {len(missing_keys)} (this is normal)")
+        if unexpected_keys:
+            print(f"Unexpected keys in text model: {len(unexpected_keys)}")
+        print("✅ Text model loaded with merged weights")
+
+        # Clear state_dict from memory
+        del state_dict
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    def _get_video_embeddings(self, video_path):
+        """Run the vision encoder on a video file.
+
+        Returns:
+            Tensor of shape ``[num_clips, tokens, feature_dim]``. When the token
+            count is not a multiple of ``_TOKENS_PER_CLIP`` the whole output is
+            treated as a single clip.
+        """
+        with torch.inference_mode():
+            # Load video
+            video = get_video(video_path, self.num_frames)
+            video = torch.from_numpy(video).permute(0, 3, 1, 2)  # T x C x H x W
+
+            # Preprocess and add a leading batch dimension.
+            # NOTE(review): the exact axis order after the transform depends on
+            # ClipToTensor - confirm against the transforms module.
+            x = self.video_transform(video).to(self.device).unsqueeze(0)
+
+            # Extract features; presumably [1, tokens, feature_dim]
+            features = self.vision_model(x)
+
+            # Split into clips without hard-coding the feature width
+            squeezed = features.squeeze(0)
+            tokens = self._TOKENS_PER_CLIP
+            if squeezed.shape[0] % tokens == 0:
+                num_clips = squeezed.shape[0] // tokens
+                reshaped = squeezed.view(num_clips, tokens, squeezed.shape[-1])
+            else:
+                reshaped = squeezed.unsqueeze(0)
+
+            return reshaped
+
+    def _project_vision_features(self, vision_features):
+        """Project vision features of shape ``[clips, tokens, dim]`` to ``[clips * tokens, text_dim]``."""
+        feature_dim = vision_features.shape[-1]
+
+        # Flatten clips and tokens into one sequence axis, then project
+        flattened = vision_features.view(-1, feature_dim)
+        return self.vision_projection(flattened)
+
+    def _encode_video(self, video_path):
+        """Return projected video embeddings of shape ``[1, total_tokens, text_dim]``."""
+        with torch.no_grad():
+            video_features = self._get_video_embeddings(video_path)
+            vision_embeds = self._project_vision_features(video_features)
+            return vision_embeds.unsqueeze(0)
+
+    def _generate_answer(self, vision_embeds, question, max_tokens=128, temperature=0.7, top_p=0.9,
+                         repetition_penalty=1.2, no_repeat_ngram_size=3):
+        """Generate an answer for ``question`` given already-encoded video embeddings.
+
+        Defaults mirror :meth:`ask` so both entry points behave the same.
+        """
+        with torch.no_grad():
+            # Process question (remove <video> token if present)
+            question_clean = question.replace("<video>", "").strip()
+
+            # Tokenize question
+            question_tokens = self.tokenizer(
+                question_clean,
+                return_tensors="pt",
+                add_special_tokens=True
+            ).to(self.device)
+
+            # Get text embeddings and prepend the video tokens
+            text_embeds = self.text_model.get_input_embeddings()(question_tokens.input_ids)
+            combined_embeds = torch.cat([vision_embeds, text_embeds], dim=1)
+
+            # Every video position is attended to
+            vision_attention = torch.ones(
+                1, vision_embeds.shape[1],
+                dtype=question_tokens.attention_mask.dtype,
+                device=self.device
+            )
+            combined_attention_mask = torch.cat([vision_attention, question_tokens.attention_mask], dim=1)
+
+            # Generate response
+            generated_ids = self.text_model.generate(
+                inputs_embeds=combined_embeds,
+                attention_mask=combined_attention_mask,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=True,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id,
+                repetition_penalty=repetition_penalty,
+                no_repeat_ngram_size=no_repeat_ngram_size,
+                use_cache=True,
+                return_dict_in_generate=False
+            )
+
+            # Handle different return formats from generate(): if the output is
+            # longer than the prompt, assume the prompt is included and drop it.
+            prompt_len = combined_embeds.shape[1]
+            if generated_ids.shape[1] > prompt_len:
+                new_tokens = generated_ids[:, prompt_len:]
+            else:
+                new_tokens = generated_ids
+
+            generated_text = self.tokenizer.batch_decode(
+                new_tokens,
+                skip_special_tokens=True
+            )[0]
+
+            return generated_text.strip()
+
+    def ask(self, video_path, question, max_tokens=128, temperature=0.7, top_p=0.9,
+            repetition_penalty=1.2, no_repeat_ngram_size=3):
+        """Ask a question about a video
+
+        Args:
+            video_path: Path to video file
+            question: Question about the video
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature
+            top_p: Nucleus sampling parameter
+            repetition_penalty: Penalty for repetition
+            no_repeat_ngram_size: N-gram size for repetition blocking
+
+        Returns:
+            Generated answer as string
+        """
+        vision_embeds = self._encode_video(video_path)
+        return self._generate_answer(
+            vision_embeds,
+            question,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            no_repeat_ngram_size=no_repeat_ngram_size,
+        )
+
+    def batch_ask(self, video_path, questions, **kwargs):
+        """Ask multiple questions about the same video
+
+        The video is decoded and encoded once and the embeddings are reused
+        for every question.
+
+        Args:
+            video_path: Path to video file
+            questions: List of questions
+            **kwargs: Generation parameters (same names as :meth:`ask`)
+
+        Returns:
+            List of {"question": str, "answer": str} dicts
+        """
+        questions = list(questions)
+        if not questions:
+            # Nothing to answer: do not touch the video at all.
+            return []
+
+        vision_embeds = self._encode_video(video_path)
+        return [
+            {"question": question, "answer": self._generate_answer(vision_embeds, question, **kwargs)}
+            for question in questions
+        ]
\ No newline at end of file
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..c4a8160c7c15b38b97bc498b76582cac02f848e0
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,26 @@
+{
+  "additional_special_tokens": [
+    "<video>"
+  ],
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/src/datasets/data_manager.py b/src/datasets/data_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..e35486395cebbf7810edd71609033bd7a493ea51
--- /dev/null
+++ b/src/datasets/data_manager.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from logging import getLogger
+
+_GLOBAL_SEED = 0
+logger = getLogger()
+
+
+def init_data(
+    batch_size,
+    transform=None,
+    shared_transform=None,
+    data="ImageNet",
+    collator=None,
+    pin_mem=True,
+    num_workers=8,
+    world_size=1,
+    rank=0,
+    root_path=None,
+    image_folder=None,
+    training=True,
+    drop_last=True,
+    subset_file=None,
+    clip_len=None,
+    dataset_fpcs=None,
+    frame_sample_rate=None,
+    duration=None,
+    fps=None,
+    num_clips=1,
+    random_clip_sampling=True,
+    allow_clip_overlap=False,
+    filter_short_videos=False,
+    filter_long_videos=int(1e9),
+    datasets_weights=None,
+    persistent_workers=False,
+    deterministic=True,
+    log_dir=None,
+):
+    """Build the data loader and distributed sampler for the requested dataset.
+
+    ``data`` is matched case-insensitively against ``"imagenet"`` and
+    ``"videodataset"``. The matching factory is imported lazily and receives
+    the subset of arguments it understands: the image factory gets
+    ``image_folder``/``training``/``drop_last``/``subset_file``, the video
+    factory gets the clip-sampling, filtering and logging options, with
+    ``root_path`` forwarded as ``data_paths``, ``clip_len`` as
+    ``frames_per_clip`` and ``frame_sample_rate`` as ``frame_step``.
+
+    Returns:
+        ``(data_loader, dist_sampler)`` as produced by the factory.
+
+    Raises:
+        ValueError: if ``data`` names neither supported dataset type.
+    """
+    kind = data.lower()
+
+    if kind == "imagenet":
+        from src.datasets.imagenet1k import make_imagenet1k
+
+        _dataset, data_loader, dist_sampler = make_imagenet1k(
+            transform=transform,
+            batch_size=batch_size,
+            collator=collator,
+            pin_mem=pin_mem,
+            training=training,
+            num_workers=num_workers,
+            world_size=world_size,
+            rank=rank,
+            root_path=root_path,
+            image_folder=image_folder,
+            persistent_workers=persistent_workers,
+            drop_last=drop_last,
+            subset_file=subset_file,
+        )
+
+    elif kind == "videodataset":
+        from src.datasets.video_dataset import make_videodataset
+
+        _dataset, data_loader, dist_sampler = make_videodataset(
+            data_paths=root_path,
+            batch_size=batch_size,
+            frames_per_clip=clip_len,
+            dataset_fpcs=dataset_fpcs,
+            frame_step=frame_sample_rate,
+            duration=duration,
+            fps=fps,
+            num_clips=num_clips,
+            random_clip_sampling=random_clip_sampling,
+            allow_clip_overlap=allow_clip_overlap,
+            filter_short_videos=filter_short_videos,
+            filter_long_videos=filter_long_videos,
+            shared_transform=shared_transform,
+            transform=transform,
+            datasets_weights=datasets_weights,
+            collator=collator,
+            num_workers=num_workers,
+            pin_mem=pin_mem,
+            persistent_workers=persistent_workers,
+            world_size=world_size,
+            rank=rank,
+            deterministic=deterministic,
+            log_dir=log_dir,
+        )
+
+    else:
+        # Without this branch an unknown name fell through to the return
+        # statement and surfaced as an opaque UnboundLocalError.
+        raise ValueError(f"Unsupported dataset type: {data!r} (expected 'ImageNet' or 'VideoDataset')")
+
+    return (data_loader, dist_sampler)
diff --git a/src/datasets/imagenet1k.py b/src/datasets/imagenet1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..3597e3a1afee01b837431909a103a456b53cc298
--- /dev/null
+++ b/src/datasets/imagenet1k.py
@@ -0,0 +1,152 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import subprocess
+import time
+from logging import getLogger
+
+import numpy as np
+import torch
+import torchvision
+
+_GLOBAL_SEED = 0
+logger = getLogger()
+
+
+class ImageNet(torchvision.datasets.ImageFolder):
+    """ImageFolder over the ``train/`` or ``val/`` split of an ImageNet directory tree."""
+
+    def __init__(
+        self,
+        root,
+        image_folder="imagenet_full_size/061417/",
+        tar_file="imagenet_full_size-061417.tar.gz",
+        transform=None,
+        train=True,
+        job_id=None,
+        local_rank=None,
+        index_targets=False,
+    ):
+        """
+        ImageNet
+
+        Dataset wrapper
+
+        :param root: root network directory for ImageNet data
+        :param image_folder: path to images inside root network directory
+        :param tar_file: zipped image_folder inside root network directory
+            (accepted for compatibility; not used by this constructor)
+        :param transform: transform forwarded to ``ImageFolder``
+        :param train: whether to load train data (or validation)
+        :param job_id: scheduler job-id (accepted for compatibility; not used here)
+        :param local_rank: accepted for compatibility; not used here
+        :param index_targets: whether to index the id of each labeled image
+        """
+
+        suffix = "train/" if train else "val/"
+        data_path = os.path.join(root, image_folder, suffix)
+        logger.info(f"data-path {data_path}")
+
+        super().__init__(root=data_path, transform=transform)
+        logger.info("Initialized ImageNet")
+
+        if index_targets:
+            self.targets = np.array([sample[1] for sample in self.samples])
+            self.samples = np.array(self.samples)
+
+            mint = None
+            self.target_indices = []
+            for t in range(len(self.classes)):
+                # flatnonzero always yields a 1-D result, so a class with a
+                # single sample still produces a one-element list (squeezing
+                # argwhere collapsed that case to a bare int and broke len()).
+                indices = np.flatnonzero(self.targets == t).tolist()
+                self.target_indices.append(indices)
+                mint = len(indices) if mint is None else min(mint, len(indices))
+                logger.debug(f"num-labeled target {t} {len(indices)}")
+            logger.info(f"min. labeled indices {mint}")
+
+
+class ImageNetSubset(object):
+    """View over an ImageNet dataset restricted to the images listed in a text file."""
+
+    def __init__(self, dataset, subset_file):
+        """
+        ImageNetSubset
+
+        :param dataset: ImageNet dataset object
+        :param subset_file: '.txt' file containing IDs of IN1K images to keep
+        """
+        self.dataset = dataset
+        self.subset_file = subset_file
+        self.filter_dataset_(subset_file)
+
+    def filter_dataset_(self, subset_file):
+        """Build ``self.samples`` from the image names listed in ``subset_file``.
+
+        Each non-empty line is an image file name whose part before the first
+        ``_`` is the class directory name. Blank lines are skipped and
+        surrounding whitespace (including a trailing ``\\r``) is ignored.
+
+        :raises KeyError: if a listed class name is not a class of the dataset
+        """
+        root = self.dataset.root
+        class_to_idx = self.dataset.class_to_idx
+        # -- update samples to subset of IN1k targets/samples
+        new_samples = []
+        logger.info(f"Using {subset_file}")
+        with open(subset_file, "r") as rfile:
+            for line in rfile:
+                img = line.strip()
+                if not img:
+                    # e.g. a trailing empty line at the end of the file
+                    continue
+                class_name = img.split("_")[0]
+                target = class_to_idx[class_name]
+                new_samples.append((os.path.join(root, class_name, img), target))
+        self.samples = new_samples
+
+    @property
+    def classes(self):
+        """Class names of the wrapped dataset."""
+        return self.dataset.classes
+
+    def __len__(self):
+        return len(self.samples)
+
+    def __getitem__(self, index):
+        """Load the image at ``index`` and apply the wrapped dataset's transforms."""
+        path, target = self.samples[index]
+        img = self.dataset.loader(path)
+        if self.dataset.transform is not None:
+            img = self.dataset.transform(img)
+        if self.dataset.target_transform is not None:
+            target = self.dataset.target_transform(target)
+        return img, target
+
+
+def make_imagenet1k(
+    transform,
+    batch_size,
+    collator=None,
+    pin_mem=True,
+    num_workers=8,
+    world_size=1,
+    rank=0,
+    root_path=None,
+    image_folder=None,
+    training=True,
+    drop_last=True,
+    persistent_workers=False,
+    subset_file=None,
+):
+    """Create the ImageNet dataset, its distributed sampler and its data loader.
+
+    When ``subset_file`` is given the dataset is wrapped in ``ImageNetSubset``
+    before the sampler and loader are built.
+
+    Returns:
+        ``(dataset, data_loader, dist_sampler)``
+    """
+    base = ImageNet(
+        root=root_path,
+        image_folder=image_folder,
+        transform=transform,
+        train=training,
+        index_targets=False,
+    )
+    dataset = base if subset_file is None else ImageNetSubset(base, subset_file)
+    logger.info("ImageNet dataset created")
+
+    dist_sampler = torch.utils.data.distributed.DistributedSampler(
+        dataset=dataset,
+        num_replicas=world_size,
+        rank=rank,
+    )
+    loader_options = dict(
+        collate_fn=collator,
+        sampler=dist_sampler,
+        batch_size=batch_size,
+        drop_last=drop_last,
+        pin_memory=pin_mem,
+        num_workers=num_workers,
+        persistent_workers=persistent_workers,
+    )
+    data_loader = torch.utils.data.DataLoader(dataset, **loader_options)
+    logger.info("ImageNet unsupervised data loader created")
+
+    return dataset, data_loader, dist_sampler
diff --git a/src/datasets/utils/dataloader.py b/src/datasets/utils/dataloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ef647fc7de46f422ba27db9e79f84f543216c1c
--- /dev/null
+++ b/src/datasets/utils/dataloader.py
@@ -0,0 +1,234 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import bisect
+import csv
+import io
+import time
+
+import numpy as np
+import torch
+from torch.utils.data import _utils
+from torch.utils.data.dataloader import ExceptionWrapper, _DatasetKind, _MultiProcessingDataLoaderIter
+
+from src.utils.monitoring import ResourceMonitoringThread
+
+
+class ConcatIndices:
+    """Translate a global index over concatenated datasets into (dataset, local index)."""
+
+    cumulative_sizes: np.ndarray
+
+    def __init__(self, sizes):
+        # Running totals: entry i is the number of samples in datasets 0..i.
+        self.cumulative_sizes = np.cumsum(sizes)
+
+    def __len__(self):
+        return self.cumulative_sizes[-1]
+
+    def __getitem__(self, idx):
+        """Return ``(dataset_idx, sample_idx)`` for the global position ``idx``.
+
+        Raises ValueError when ``idx`` is negative or not below the total size.
+        """
+        total = len(self)
+        if not (0 <= idx < total):
+            raise ValueError(f"index must be between 0 and the total size ({total})")
+        which = bisect.bisect_right(self.cumulative_sizes, idx)
+        if which > 0:
+            # Subtract the number of samples held by all preceding datasets.
+            return which, idx - self.cumulative_sizes[which - 1]
+        return which, idx
+
+
+class CSVLogger(object):
+    """Buffered CSV writer: rows accumulate in memory until ``flush`` writes them out."""
+
+    def __init__(self, fname, header):
+        """Remember the target file and queue the header row."""
+        self.fname = fname
+        self.initialized = False
+        self._reset_buffer()
+        self.writer.writerow(header)
+
+    def _reset_buffer(self):
+        """Start a fresh in-memory buffer with a CSV writer attached to it."""
+        self.buffer = io.StringIO()
+        self.writer = csv.writer(self.buffer, quoting=csv.QUOTE_NONNUMERIC)
+
+    def writerow(self, row) -> None:
+        """Queue one row in the in-memory buffer."""
+        self.writer.writerow(row)
+
+    def flush(self) -> None:
+        """Write the buffered rows to ``fname`` and clear the buffer.
+
+        The first flush truncates any existing file; later flushes append.
+        """
+        mode = "w" if not self.initialized else "a+"
+        pending = self.buffer.getvalue()
+
+        with open(self.fname, mode, newline="") as f:
+            f.write(pending)
+
+        self._reset_buffer()
+        self.initialized = True
+
+
+class MonitoredDataset(torch.utils.data.Dataset):
+    """Implement resource monitoring on a per-worker basis.
+
+    The sampling occurs every monitor_interval seconds and writes the log
+    every log_interval seconds to a file specified by log_filename, which
+    maps a worker id to a file using the '%w' placeholder.
+
+    Warning: Do not call this dataset before it is consumed in the DataLoader.
+    """
+
+    def __init__(
+        self, dataset: torch.utils.data.Dataset, log_filename: str, log_interval: float, monitor_interval: float
+    ):
+        """Wrap ``dataset``; monitoring starts lazily on the first item access."""
+        self.dataset = dataset
+        self.log_filename = str(log_filename)
+        self.log_interval = log_interval
+        self.monitor_interval = monitor_interval
+        self._csv_log = None
+        self._monitoring_thread = None
+        self._last_log_time = None
+        # Patch getitems dynamically, only when the wrapped dataset supports it
+        if hasattr(self.dataset, "__getitems__") and self.dataset.__getitems__:
+
+            # A function stored on the instance is not bound, so it must not
+            # take ``self``; it closes over the enclosing ``self`` instead.
+            def __getitems__(index):
+                self.maybe_start_resource_monitoring()
+                return self.dataset.__getitems__(index)
+
+            self.__getitems__ = __getitems__
+
+    def __del__(self):
+        self.stop_resource_monitoring()
+
+    def __getitem__(self, index):
+        self.maybe_start_resource_monitoring()
+        return self.dataset.__getitem__(index)
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def _elapsed_log_time(self):
+        """Seconds since the last flush, or infinity if nothing was flushed yet."""
+        if self._last_log_time is None:
+            return float("inf")
+        else:
+            return time.perf_counter() - self._last_log_time
+
+    def _update_log_time(self):
+        self._last_log_time = time.perf_counter()
+
+    def maybe_start_resource_monitoring(self):
+        """Start the monitoring thread on first use; later calls are no-ops."""
+        if self._monitoring_thread is None:
+
+            def callback_fn(resource_sample):
+                worker_info = torch.utils.data.get_worker_info()
+                # get_worker_info() returns None outside a DataLoader worker
+                # process; fall back to id 0 in that case.
+                worker_id = 0 if worker_info is None else worker_info.id
+
+                if self._csv_log is None:
+                    header = [f.name for f in resource_sample.fields()]
+                    log_filename = self.log_filename.replace("%w", str(worker_id))
+                    self._csv_log = CSVLogger(log_filename, header)
+                row_values = resource_sample.as_tuple()
+                self._csv_log.writerow(row_values)
+
+                if self._elapsed_log_time() > self.log_interval:
+                    self._csv_log.flush()
+                    self._update_log_time()
+
+            self._monitoring_thread = ResourceMonitoringThread(
+                None, self.monitor_interval, stats_callback_fn=callback_fn
+            )
+            self._monitoring_thread.start()
+
+    def stop_resource_monitoring(self):
+        """Stop the monitoring thread if one was started."""
+        # getattr: __del__ may run on an instance whose __init__ did not finish.
+        thread = getattr(self, "_monitoring_thread", None)
+        if thread:
+            thread.stop()
+
+
+class NondeterministicDataLoader(torch.utils.data.DataLoader):
+    """DataLoader variant whose multi-worker iterator may yield batches out of order."""
+
+    def __init__(self, *args, **kwargs):
+        """Forward every argument unchanged to ``DataLoader``."""
+        super().__init__(*args, **kwargs)
+
+    def _get_iterator(self):
+        # With no workers there is nothing to reorder: use the stock iterator.
+        if not self.num_workers:
+            return super()._get_iterator()
+        self.check_worker_number_rationality()
+        return _SloppyMultiProcessingDataLoaderIter(self)
+
+
+class _SloppyMultiProcessingDataLoaderIter(_MultiProcessingDataLoaderIter):
+    """Multi-process iterator that hands back a batch as soon as any worker delivers it.
+
+    A batch that arrives before the one at ``self._rcvd_idx`` is returned
+    immediately and its ``_task_info`` slot is replaced by a ``None`` tombstone;
+    the tombstone is discarded once ``_rcvd_idx`` reaches it.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Forward every argument unchanged to the parent iterator."""
+        super().__init__(*args, **kwargs)
+
+    def _next_data(self):
+        """Return the next available batch, allowing out-of-order delivery.
+
+        Raises StopIteration when no outstanding task index remains.
+        """
+        while True:
+            # If the worker responsible for `self._rcvd_idx` has already ended
+            # and was unable to fulfill this task (due to exhausting an `IterableDataset`),
+            # we try to advance `self._rcvd_idx` to find the next valid index.
+            #
+            # This part needs to run in the loop because both the `self._get_data()`
+            # call and `_IterableDatasetStopIteration` check below can mark
+            # extra worker(s) as dead.
+            while self._rcvd_idx < self._send_idx:
+                info = self._task_info[self._rcvd_idx]
+                if info is None:
+                    # Found a reordered tombstone: that batch was already
+                    # returned early, so drop the slot and queue a new task.
+                    del self._task_info[self._rcvd_idx]
+                    self._rcvd_idx += 1
+                    self._try_put_index()
+                else:
+                    worker_id = info[0]
+                    # has data or is still active
+                    if len(info) == 2 or self._workers_status[worker_id]:
+                        break
+                    del self._task_info[self._rcvd_idx]
+                    self._rcvd_idx += 1
+            else:
+                # no valid `self._rcvd_idx` is found (i.e., didn't break)
+                if not self._persistent_workers:
+                    self._shutdown_workers()
+                raise StopIteration
+
+            # Now `self._rcvd_idx` is the batch index we want to fetch
+
+            # Check if the next sample has already been generated
+            if len(self._task_info[self._rcvd_idx]) == 2:
+                data = self._task_info.pop(self._rcvd_idx)[1]
+                return self._process_data(data)
+
+            assert not self._shutdown and self._tasks_outstanding > 0
+            idx, data = self._get_data()
+            self._tasks_outstanding -= 1
+            if self._dataset_kind == _DatasetKind.Iterable:
+                # Check for _IterableDatasetStopIteration
+                if isinstance(data, _utils.worker._IterableDatasetStopIteration):
+                    if self._persistent_workers:
+                        self._workers_status[data.worker_id] = False
+                    else:
+                        self._mark_worker_as_unavailable(data.worker_id)
+                    self._try_put_index()
+                    continue
+
+            if idx != self._rcvd_idx:
+                # Tombstone to receive later: the batch is returned now, out of
+                # order, without going through `_process_data`, so a wrapped
+                # worker exception is re-raised here explicitly.
+                self._task_info[idx] = None
+                if isinstance(data, ExceptionWrapper):
+                    data.reraise()
+                return data
+            else:
+                del self._task_info[idx]
+                return self._process_data(data)
+
+
+def get_worker_info():
+    """Return ``(num_workers, worker_id)`` for the current process.
+
+    When ``torch.utils.data.get_worker_info()`` returns ``None`` the result
+    is ``(1, 0)``.
+    """
+    info = torch.utils.data.get_worker_info()
+    if info is not None:
+        return info.num_workers, info.id
+    return 1, 0
diff --git a/src/datasets/utils/utils.py b/src/datasets/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f967c2b087eb3fed0c2b1ef621b0203e668edbbb
--- /dev/null
+++ b/src/datasets/utils/utils.py
@@ -0,0 +1,21 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from src.utils.cluster import dataset_paths
+from src.utils.logging import get_logger
+
+logger = get_logger("Datasets utils")
+
+
+def get_dataset_paths(datasets: list[str]):
+    """Resolve dataset names to their configured paths.
+
+    Args:
+        datasets: Names to look up in the mapping returned by ``dataset_paths()``
+            (presumably a dict; confirm against ``src.utils.cluster``).
+
+    Returns:
+        A list of paths, in the same order as ``datasets``.
+
+    Raises:
+        Exception: If the lookup fails or a name has no entry in the mapping.
+    """
+    paths = []
+    for d in datasets:
+        try:
+            path = dataset_paths().get(d)
+        except Exception as err:
+            raise Exception(f"Unknown dataset: {d}") from err
+        # ``.get`` returns None for a missing key instead of raising, so the
+        # unknown-name case has to be detected explicitly.
+        if path is None:
+            raise Exception(f"Unknown dataset: {d}")
+        paths.append(path)
+    logger.info(f"Datapaths {paths}")
+    return paths
diff --git a/src/datasets/utils/video/__pycache__/functional.cpython-312.pyc b/src/datasets/utils/video/__pycache__/functional.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f0fe56e37c8a6711386a27d24d3708867073101
Binary files /dev/null and b/src/datasets/utils/video/__pycache__/functional.cpython-312.pyc differ
diff --git a/src/datasets/utils/video/__pycache__/randaugment.cpython-312.pyc b/src/datasets/utils/video/__pycache__/randaugment.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f50447461c0fe20680b9673acd0aa54008e429c
Binary files /dev/null and b/src/datasets/utils/video/__pycache__/randaugment.cpython-312.pyc differ
diff --git a/src/datasets/utils/video/__pycache__/transforms.cpython-312.pyc b/src/datasets/utils/video/__pycache__/transforms.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..345e76dbdd97094c38a4e54ba0a622e3da153d9d
Binary files /dev/null and b/src/datasets/utils/video/__pycache__/transforms.cpython-312.pyc differ
diff --git a/src/datasets/utils/video/__pycache__/volume_transforms.cpython-312.pyc b/src/datasets/utils/video/__pycache__/volume_transforms.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c68a72c44daefd3dec413ed6d8e0e4c2ae5ffa49
Binary files /dev/null and b/src/datasets/utils/video/__pycache__/volume_transforms.cpython-312.pyc differ
diff --git a/src/datasets/utils/video/functional.py b/src/datasets/utils/video/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c7358760dfc2dc8a5e3f1b233b0cf6583348329
--- /dev/null
+++ b/src/datasets/utils/video/functional.py
@@ -0,0 +1,110 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numbers
+
+import cv2
+import numpy as np
+import PIL
+import torch
+from torchvision.transforms import functional as tvf
+
+
+def _is_tensor_clip(clip):
+    """Return True when `clip` is a torch tensor with exactly four dimensions."""
+    if not torch.is_tensor(clip):
+        return False
+    return clip.ndimension() == 4
+
+
+def crop_clip(clip, min_h, min_w, h, w):
+    """Crop every frame of `clip` to the same `h` x `w` window.
+
+    Args:
+        clip: Sequence of frames; the type of ``clip[0]`` selects the code path
+            (numpy array / torch tensor, or PIL image).
+        min_h: Top row of the crop window.
+        min_w: Left column of the crop window.
+        h: Height of the crop window.
+        w: Width of the crop window.
+
+    Returns:
+        A list with one cropped frame per input frame.
+
+    Raises:
+        TypeError: If ``clip[0]`` is not an ndarray, tensor or PIL image.
+    """
+    first = clip[0]
+    if isinstance(first, (np.ndarray, torch.Tensor)):
+        if first.shape[-1] == 3:
+            # Last axis has size 3: frames are indexed as (H, W, C).
+            cropped = [img[min_h : min_h + h, min_w : min_w + w, :] for img in clip]
+        else:
+            # Otherwise frames must have 3 as their first axis and are indexed as (C, H, W).
+            assert first.shape[0] == 3
+            cropped = [img[:, min_h : min_h + h, min_w : min_w + w] for img in clip]
+
+    elif isinstance(first, PIL.Image.Image):
+        # PIL crop boxes are (left, upper, right, lower).
+        cropped = [img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip]
+
+    else:
+        raise TypeError(f"Expected numpy.ndarray or PIL.Image or torch.Tensor, but got list of {type(first)}")
+    return cropped
+
+
+def resize_clip(clip, size, interpolation="bilinear"):
+    """Resize every frame of `clip`.
+
+    Args:
+        clip: Sequence of frames; the type of ``clip[0]`` (numpy array, torch
+            tensor or PIL image) selects the backend used for resizing.
+        size: Either a number or an indexable pair.
+            * number: the shorter spatial side is scaled to `size` (see
+              `get_resize_sizes`); if it already equals `size` the input
+              `clip` object is returned unchanged.
+            * pair: NOTE(review) the pair is read as (width, height) for
+              arrays/tensors but is swapped for PIL images, i.e. read as
+              (height, width) there - confirm which order callers rely on.
+        interpolation: ``"bilinear"`` selects bilinear filtering; any other
+            value falls back to nearest-neighbour.
+
+    Returns:
+        A list of resized frames, or `clip` itself on the early-return path.
+
+    Raises:
+        TypeError: If ``clip[0]`` is not an ndarray, tensor or PIL image.
+    """
+    if isinstance(clip[0], np.ndarray) or isinstance(clip[0], torch.Tensor):
+        if isinstance(size, numbers.Number):
+            if clip[0].shape[-1] == 3:
+                im_h, im_w, im_c = clip[0].shape
+            else:
+                assert clip[0].shape[0] == 3
+                im_c, im_h, im_w = clip[0].shape
+            # Min spatial dim already matches minimal size
+            if (im_w <= im_h and im_w == size) or (im_h <= im_w and im_h == size):
+                return clip
+            new_h, new_w = get_resize_sizes(im_h, im_w, size)
+            size = (new_w, new_h)
+        else:
+            size = size[0], size[1]
+
+        if isinstance(clip[0], np.ndarray):
+            if interpolation == "bilinear":
+                np_inter = cv2.INTER_LINEAR
+            else:
+                np_inter = cv2.INTER_NEAREST
+            scaled = [cv2.resize(img, size, interpolation=np_inter) for img in clip]
+        else:  # isinstance(clip[0], torch.Tensor)
+            # NOTE(review): torchvision resizes the last two axes, so tensors
+            # that took the channels-last branch above may not resize as
+            # intended - confirm that tensor clips are always (C, H, W).
+            if interpolation == "bilinear":
+                np_inter = tvf.InterpolationMode.BILINEAR
+            else:
+                np_inter = tvf.InterpolationMode.NEAREST
+            size = (size[1], size[0])  # torchvision transformers expect the size in (h, w) order.
+            scaled = [tvf.resize(img, size, interpolation=np_inter) for img in clip]
+    elif isinstance(clip[0], PIL.Image.Image):
+        if isinstance(size, numbers.Number):
+            im_w, im_h = clip[0].size
+            # Min spatial dim already matches minimal size
+            if (im_w <= im_h and im_w == size) or (im_h <= im_w and im_h == size):
+                return clip
+            new_h, new_w = get_resize_sizes(im_h, im_w, size)
+            size = (new_w, new_h)
+        else:
+            size = size[1], size[0]
+        if interpolation == "bilinear":
+            pil_inter = PIL.Image.BILINEAR
+        else:
+            pil_inter = PIL.Image.NEAREST
+        scaled = [img.resize(size, pil_inter) for img in clip]
+    else:
+        # The two halves of the original message were concatenated without a
+        # separating space.
+        raise TypeError(f"Expected numpy.ndarray or PIL.Image or torch.Tensor, but got list of {type(clip[0])}")
+    return scaled
+
+
+def get_resize_sizes(im_h, im_w, size):
+    """Compute the ``(height, width)`` that scales the shorter side to `size`.
+
+    The longer side is scaled by the same ratio and truncated with ``int``.
+    When both sides are equal, the height is the side set to `size`.
+    """
+    if im_w >= im_h:
+        return size, int(size * im_w / im_h)
+    return int(size * im_h / im_w), size
+
+
+def normalize(clip, mean, std, inplace=False):
+    """Subtract `mean` and divide by `std` along the first axis of a tensor clip.
+
+    Args:
+        clip: A 4-D torch tensor (checked with `_is_tensor_clip`).
+        mean: Per-entry means for the first axis of `clip`.
+        std: Per-entry standard deviations for the first axis of `clip`.
+        inplace: When False (default) a clone is normalized and the input is
+            left untouched; when True `clip` itself is modified.
+
+    Returns:
+        The normalized tensor.
+
+    Raises:
+        TypeError: If `clip` is not a 4-D torch tensor.
+    """
+    if not _is_tensor_clip(clip):
+        raise TypeError("tensor is not a torch clip.")
+
+    out = clip if inplace else clip.clone()
+    mean_t = torch.as_tensor(mean, dtype=out.dtype, device=out.device)
+    std_t = torch.as_tensor(std, dtype=out.dtype, device=out.device)
+    # Three trailing singleton axes let the statistics broadcast over the
+    # remaining three dimensions of the clip.
+    out.sub_(mean_t[:, None, None, None])
+    out.div_(std_t[:, None, None, None])
+    return out
diff --git a/src/datasets/utils/video/randaugment.py b/src/datasets/utils/video/randaugment.py
new file mode 100644
index 0000000000000000000000000000000000000000..834af78f6cf3e545b1c9f649c0442b890498a699
--- /dev/null
+++ b/src/datasets/utils/video/randaugment.py
@@ -0,0 +1,536 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# Copyright 2020 Ross Wightman
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This implementation is based on
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py
+# published under an Apache License 2.0.
+
+# COMMENT FROM ORIGINAL:
+# AutoAugment, RandAugment, and AugMix for PyTorch
+# This code implements the searched ImageNet policies with various tweaks and
+# improvements and does not include any of the search code. AA and RA
+# Implementation adapted from:
+#     https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py
+# AugMix adapted from:
+#     https://github.com/google-research/augmix
+# Papers:
+#     AutoAugment: Learning Augmentation Policies from Data
+#     https://arxiv.org/abs/1805.09501
+#     Learning Data Augmentation Strategies for Object Detection
+#     https://arxiv.org/abs/1906.11172
+#     RandAugment: Practical automated data augmentation...
+#     https://arxiv.org/abs/1909.13719
+#     AugMix: A Simple Data Processing Method to Improve Robustness and
+#     Uncertainty https://arxiv.org/abs/1912.02781
+
+import math
+import random
+import re
+
+import numpy as np
+import PIL
+from PIL import Image, ImageEnhance, ImageOps
+
+# First two components of the installed Pillow version, used to gate
+# version-specific code paths below.
+_PIL_VER = tuple([int(x) for x in PIL.__version__.split(".")[:2]])
+
+# Default `fillcolor` passed to the geometric transforms.
+_FILL = (128, 128, 128)
+
+# This signifies the max integer that the controller RNN could predict for the
+# augmentation scheme.
+_MAX_LEVEL = 10.0
+
+# Hyper-parameters used when the caller supplies none.
+_HPARAMS_DEFAULT = {
+    "translate_const": 250,
+    "img_mean": _FILL,
+}
+
+# Default `resample` candidates; `_interpolation` picks one of them at random.
+_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)
+
+
+def _interpolation(kwargs):
+    """Pop ``"resample"`` from `kwargs` and resolve it to a single filter.
+
+    A list/tuple value is treated as a set of candidates and one is drawn with
+    ``random.choice``; any other value is returned as is. ``Image.BILINEAR``
+    is used when the key is absent.
+    """
+    resample = kwargs.pop("resample", Image.BILINEAR)
+    return random.choice(resample) if isinstance(resample, (list, tuple)) else resample
+
+
+def _check_args_tf(kwargs):
+    """Normalize transform keyword arguments in place before calling PIL.
+
+    Drops ``fillcolor`` on Pillow older than 5.0 and replaces ``resample`` with
+    the single filter chosen by `_interpolation`.
+    """
+    drop_fill = _PIL_VER < (5, 0) and "fillcolor" in kwargs
+    if drop_fill:
+        kwargs.pop("fillcolor")
+    kwargs["resample"] = _interpolation(kwargs)
+
+
+def shear_x(img, factor, **kwargs):
+    """Apply a PIL affine transform that offsets x by `factor` times y."""
+    _check_args_tf(kwargs)
+    coeffs = (1, factor, 0, 0, 1, 0)
+    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)
+
+
+def shear_y(img, factor, **kwargs):
+    """Apply a PIL affine transform that offsets y by `factor` times x."""
+    _check_args_tf(kwargs)
+    coeffs = (1, 0, 0, factor, 1, 0)
+    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)
+
+
+def translate_x_rel(img, pct, **kwargs):
+    """Translate along x by `pct` times ``img.size[0]`` via a PIL affine transform."""
+    shift = pct * img.size[0]
+    _check_args_tf(kwargs)
+    coeffs = (1, 0, shift, 0, 1, 0)
+    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)
+
+
+def translate_y_rel(img, pct, **kwargs):
+    """Translate along y by `pct` times ``img.size[1]`` via a PIL affine transform."""
+    shift = pct * img.size[1]
+    _check_args_tf(kwargs)
+    coeffs = (1, 0, 0, 0, 1, shift)
+    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)
+
+
+def translate_x_abs(img, pixels, **kwargs):
+    """Translate along x by `pixels` via a PIL affine transform."""
+    _check_args_tf(kwargs)
+    coeffs = (1, 0, pixels, 0, 1, 0)
+    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)
+
+
+def translate_y_abs(img, pixels, **kwargs):
+    """Translate along y by `pixels` via a PIL affine transform."""
+    _check_args_tf(kwargs)
+    coeffs = (1, 0, 0, 0, 1, pixels)
+    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)
+
+
+def rotate(img, degrees, **kwargs):
+    """Rotate `img` by `degrees`, choosing the code path by Pillow version.
+
+    Pillow >= 5.2 forwards all keyword arguments to ``img.rotate``; Pillow
+    5.0-5.1 builds an affine matrix by hand and uses ``img.transform``; older
+    versions call ``img.rotate`` with only the ``resample`` argument.
+    """
+    _check_args_tf(kwargs)
+    if _PIL_VER >= (5, 2):
+        return img.rotate(degrees, **kwargs)
+    elif _PIL_VER >= (5, 0):
+        w, h = img.size
+        post_trans = (0, 0)
+        rotn_center = (w / 2.0, h / 2.0)
+        angle = -math.radians(degrees)
+        # 2x3 affine coefficients (a, b, c, d, e, f); the translation terms c
+        # and f are filled in below.
+        matrix = [
+            round(math.cos(angle), 15),
+            round(math.sin(angle), 15),
+            0.0,
+            round(-math.sin(angle), 15),
+            round(math.cos(angle), 15),
+            0.0,
+        ]
+
+        def transform(x, y, matrix):
+            # Apply the affine coefficients to the point (x, y).
+            (a, b, c, d, e, f) = matrix
+            return a * x + b * y + c, d * x + e * y + f
+
+        # Map the negated centre through the matrix, then add the centre back,
+        # so that the rotation is taken about the image centre.
+        matrix[2], matrix[5] = transform(
+            -rotn_center[0] - post_trans[0],
+            -rotn_center[1] - post_trans[1],
+            matrix,
+        )
+        matrix[2] += rotn_center[0]
+        matrix[5] += rotn_center[1]
+        return img.transform(img.size, Image.AFFINE, matrix, **kwargs)
+    else:
+        return img.rotate(degrees, resample=kwargs["resample"])
+
+
+def auto_contrast(img, **__):
+    """Apply ``ImageOps.autocontrast`` to `img`; keyword arguments are ignored."""
+    result = ImageOps.autocontrast(img)
+    return result
+
+
+def invert(img, **__):
+    """Apply ``ImageOps.invert`` to `img`; keyword arguments are ignored."""
+    result = ImageOps.invert(img)
+    return result
+
+
+def equalize(img, **__):
+    """Apply ``ImageOps.equalize`` to `img`; keyword arguments are ignored."""
+    result = ImageOps.equalize(img)
+    return result
+
+
+def solarize(img, thresh, **__):
+    """Apply ``ImageOps.solarize`` with threshold `thresh`; extra kwargs are ignored."""
+    result = ImageOps.solarize(img, thresh)
+    return result
+
+
+def solarize_add(img, add, thresh=128, **__):
+    """Add `add` to every pixel value below `thresh`, clamping at 255.
+
+    Only images whose mode is "L" or "RGB" are modified; any other mode is
+    returned unchanged. Extra keyword arguments are ignored.
+    """
+    if img.mode not in ("L", "RGB"):
+        return img
+    lut = [min(255, value + add) if value < thresh else value for value in range(256)]
+    if img.mode == "RGB":
+        # The 256-entry table is repeated three times for RGB images.
+        lut = lut * 3
+    return img.point(lut)
+
+
+def posterize(img, bits_to_keep, **__):
+    """Posterize `img` to `bits_to_keep` bits; 8 or more bits returns `img` as is."""
+    return img if bits_to_keep >= 8 else ImageOps.posterize(img, bits_to_keep)
+
+
+def contrast(img, factor, **__):
+    """Enhance `img` with ``ImageEnhance.Contrast`` by `factor`; extra kwargs are ignored."""
+    enhancer = ImageEnhance.Contrast(img)
+    return enhancer.enhance(factor)
+
+
+def color(img, factor, **__):
+    """Enhance `img` with ``ImageEnhance.Color`` by `factor`; extra kwargs are ignored."""
+    enhancer = ImageEnhance.Color(img)
+    return enhancer.enhance(factor)
+
+
+def brightness(img, factor, **__):
+    """Enhance `img` with ``ImageEnhance.Brightness`` by `factor`; extra kwargs are ignored."""
+    enhancer = ImageEnhance.Brightness(img)
+    return enhancer.enhance(factor)
+
+
+def sharpness(img, factor, **__):
+    """Enhance `img` with ``ImageEnhance.Sharpness`` by `factor`; extra kwargs are ignored."""
+    enhancer = ImageEnhance.Sharpness(img)
+    return enhancer.enhance(factor)
+
+
+def _randomly_negate(v):
+    """Return `-v` when a single ``random.random()`` draw exceeds 0.5, else `v`."""
+    if random.random() > 0.5:
+        return -v
+    return v
+
+
+def _rotate_level_to_arg(level, _hparams):
+    """Scale `level` to a rotation angle in [-30, 30] with a random sign."""
+    angle = (level / _MAX_LEVEL) * 30.0
+    return (_randomly_negate(angle),)
+
+
+def _enhance_level_to_arg(level, _hparams):
+    """Scale `level` linearly to an enhancement factor in [0.1, 1.9]."""
+    fraction = level / _MAX_LEVEL
+    return (fraction * 1.8 + 0.1,)
+
+
+def _enhance_increasing_level_to_arg(level, _hparams):
+    """Return a factor in [0.1, 1.9] whose distance from 1.0 grows with `level`.
+
+    1.0 is the 'no change' factor; the offset (at most 0.9) gets a random sign,
+    so higher levels move further towards 0.1 or 1.9.
+    """
+    offset = (level / _MAX_LEVEL) * 0.9
+    return (1.0 + _randomly_negate(offset),)
+
+
+def _shear_level_to_arg(level, _hparams):
+    """Scale `level` to a shear factor in [-0.3, 0.3] with a random sign."""
+    shear = (level / _MAX_LEVEL) * 0.3
+    return (_randomly_negate(shear),)
+
+
+def _translate_abs_level_to_arg(level, hparams):
+    """Scale `level` to a translation of up to ``hparams["translate_const"]``, random sign."""
+    max_shift = float(hparams["translate_const"])
+    shift = (level / _MAX_LEVEL) * max_shift
+    return (_randomly_negate(shift),)
+
+
+def _translate_rel_level_to_arg(level, hparams):
+    """Scale `level` to a relative translation with a random sign.
+
+    The maximum fraction is ``hparams["translate_pct"]`` and falls back to
+    0.45, giving a default range of [-0.45, 0.45].
+    """
+    max_pct = hparams.get("translate_pct", 0.45)
+    pct = (level / _MAX_LEVEL) * max_pct
+    return (_randomly_negate(pct),)
+
+
+def _posterize_level_to_arg(level, _hparams):
+    """Map `level` to a bit count in [0, 4] ('keep 0 up to 4 MSB of original image').
+
+    As per the Tensorflow TPU EfficientNet implementation; the severity of the
+    augmentation decreases as `level` grows.
+    """
+    bits = int((level / _MAX_LEVEL) * 4)
+    return (bits,)
+
+
+def _posterize_increasing_level_to_arg(level, hparams):
+    """Map `level` to a bit count in [4, 0] ('keep 4 down to 0 MSB of original image').
+
+    As per the Tensorflow models research and UDA implementation; the severity
+    of the augmentation increases with `level`.
+    """
+    bits = _posterize_level_to_arg(level, hparams)[0]
+    return (4 - bits,)
+
+
+def _posterize_original_level_to_arg(level, _hparams):
+    """Map `level` to a bit count in [4, 8] ('keep 4 up to 8 MSB of image').
+
+    As per the original AutoAugment paper description; the severity of the
+    augmentation decreases as `level` grows.
+    """
+    bits = int((level / _MAX_LEVEL) * 4) + 4
+    return (bits,)
+
+
+def _solarize_level_to_arg(level, _hparams):
+    """Map `level` to a solarize threshold in [0, 256]; severity decreases with level."""
+    threshold = int((level / _MAX_LEVEL) * 256)
+    return (threshold,)
+
+
+def _solarize_increasing_level_to_arg(level, _hparams):
+    """Map `level` to a solarize threshold in [0, 256]; severity increases with level."""
+    threshold = _solarize_level_to_arg(level, _hparams)[0]
+    return (256 - threshold,)
+
+
+def _solarize_add_level_to_arg(level, _hparams):
+    """Map `level` to the amount added by `solarize_add`, in [0, 110]."""
+    amount = int((level / _MAX_LEVEL) * 110)
+    return (amount,)
+
+
+# Maps an op name to the function that converts a magnitude level into the
+# positional arguments of that op; None means the op takes no level argument.
+LEVEL_TO_ARG = {
+    "AutoContrast": None,
+    "Equalize": None,
+    "Invert": None,
+    "Rotate": _rotate_level_to_arg,
+    # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers
+    "Posterize": _posterize_level_to_arg,
+    "PosterizeIncreasing": _posterize_increasing_level_to_arg,
+    "PosterizeOriginal": _posterize_original_level_to_arg,
+    "Solarize": _solarize_level_to_arg,
+    "SolarizeIncreasing": _solarize_increasing_level_to_arg,
+    "SolarizeAdd": _solarize_add_level_to_arg,
+    "Color": _enhance_level_to_arg,
+    "ColorIncreasing": _enhance_increasing_level_to_arg,
+    "Contrast": _enhance_level_to_arg,
+    "ContrastIncreasing": _enhance_increasing_level_to_arg,
+    "Brightness": _enhance_level_to_arg,
+    "BrightnessIncreasing": _enhance_increasing_level_to_arg,
+    "Sharpness": _enhance_level_to_arg,
+    "SharpnessIncreasing": _enhance_increasing_level_to_arg,
+    "ShearX": _shear_level_to_arg,
+    "ShearY": _shear_level_to_arg,
+    "TranslateX": _translate_abs_level_to_arg,
+    "TranslateY": _translate_abs_level_to_arg,
+    "TranslateXRel": _translate_rel_level_to_arg,
+    "TranslateYRel": _translate_rel_level_to_arg,
+}
+
+
+# Maps an op name to the image function that implements it. The "...Increasing"
+# and "...Original" variants share the implementation of their base op and
+# differ only in the level mapping above.
+NAME_TO_OP = {
+    "AutoContrast": auto_contrast,
+    "Equalize": equalize,
+    "Invert": invert,
+    "Rotate": rotate,
+    "Posterize": posterize,
+    "PosterizeIncreasing": posterize,
+    "PosterizeOriginal": posterize,
+    "Solarize": solarize,
+    "SolarizeIncreasing": solarize,
+    "SolarizeAdd": solarize_add,
+    "Color": color,
+    "ColorIncreasing": color,
+    "Contrast": contrast,
+    "ContrastIncreasing": contrast,
+    "Brightness": brightness,
+    "BrightnessIncreasing": brightness,
+    "Sharpness": sharpness,
+    "SharpnessIncreasing": sharpness,
+    "ShearX": shear_x,
+    "ShearY": shear_y,
+    "TranslateX": translate_x_abs,
+    "TranslateY": translate_y_abs,
+    "TranslateXRel": translate_x_rel,
+    "TranslateYRel": translate_y_rel,
+}
+
+
+class AugmentOp:
+    """A single named augmentation applied with a probability and a magnitude.
+
+    Instances accept either one image or a list of images (e.g. video frames);
+    for a list, the same level arguments are used for every element.
+    """
+
+    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
+        """Look up the op and its level mapping by `name` and store the settings.
+
+        Args:
+            name: Key into `NAME_TO_OP` / `LEVEL_TO_ARG`.
+            prob: Probability of applying the op on a call.
+            magnitude: Level passed to the level mapping (clipped on each call).
+            hparams: Optional settings; ``img_mean``, ``interpolation`` and
+                ``magnitude_std`` are read here. `_HPARAMS_DEFAULT` is used
+                when this is None or empty.
+        """
+        hparams = hparams or _HPARAMS_DEFAULT
+        self.aug_fn = NAME_TO_OP[name]
+        self.level_fn = LEVEL_TO_ARG[name]
+        self.prob = prob
+        self.magnitude = magnitude
+        self.hparams = hparams.copy()
+        fill = hparams["img_mean"] if "img_mean" in hparams else _FILL
+        resample = hparams["interpolation"] if "interpolation" in hparams else _RANDOM_INTERPOLATION
+        self.kwargs = {"fillcolor": fill, "resample": resample}
+
+        # If magnitude_std is > 0, we introduce some randomness
+        # in the usually fixed policy and sample magnitude from a normal distribution
+        # with mean `magnitude` and std-dev of `magnitude_std`.
+        # NOTE This is my own hack, being tested, not in papers or reference impls.
+        self.magnitude_std = self.hparams.get("magnitude_std", 0)
+
+    def __call__(self, img_list):
+        """Apply the op to `img_list` (an image or a list of images), or skip it."""
+        skip = self.prob < 1.0 and random.random() > self.prob
+        if skip:
+            return img_list
+
+        level = self.magnitude
+        if self.magnitude_std and self.magnitude_std > 0:
+            level = random.gauss(level, self.magnitude_std)
+        level = min(_MAX_LEVEL, max(0, level))  # clip to valid range
+        args = () if self.level_fn is None else self.level_fn(level, self.hparams)
+
+        if not isinstance(img_list, list):
+            return self.aug_fn(img_list, *args, **self.kwargs)
+        return [self.aug_fn(frame, *args, **self.kwargs) for frame in img_list]
+
+
+# Default op names used by `rand_augment_ops` and `_select_rand_weights`.
+_RAND_TRANSFORMS = [
+    "AutoContrast",
+    "Equalize",
+    "Invert",
+    "Rotate",
+    "Posterize",
+    "Solarize",
+    "SolarizeAdd",
+    "Color",
+    "Contrast",
+    "Brightness",
+    "Sharpness",
+    "ShearX",
+    "ShearY",
+    "TranslateXRel",
+    "TranslateYRel",
+]
+
+
+# Same list with the "...Increasing" variants substituted where one exists;
+# selected by the 'inc' section of the RandAugment config string.
+_RAND_INCREASING_TRANSFORMS = [
+    "AutoContrast",
+    "Equalize",
+    "Invert",
+    "Rotate",
+    "PosterizeIncreasing",
+    "SolarizeIncreasing",
+    "SolarizeAdd",
+    "ColorIncreasing",
+    "ContrastIncreasing",
+    "BrightnessIncreasing",
+    "SharpnessIncreasing",
+    "ShearX",
+    "ShearY",
+    "TranslateXRel",
+    "TranslateYRel",
+]
+
+
+# These experimental weights are based loosely on the relative improvements mentioned in paper.
+# They may not result in increased performance, but could likely be tuned to so.
+# Weight set 0 (selected with weight index 0); keyed by the names in `_RAND_TRANSFORMS`.
+_RAND_CHOICE_WEIGHTS_0 = {
+    "Rotate": 0.3,
+    "ShearX": 0.2,
+    "ShearY": 0.2,
+    "TranslateXRel": 0.1,
+    "TranslateYRel": 0.1,
+    "Color": 0.025,
+    "Sharpness": 0.025,
+    "AutoContrast": 0.025,
+    "Solarize": 0.005,
+    "SolarizeAdd": 0.005,
+    "Contrast": 0.005,
+    "Brightness": 0.005,
+    "Equalize": 0.005,
+    "Posterize": 0,
+    "Invert": 0,
+}
+
+# Weight set 1 (selected with weight index 1); the rotate/shear/translate
+# entries are all zero in this set.
+_RAND_CHOICE_WEIGHTS_1 = {
+    "Rotate": 0.0,
+    "ShearX": 0.0,
+    "ShearY": 0.0,
+    "TranslateXRel": 0.0,
+    "TranslateYRel": 0.0,
+    "Color": 0.25,
+    "Sharpness": 0.25,
+    "AutoContrast": 0.25,
+    "Solarize": 0.05,
+    "SolarizeAdd": 0.05,
+    "Contrast": 0.05,
+    "Brightness": 0.05,
+    "Equalize": 0.05,
+    "Posterize": 0,
+    "Invert": 0,
+}
+
+
+def _select_rand_weights(weight_idx=0, transforms=None):
+    """Return the normalized choice probabilities for `transforms`.
+
+    Args:
+        weight_idx: 0 or 1, selecting `_RAND_CHOICE_WEIGHTS_0` or
+            `_RAND_CHOICE_WEIGHTS_1`.
+        transforms: Op names to look up; defaults to `_RAND_TRANSFORMS`.
+
+    Returns:
+        The per-op weights divided by their sum, in the order of `transforms`.
+    """
+    names = transforms or _RAND_TRANSFORMS
+    assert weight_idx == 0 or weight_idx == 1  # only two sets of weights currently
+    table = _RAND_CHOICE_WEIGHTS_0 if weight_idx == 0 else _RAND_CHOICE_WEIGHTS_1
+    weights = [table[name] for name in names]
+    # Dividing the list by a NumPy scalar yields a NumPy array of probabilities.
+    return weights / np.sum(weights)
+
+
+def rand_augment_ops(magnitude=10, hparams=None, transforms=None):
+    """Build one `AugmentOp` with probability 0.5 for each transform name.
+
+    `hparams` falls back to `_HPARAMS_DEFAULT` and `transforms` to
+    `_RAND_TRANSFORMS` when they are None or empty.
+    """
+    hparams = hparams or _HPARAMS_DEFAULT
+    names = transforms or _RAND_TRANSFORMS
+    ops = []
+    for name in names:
+        ops.append(AugmentOp(name, prob=0.5, magnitude=magnitude, hparams=hparams))
+    return ops
+
+
+class RandAugment:
+    """Apply `num_layers` ops drawn at random from `ops` to an input, in sequence."""
+
+    def __init__(self, ops, num_layers=2, choice_weights=None):
+        """Store the candidate ops, the number of ops per call and optional weights."""
+        self.ops = ops
+        self.num_layers = num_layers
+        self.choice_weights = choice_weights
+
+    def __call__(self, img):
+        """Draw the ops with ``np.random.choice`` and chain them over `img`."""
+        weights = self.choice_weights
+        # no replacement when using weighted choice
+        chosen = np.random.choice(self.ops, self.num_layers, replace=weights is None, p=weights)
+        for op in chosen:
+            img = op(img)
+        return img
+
+
+def rand_augment_transform(config_str, hparams):
+    """
+    RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719
+
+    Create a RandAugment transform
+    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
+    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
+    sections, not order specific, determine
+        'm' - integer magnitude of rand augment
+        'n' - integer num layers (number of transform ops selected per image)
+        'w' - integer probability weight index (index of a set of weights to influence choice of op)
+        'mstd' -  float std deviation of magnitude noise applied
+        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
+    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
+    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2
+    :param hparams: Other hparams (kwargs) for the RandAugmentation scheme. An 'mstd' section is written into this
+    dict as "magnitude_std" unless the key is already present. None is treated as an empty dict.
+    :return: A PyTorch compatible Transform
+    :raises NotImplementedError: if a section with a numeric value has an unknown key
+    """
+    if hparams is None:
+        hparams = {}
+    magnitude = _MAX_LEVEL  # default to _MAX_LEVEL for magnitude (currently 10)
+    num_layers = 2  # default to 2 ops per image
+    weight_idx = None  # default to no probability weights for op choice
+    transforms = _RAND_TRANSFORMS
+    config = config_str.split("-")
+    assert config[0] == "rand"
+    for c in config[1:]:
+        # Split each section into its alphabetic key and the value that starts
+        # at the first digit; sections without a digit are skipped.
+        cs = re.split(r"(\d.*)", c)
+        if len(cs) < 2:
+            continue
+        key, val = cs[:2]
+        if key == "mstd":
+            # noise param injected via hparams for now
+            hparams.setdefault("magnitude_std", float(val))
+        elif key == "inc":
+            # `val` is a string and any non-empty string is truthy, so it has
+            # to be parsed as an integer for 'inc0' to mean "off".
+            if int(val):
+                transforms = _RAND_INCREASING_TRANSFORMS
+        elif key == "m":
+            magnitude = int(val)
+        elif key == "n":
+            num_layers = int(val)
+        elif key == "w":
+            weight_idx = int(val)
+        else:
+            # `assert NotImplementedError` always passed; raise so that an
+            # unknown section is reported instead of silently ignored.
+            raise NotImplementedError(f"Unknown RandAugment config section: {c!r}")
+    ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms)
+    choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx)
+    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)
diff --git a/src/datasets/utils/video/randerase.py b/src/datasets/utils/video/randerase.py
new file mode 100644
index 0000000000000000000000000000000000000000..13226caff18b54f2f2969eb9d98b0866b5d7026e
--- /dev/null
+++ b/src/datasets/utils/video/randerase.py
@@ -0,0 +1,170 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# Copyright 2020 Ross Wightman
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This implementation is based on
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/random_erasing.py
+# published under an Apache License 2.0.
+
+
+import math
+import random
+
+import torch
+
+
+def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device="cuda"):
+    """Create the fill values for an erased patch.
+
+    Args:
+        per_pixel: If True, return normally distributed values of shape `patch_size`.
+        rand_color: If True (and `per_pixel` is False), return normally
+            distributed values of shape ``(patch_size[0], 1, 1)``.
+        patch_size: Shape of the patch; only ``patch_size[0]`` is used unless
+            `per_pixel` is set.
+        dtype: dtype of the returned tensor.
+        device: Device of the returned tensor.
+
+    Returns:
+        A tensor of fill values; zeros of shape ``(patch_size[0], 1, 1)`` when
+        both flags are False.
+    """
+    # NOTE I've seen CUDA illegal memory access errors being caused by the normal_()
+    # paths, flip the order so normal is run on CPU if this becomes a problem
+    # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508
+    if not per_pixel and not rand_color:
+        return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device)
+    shape = patch_size if per_pixel else (patch_size[0], 1, 1)
+    return torch.empty(shape, dtype=dtype, device=device).normal_()
+
+
+class RandomErasing:
+    """Randomly selects a rectangle region in an image and erases its pixels.
+
+    'Random Erasing Data Augmentation' by Zhong et al.
+    See https://arxiv.org/pdf/1708.04896.pdf
+    This variant of RandomErasing is intended to be applied to either a batch
+    or single image tensor after it has been normalized by dataset mean and std.
+
+    Args:
+        probability: Probability that the Random Erasing operation will be performed.
+        min_area: Minimum percentage of erased area wrt input image area.
+        max_area: Maximum percentage of erased area wrt input image area.
+        min_aspect: Minimum aspect ratio of erased area.
+        max_aspect: Maximum aspect ratio of erased area; defaults to ``1 / min_aspect``.
+        mode: pixel color mode, one of 'const', 'rand', or 'pixel'
+            'const' - erase block is constant color of 0 for all channels
+            'rand'  - erase block is same per-channel random (normal) color
+            'pixel' - erase block is per-pixel random (normal) color
+        min_count: minimum number of erasing blocks per image.
+        max_count: maximum number of erasing blocks per image, area per box is scaled by count.
+            per-image count is randomly chosen between `min_count` and this value;
+            defaults to `min_count`.
+        num_splits: when greater than 1, the first ``batch_size // num_splits``
+            entries of a batch are left untouched.
+        device: device on which the fill values are created.
+        cube: for 4-D input, erase the same box in every processed entry of the
+            batch (True) or sample a box per entry (False).
+    """
+
+    def __init__(
+        self,
+        probability=0.5,
+        min_area=0.02,
+        max_area=1 / 3,
+        min_aspect=0.3,
+        max_aspect=None,
+        mode="const",
+        min_count=1,
+        max_count=None,
+        num_splits=0,
+        device="cuda",
+        cube=True,
+    ):
+        self.probability = probability
+        self.min_area = min_area
+        self.max_area = max_area
+        max_aspect = max_aspect or 1 / min_aspect
+        self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))
+        self.min_count = min_count
+        self.max_count = max_count or min_count
+        self.num_splits = num_splits
+        self.cube = cube
+        self.device = device
+
+        mode = mode.lower()
+        self.rand_color = mode == "rand"  # per block random normal
+        self.per_pixel = mode == "pixel"  # per pixel random normal
+        if not (self.rand_color or self.per_pixel):
+            assert not mode or mode == "const"
+
+    def _sample_count(self):
+        """Return the number of boxes to erase for one call."""
+        if self.min_count == self.max_count:
+            return self.min_count
+        return random.randint(self.min_count, self.max_count)
+
+    def _sample_box(self, img_h, img_w, area, count, attempts):
+        """Try up to `attempts` times to draw a box that fits inside the image.
+
+        Returns ``(top, left, h, w)`` for the first fitting candidate, or None
+        when every attempt produced a box at least as large as the image.
+        """
+        for _ in range(attempts):
+            target_area = random.uniform(self.min_area, self.max_area) * area / count
+            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
+            h = int(round(math.sqrt(target_area * aspect_ratio)))
+            w = int(round(math.sqrt(target_area / aspect_ratio)))
+            if w < img_w and h < img_h:
+                top = random.randint(0, img_h - h)
+                left = random.randint(0, img_w - w)
+                return top, left, h, w
+        return None
+
+    def _fill(self, chan, h, w, dtype):
+        """Create the fill values for one box according to the configured mode."""
+        return _get_pixels(
+            self.per_pixel,
+            self.rand_color,
+            (chan, h, w),
+            dtype=dtype,
+            device=self.device,
+        )
+
+    def _erase(self, img, chan, img_h, img_w, dtype):
+        """Erase boxes in a single 3-D image tensor, in place."""
+        if random.random() > self.probability:
+            return
+        area = img_h * img_w
+        count = self._sample_count()
+        for _ in range(count):
+            box = self._sample_box(img_h, img_w, area, count, attempts=10)
+            if box is None:
+                continue
+            top, left, h, w = box
+            img[:, top : top + h, left : left + w] = self._fill(chan, h, w, dtype)
+
+    def _erase_cube(
+        self,
+        img,
+        batch_start,
+        batch_size,
+        chan,
+        img_h,
+        img_w,
+        dtype,
+    ):
+        """Erase the same boxes in entries ``batch_start..batch_size-1``, in place."""
+        if random.random() > self.probability:
+            return
+        area = img_h * img_w
+        count = self._sample_count()
+        for _ in range(count):
+            box = self._sample_box(img_h, img_w, area, count, attempts=100)
+            if box is None:
+                continue
+            top, left, h, w = box
+            # The box location is shared; fill values are generated per entry.
+            for i in range(batch_start, batch_size):
+                img_instance = img[i]
+                img_instance[:, top : top + h, left : left + w] = self._fill(chan, h, w, dtype)
+
+    def __call__(self, input):
+        """Erase in place and return `input` (a 3-D image or a 4-D batch tensor)."""
+        if len(input.size()) == 3:
+            self._erase(input, *input.size(), input.dtype)
+            return input
+
+        batch_size, chan, img_h, img_w = input.size()
+        # skip first slice of batch if num_splits is set (for clean portion of samples)
+        batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0
+        if self.cube:
+            self._erase_cube(
+                input,
+                batch_start,
+                batch_size,
+                chan,
+                img_h,
+                img_w,
+                input.dtype,
+            )
+        else:
+            for i in range(batch_start, batch_size):
+                self._erase(input[i], chan, img_h, img_w, input.dtype)
+        return input
diff --git a/src/datasets/utils/video/transforms.py b/src/datasets/utils/video/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..b92fb517a7d141ba98f97496a03799b65ccf77cd
--- /dev/null
+++ b/src/datasets/utils/video/transforms.py
@@ -0,0 +1,1161 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import numbers
+import random
+
+import numpy as np
+import PIL
+import torch
+import torchvision
+import torchvision.transforms.functional as F
+from PIL import Image
+from torch import Tensor
+from torchvision import transforms
+
+import src.datasets.utils.video.functional as FF
+from src.datasets.utils.video.randaugment import rand_augment_transform
+
+_pil_interpolation_to_str = {  # display names used by RandomResizedCropAndInterpolation.__repr__
+    Image.NEAREST: "PIL.Image.NEAREST",
+    Image.BILINEAR: "PIL.Image.BILINEAR",
+    Image.BICUBIC: "PIL.Image.BICUBIC",
+    Image.LANCZOS: "PIL.Image.LANCZOS",
+    Image.HAMMING: "PIL.Image.HAMMING",
+    Image.BOX: "PIL.Image.BOX",
+}
+
+
+_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)  # modes drawn from when interpolation == "random"
+PAD_FRAME_METHODS = ["circulant"]  # method names accepted by frame_pad
+
+
+def _pil_interp(method):
+    """Map an interpolation name to its PIL constant; any other value means bilinear."""
+    named = (
+        ("bicubic", Image.BICUBIC),
+        ("lanczos", Image.LANCZOS),
+        ("hamming", Image.HAMMING),
+    )
+    matches = [const for name, const in named if method == name]
+    return matches[0] if matches else Image.BILINEAR
+
+
+def random_short_side_scale_jitter(images, min_size, max_size, boxes=None, inverse_uniform_sampling=False):
+    """
+    Rescale the images so that their short side takes a randomly sampled size,
+    and scale the corresponding boxes by the same factor.
+    Args:
+        images (tensor): images to perform scale jitter. Dimension is
+            `num frames` x `channel` x `height` x `width`.
+        min_size (int): the minimal size to scale the frames.
+        max_size (int): the maximal size to scale the frames.
+        boxes (ndarray): optional. Corresponding boxes to images.
+            Dimension is `num boxes` x 4.
+        inverse_uniform_sampling (bool): if True, sample uniformly in
+            [1 / max_size, 1 / min_size] and take a reciprocal to get the
+            size. If False, take a uniform sample from [min_size, max_size].
+    Returns:
+        (tensor): the scaled images with dimension of
+            `num frames` x `channel` x `new height` x `new width`.
+        (ndarray or None): the scaled boxes with dimension of
+            `num boxes` x 4.
+    """
+    if inverse_uniform_sampling:
+        # sample uniformly in the reciprocal domain, then invert back to a size
+        sampled = 1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size)
+    else:
+        sampled = np.random.uniform(min_size, max_size)
+    size = int(round(sampled))
+
+    height, width = images.shape[2], images.shape[3]
+    if min(height, width) == size:
+        # the short side already has the sampled size: inputs are returned as they are
+        return images, boxes
+
+    # the short side becomes `size`; the long side follows the aspect ratio (floored)
+    if width < height:
+        new_width = size
+        new_height = int(math.floor((float(height) / width) * size))
+        scaled, unscaled = new_height, height
+    else:
+        new_height = size
+        new_width = int(math.floor((float(width) / height) * size))
+        scaled, unscaled = new_width, width
+    # boxes follow the scale factor measured on the long side
+    if boxes is not None:
+        boxes = boxes * float(scaled) / unscaled
+
+    resized = torch.nn.functional.interpolate(
+        images, size=(new_height, new_width), mode="bilinear", align_corners=False
+    )
+    return resized, boxes
+
+
+def crop_boxes(boxes, x_offset, y_offset):
+    """
+    Shift bounding boxes into the coordinate frame of a crop.
+    Args:
+        boxes (ndarray): bounding boxes to shift. The dimension
+            is `num boxes` x 4; the input array is left unmodified.
+        x_offset (int): cropping offset in the x axis.
+        y_offset (int): cropping offset in the y axis.
+    Returns:
+        cropped_boxes (ndarray): the shifted boxes with dimension of
+            `num boxes` x 4.
+    """
+    shifted = boxes.copy()
+    x_cols, y_cols = [0, 2], [1, 3]
+    shifted[:, x_cols] = boxes[:, x_cols] - x_offset
+    shifted[:, y_cols] = boxes[:, y_cols] - y_offset
+    return shifted
+
+
+def random_crop(images, size, boxes=None):
+    """
+    Perform random spatial crop on the given images and corresponding boxes.
+    Args:
+        images (tensor): images to perform random crop. The dimension is
+            `num frames` x `channel` x `height` x `width`.
+        size (int): the size of height and width to crop on the image.
+        boxes (ndarray or None): optional. Corresponding boxes to images.
+            Dimension is `num boxes` x 4.
+    Returns:
+        cropped (tensor): cropped images with dimension of
+            `num frames` x `channel` x `size` x `size`.
+        cropped_boxes (ndarray or None): the cropped boxes with dimension of
+            `num boxes` x 4.
+    """
+    height, width = images.shape[2], images.shape[3]
+    if height == size and width == size:
+        # same (images, boxes) pair as the cropping path, so callers can always unpack
+        return images, boxes
+    y_offset = 0
+    if height > size:
+        y_offset = int(np.random.randint(0, height - size))
+    x_offset = 0
+    if width > size:
+        x_offset = int(np.random.randint(0, width - size))
+    cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
+
+    cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
+
+    return cropped, cropped_boxes
+
+
+def horizontal_flip(prob, images, boxes=None):
+    """
+    Perform horizontal flip on the given images and corresponding boxes.
+    Args:
+        prob (float): probability to flip the images.
+        images (tensor): images to perform horizontal flip, the dimension is
+            `num frames` x `channel` x `height` x `width`.
+        boxes (ndarray or None): optional. Corresponding boxes to images.
+            Dimension is `num boxes` x 4.
+    Returns:
+        images (tensor): images with dimension of
+            `num frames` x `channel` x `height` x `width`.
+        flipped_boxes (ndarray or None): the flipped boxes with dimension of
+            `num boxes` x 4.
+    """
+    flipped_boxes = None if boxes is None else boxes.copy()
+
+    if not (np.random.uniform() < prob):
+        # no flip drawn: images come back untouched, boxes as a copy
+        return images, flipped_boxes
+
+    images = images.flip((-1))
+    rank = len(images.shape)
+    if rank not in (3, 4):
+        raise NotImplementedError("Dimension does not supported")
+    # width is the last axis for both supported ranks
+    width = images.shape[rank - 1]
+
+    if boxes is not None:
+        # mirror the x coordinates; columns 0 and 2 trade places
+        flipped_boxes[:, [0, 2]] = width - boxes[:, [2, 0]] - 1
+
+    return images, flipped_boxes
+
+
+def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
+    """
+    Perform uniform spatial sampling on the images and corresponding boxes.
+    Args:
+        images (tensor): images to perform uniform crop. The dimension is
+            `num frames` x `channel` x `height` x `width`.
+        size (int): size of height and width to crop the images.
+        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
+            is larger than height. Or 0, 1, or 2 for top, center, and bottom
+            crop if height is larger than width.
+        boxes (ndarray or None): optional. Corresponding boxes to images.
+            Dimension is `num boxes` x 4.
+        scale_size (int): optional. If not None, resize the images to scale_size before
+            performing any crop.
+    Returns:
+        cropped (tensor): images with dimension of
+            `num frames` x `channel` x `size` x `size`.
+        cropped_boxes (ndarray or None): the cropped boxes with dimension of
+            `num boxes` x 4.
+    """
+    assert spatial_idx in [0, 1, 2]
+    was_single_image = len(images.shape) == 3
+    if was_single_image:
+        # add a leading axis so the 4-D indexing below applies
+        images = images.unsqueeze(0)
+    height, width = images.shape[2], images.shape[3]
+
+    if scale_size is not None:
+        # the short side becomes scale_size, the other side follows the aspect ratio
+        if width <= height:
+            width, height = scale_size, int(height / width * scale_size)
+        else:
+            width, height = int(width / height * scale_size), scale_size
+        images = torch.nn.functional.interpolate(
+            images, size=(height, width), mode="bilinear", align_corners=False
+        )
+
+    # centred on both axes unless spatial_idx says otherwise
+    y_offset = int(math.ceil((height - size) / 2))
+    x_offset = int(math.ceil((width - size) / 2))
+
+    # the window moves along the height when taller than wide, otherwise along the width
+    if height > width:
+        if spatial_idx == 0:
+            y_offset = 0
+        elif spatial_idx == 2:
+            y_offset = height - size
+    elif spatial_idx == 0:
+        x_offset = 0
+    elif spatial_idx == 2:
+        x_offset = width - size
+
+    cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
+    cropped_boxes = None if boxes is None else crop_boxes(boxes, x_offset, y_offset)
+    if was_single_image:
+        cropped = cropped.squeeze(0)
+    return cropped, cropped_boxes
+
+
+def clip_boxes_to_image(boxes, height, width):
+    """
+    Clamp box coordinates so that they lie inside an image of the given size.
+    Args:
+        boxes (ndarray): bounding boxes to perform clipping.
+            Dimension is `num boxes` x 4.
+        height (int): given image height.
+        width (int): given image width.
+    Returns:
+        clipped_boxes (ndarray): the clipped boxes with dimension of
+            `num boxes` x 4; x is limited to [0, width - 1], y to [0, height - 1].
+    """
+    clipped_boxes = boxes.copy()
+    clipped_boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0.0, width - 1.0)
+    clipped_boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0.0, height - 1.0)
+    return clipped_boxes
+
+
+def blend(images1, images2, alpha):
+    """
+    Linearly mix two sets of images: `alpha` weights the first, `1 - alpha` the second.
+    Args:
+        images1 (tensor): the first images, `num frames` x `channel` x `height` x `width`.
+        images2 (tensor): the second images, same dimension as `images1`.
+        alpha (float): the blending weight applied to `images1`.
+    Returns:
+        (tensor): blended images, `num frames` x `channel` x `height` x `width`.
+    """
+    # alpha = 1 keeps images1 only, alpha = 0 keeps images2 only
+    weighted_first = images1 * alpha
+    weighted_second = images2 * (1 - alpha)
+    return weighted_first + weighted_second
+
+
+def grayscale(images):
+    """
+    Get the grayscale for the input images. The channels of images should be
+    in order BGR.
+    Args:
+        images (tensor): the input images for getting grayscale. Dimension is
+            `num frames` x `channel` x `height` x `width`.
+    Returns:
+        img_gray (tensor): grayscale images (a detached copy), the dimension is
+            `num frames` x `channel` x `height` x `width`.
+    """
+    # R -> 0.299, G -> 0.587, B -> 0.114.
+    gray_channel = 0.299 * images[:, 2] + 0.587 * images[:, 1] + 0.114 * images[:, 0]
+    # torch.tensor(<tensor>) warns on copy-construction; clone().detach() makes the same copy
+    img_gray = images.clone().detach()
+    for channel in range(3):
+        img_gray[:, channel] = gray_channel
+    return img_gray
+
+
+def color_jitter(images, img_brightness=0, img_contrast=0, img_saturation=0):
+    """
+    Perform a color jittering on the input images. The channels of images
+    should be in order BGR.
+    Args:
+        images (tensor): images to perform color jitter. Dimension is
+            `num frames` x `channel` x `height` x `width`.
+        img_brightness (float): jitter ratio for brightness (0 disables it).
+        img_contrast (float): jitter ratio for contrast (0 disables it).
+        img_saturation (float): jitter ratio for saturation (0 disables it).
+    Returns:
+        images (tensor): the jittered images, the dimension is
+            `num frames` x `channel` x `height` x `width`.
+    """
+
+    candidates = (
+        ("brightness", img_brightness),
+        ("contrast", img_contrast),
+        ("saturation", img_saturation),
+    )
+    jitter = [name for name, ratio in candidates if ratio != 0]
+    if not jitter:
+        return images
+    # every active jitter runs exactly once, in a freshly shuffled order
+    for position in np.random.permutation(np.arange(len(jitter))):
+        selected = jitter[position]
+        if selected == "brightness":
+            images = brightness_jitter(img_brightness, images)
+        elif selected == "contrast":
+            images = contrast_jitter(img_contrast, images)
+        elif selected == "saturation":
+            images = saturation_jitter(img_saturation, images)
+    return images
+
+
+def brightness_jitter(var, images):
+    """
+    Perform brightness jittering on the input images. The channels of images
+    should be in order BGR.
+    Args:
+        var (float): jitter ratio for brightness.
+        images (tensor): images to perform color jitter. Dimension is
+            `num frames` x `channel` x `height` x `width`.
+    Returns:
+        images (tensor): the jittered images, the dimension is
+            `num frames` x `channel` x `height` x `width`.
+    """
+    alpha = 1.0 + np.random.uniform(-var, var)
+
+    # the all-black image must sit on the device of `images`; a bare torch.zeros is always on CPU
+    img_bright = torch.zeros(images.shape, device=images.device)
+    return blend(images, img_bright, alpha)
+
+
+def contrast_jitter(var, images):
+    """
+    Perform contrast jittering on the input images. The channels of images
+    should be in order BGR.
+    Args:
+        var (float): jitter ratio for contrast.
+        images (tensor): images to perform color jitter. Dimension is
+            `num frames` x `channel` x `height` x `width`.
+    Returns:
+        images (tensor): the jittered images, the dimension is
+            `num frames` x `channel` x `height` x `width`.
+    """
+    alpha = 1.0 + np.random.uniform(-var, var)
+    flat_gray = grayscale(images)
+    # every entry along dim 0 is replaced by its own mean gray level
+    mean_gray = torch.mean(flat_gray, dim=(1, 2, 3), keepdim=True)
+    flat_gray[:] = mean_gray
+    return blend(images, flat_gray, alpha)
+
+
+def saturation_jitter(var, images):
+    """
+    Perform saturation jittering on the input images. The channels of images
+    should be in order BGR.
+    Args:
+        var (float): jitter ratio for saturation.
+        images (tensor): images to perform color jitter. Dimension is
+            `num frames` x `channel` x `height` x `width`.
+    Returns:
+        images (tensor): the jittered images, the dimension is
+            `num frames` x `channel` x `height` x `width`.
+    """
+    # alpha < 1 pulls the images towards their gray version, alpha > 1 pushes them away
+    alpha = 1.0 + np.random.uniform(-var, var)
+    gray_version = grayscale(images)
+    blended = blend(images, gray_version, alpha)
+    return blended
+
+
+def lighting_jitter(images, alphastd, eigval, eigvec):
+    """
+    Perform AlexNet-style PCA jitter on the given images.
+    Args:
+        images (tensor): images to perform lighting jitter. Dimension is
+            `num frames` x `channel` x `height` x `width`.
+        alphastd (float): jitter ratio for PCA jitter (0 returns the input as is).
+        eigval (list): eigenvalues for PCA jitter.
+        eigvec (list[list]): eigenvectors for PCA jitter.
+    Returns:
+        out_images (tensor): the jittered images, the dimension is
+            `num frames` x `channel` x `height` x `width`.
+    """
+    if alphastd == 0:
+        return images
+    # one random weight per principal component (alpha1, alpha2, alpha3)
+    alpha = np.random.normal(0, alphastd, size=(1, 3))
+    eig_vec = np.array(eigvec)
+    eig_val = np.reshape(eigval, (1, 3))
+    # alpha and eig_val are tiled to three rows, multiplied element-wise, then summed per row
+    weighted = (
+        eig_vec * np.repeat(alpha, 3, axis=0) * np.repeat(eig_val, 3, axis=0)
+    )
+    rgb = np.sum(weighted, axis=1)
+
+    # the result goes into a fresh tensor; `images` itself is not written to
+    out_images = torch.zeros_like(images)
+    rank = len(images.shape)
+    if rank == 3:
+        channel_dim = 0  # C H W
+    elif rank == 4:
+        channel_dim = 1  # T C H W
+    else:
+        raise NotImplementedError(f"Unsupported dimension {len(images.shape)}")
+
+    for idx in range(images.shape[channel_dim]):
+        # offsets are read in reverse order: channel idx receives rgb[2 - idx]
+        offset = rgb[2 - idx]
+        if rank == 3:
+            out_images[idx] = images[idx] + offset
+        else:
+            out_images[:, idx] = images[:, idx] + offset
+
+    return out_images
+
+
+def color_normalization(images, mean, stddev):
+    """
+    Perform color normalization on the given images.
+    Args:
+        images (tensor): images to perform color normalization. Dimension is
+            `num frames` x `channel` x `height` x `width`.
+        mean (list): mean values for normalization.
+        stddev (list): standard deviations for normalization.
+
+    Returns:
+        out_images (tensor): the normalized images, the dimension is
+            `num frames` x `channel` x `height` x `width`.
+    """
+    rank = len(images.shape)
+    if rank == 3:
+        channel_dim = 0  # C H W
+    elif rank == 4:
+        channel_dim = 1  # T C H W
+    else:
+        raise NotImplementedError(f"Unsupported dimension {len(images.shape)}")
+    num_channels = images.shape[channel_dim]
+    assert len(mean) == num_channels, "channel mean not computed properly"
+    assert len(stddev) == num_channels, "channel stddev not computed properly"
+
+    # normalized values are written into a fresh tensor, one channel at a time
+    out_images = torch.zeros_like(images)
+    for idx in range(len(mean)):
+        if rank == 3:
+            out_images[idx] = (images[idx] - mean[idx]) / stddev[idx]
+        else:
+            out_images[:, idx] = (images[:, idx] - mean[idx]) / stddev[idx]
+    return out_images
+
+
+def _get_param_spatial_crop(scale, ratio, height, width, num_repeat=10, log_scale=True, switch_hw=False):
+    """
+    Sample a crop window inside a `height` x `width` frame and return it as (i, j, h, w),
+    i.e. (top, left, crop height, crop width); after `num_repeat` misses a centred crop is used.
+    """
+    area = height * width
+    for _ in range(num_repeat):
+        target_area = random.uniform(*scale) * area
+        if log_scale:
+            # draw the aspect ratio uniformly in log space
+            log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
+            aspect_ratio = math.exp(random.uniform(*log_ratio))
+        else:
+            aspect_ratio = random.uniform(*ratio)
+
+        w = int(round(math.sqrt(target_area * aspect_ratio)))
+        h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+        # the coin is drawn on every attempt, whether or not switch_hw is set
+        coin = np.random.uniform()
+        if switch_hw and coin < 0.5:
+            w, h = h, w
+
+        if not (0 < w <= width and 0 < h <= height):
+            continue
+        i = random.randint(0, height - h)
+        j = random.randint(0, width - w)
+        return i, j, h, w
+
+    # Fallback to central crop
+    in_ratio = float(width) / float(height)
+    lowest, highest = min(ratio), max(ratio)
+    w, h = width, height  # whole image unless the frame ratio is out of range
+    if in_ratio < lowest:
+        h = int(round(w / lowest))
+    elif in_ratio > highest:
+        w = int(round(h * highest))
+    return (height - h) // 2, (width - w) // 2, h, w
+
+
+def random_resized_crop(
+    images,
+    target_height,
+    target_width,
+    scale=(0.8, 1.0),
+    ratio=(3.0 / 4.0, 4.0 / 3.0),
+):
+    """
+    Crop the given images to random size and aspect ratio. A crop of random
+    size (default: of 0.8 to 1.0) of the original size and a random aspect
+    ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This
+    crop is finally resized to given size. This is popularly used to train the
+    Inception networks.
+
+    Args:
+        images: Images to perform resizing and cropping.
+        target_height: Desired height after cropping.
+        target_width: Desired width after cropping.
+        scale: Scale range of Inception-style area based random resizing.
+        ratio: Aspect ratio range of Inception-style area based random resizing.
+    """
+
+    frame_height, frame_width = images.shape[2], images.shape[3]
+    top, left, crop_h, crop_w = _get_param_spatial_crop(scale, ratio, frame_height, frame_width)
+    # a single window is sampled and applied across the two leading axes
+    window = images[:, :, top : top + crop_h, left : left + crop_w]
+    resized = torch.nn.functional.interpolate(
+        window,
+        size=(target_height, target_width),
+        mode="bilinear",
+        align_corners=False,
+    )
+    return resized
+
+
+def random_resized_crop_with_shift(
+    images,
+    target_height,
+    target_width,
+    scale=(0.8, 1.0),
+    ratio=(3.0 / 4.0, 4.0 / 3.0),
+):
+    """
+    This is similar to random_resized_crop. However, it samples two different
+    boxes (for cropping) for the first and last frame. It then linearly
+    interpolates the two boxes for other frames.
+
+    Args:
+        images: Images to resize and crop, `channel` x `num frames` x `height` x `width`.
+        target_height: Desired height after cropping.
+        target_width: Desired width after cropping.
+        scale: Scale range of Inception-style area based random resizing.
+        ratio: Aspect ratio range of Inception-style area based random resizing.
+    """
+    channels, t = images.shape[0], images.shape[1]
+    height = images.shape[2]
+    width = images.shape[3]
+
+    i, j, h, w = _get_param_spatial_crop(scale, ratio, height, width)
+    i_, j_, h_, w_ = _get_param_spatial_crop(scale, ratio, height, width)
+    # per-frame boxes: linear interpolation between the two sampled boxes, truncated to int
+    i_s = [int(v) for v in torch.linspace(i, i_, steps=t).tolist()]
+    j_s = [int(v) for v in torch.linspace(j, j_, steps=t).tolist()]
+    h_s = [int(v) for v in torch.linspace(h, h_, steps=t).tolist()]
+    w_s = [int(v) for v in torch.linspace(w, w_, steps=t).tolist()]
+    # sized from the input's channel count and placed on its device (was a fixed 3-channel CPU buffer)
+    out = torch.zeros((channels, t, target_height, target_width), device=images.device)
+    for ind in range(t):
+        # (C, 1, h, w) slice: interpolate treats C as the batch axis and the frame as one channel
+        frame = images[:, ind : ind + 1, i_s[ind] : i_s[ind] + h_s[ind], j_s[ind] : j_s[ind] + w_s[ind]]
+        out[:, ind : ind + 1, :, :] = torch.nn.functional.interpolate(
+            frame,
+            size=(target_height, target_width),
+            mode="bilinear",
+            align_corners=False,
+        )
+
+    return out
+
+
+def create_random_augment(
+    input_size,
+    auto_augment=None,
+    interpolation="bilinear",
+):
+    """
+    Get video randaug transform.
+
+    Args:
+        input_size: The size of the input video in tuple.
+        auto_augment: Parameters for randaug. An example:
+            "rand-m7-n4-mstd0.5-inc1" (m is the magnitude and n is the number
+            of operations to apply).
+        interpolation: Interpolation method.
+    """
+    # for a tuple size only the last two entries are used
+    img_size = input_size[-2:] if isinstance(input_size, tuple) else input_size
+
+    if not auto_augment:
+        raise NotImplementedError
+    assert isinstance(auto_augment, str)
+
+    img_size_min = min(img_size) if isinstance(img_size, tuple) else img_size
+    aa_params = {"translate_const": int(img_size_min * 0.45)}
+    if interpolation and interpolation != "random":
+        aa_params["interpolation"] = _pil_interp(interpolation)
+
+    if not auto_augment.startswith("rand"):
+        # only policy strings starting with "rand" are handled
+        raise NotImplementedError
+    policy = rand_augment_transform(auto_augment, aa_params)
+    return transforms.Compose([policy])
+
+
+def random_sized_crop_img(
+    im,
+    size,
+    jitter_scale=(0.08, 1.0),
+    jitter_aspect=(3.0 / 4.0, 4.0 / 3.0),
+    max_iter=10,
+):
+    """
+    Performs Inception-style cropping (used for training).
+    """
+    assert len(im.shape) == 3, "Currently only support image for random_sized_crop"
+    full_h, full_w = im.shape[1:3]
+    # linear (not log) ratio sampling with random h/w swapping, as selected by the flags below
+    top, left, crop_h, crop_w = _get_param_spatial_crop(
+        scale=jitter_scale,
+        ratio=jitter_aspect,
+        height=full_h,
+        width=full_w,
+        num_repeat=max_iter,
+        log_scale=False,
+        switch_hw=True,
+    )
+    window = im[:, top : top + crop_h, left : left + crop_w]
+    # interpolate needs a leading batch axis: add it for the call, drop it afterwards
+    batched = torch.nn.functional.interpolate(
+        window.unsqueeze(0), size=(size, size), mode="bilinear", align_corners=False
+    )
+    return batched.squeeze(0)
+
+
+def circulant_frame_padding(video: Tensor, total_frames: int) -> Tensor:
+    """
+    Applies circulant frame padding (repeating the video) to a specified size.
+
+    Args:
+        video: The input video to be padded. Expected (C, T, H, W)
+        total_frames: The number of frames after padding.
+
+    Returns:
+        The video padded (or cut) to total_frames along dim 1.
+    """
+    start_frames = video.shape[1]
+    if start_frames == total_frames:
+        return video
+    # ceil(total_frames / start_frames) copies are enough to cover total_frames
+    whole, remainder = divmod(total_frames, start_frames)
+    tiled = video.repeat((1, whole + (remainder > 0)) + (1,) * (video.ndim - 2))
+    return tiled[:, :total_frames]
+
+
+def frame_pad(video: Tensor, total_frames: int, pad_frame_method: str) -> Tensor:
+    """Pad `video` to `total_frames` frames with the method named by `pad_frame_method`."""
+    if pad_frame_method not in PAD_FRAME_METHODS:
+        raise ValueError(f"Unrecognized pad_frame_method {pad_frame_method}")
+    if pad_frame_method != "circulant":
+        # a name listed in PAD_FRAME_METHODS that has no implementation here
+        return None
+    return circulant_frame_padding(video, total_frames)
+
+
+# The following code is adapted from the timm library; we will replace these
+# contents with a dependency on PyTorchVideo.
+# https://github.com/facebookresearch/pytorchvideo
+class RandomResizedCropAndInterpolation:
+    """Crop the given PIL Image to random size and aspect ratio with random interpolation.
+    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
+    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
+    is finally resized to given size.
+    This is popularly used to train the Inception networks.
+    Args:
+        size: expected output size of each edge
+        scale: range of size of the origin size cropped
+        ratio: range of aspect ratio of the origin aspect ratio cropped
+        interpolation: name of the mode, or "random" to draw one per call. Default: "bilinear"
+    """
+
+    def __init__(
+        self,
+        size,
+        scale=(0.08, 1.0),
+        ratio=(3.0 / 4.0, 4.0 / 3.0),
+        interpolation="bilinear",
+    ):
+        # a non-tuple size is used for both edges
+        self.size = size if isinstance(size, tuple) else (size, size)
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            # only reported on stdout; construction still goes ahead
+            print("range should be of kind (min, max)")
+
+        self.scale = scale
+        self.ratio = ratio
+        # "random" stores a pool of modes; __call__ picks one of them per image
+        if interpolation == "random":
+            self.interpolation = _RANDOM_INTERPOLATION
+        else:
+            self.interpolation = _pil_interp(interpolation)
+
+    @staticmethod
+    def get_params(img, scale, ratio):
+        """Get parameters for ``crop`` for a random sized crop.
+        Args:
+            img (PIL Image): Image to be cropped.
+            scale (tuple): range of size of the origin size cropped
+            ratio (tuple): range of aspect ratio of the origin aspect ratio cropped
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
+                sized crop.
+        """
+        img_w, img_h = img.size[0], img.size[1]
+        area = img_w * img_h
+
+        for _ in range(10):
+            target_area = random.uniform(*scale) * area
+            # the aspect ratio is drawn uniformly in log space
+            log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
+            aspect_ratio = math.exp(random.uniform(*log_ratio))
+
+            w = int(round(math.sqrt(target_area * aspect_ratio)))
+            h = int(round(math.sqrt(target_area / aspect_ratio)))
+            # retry when the sampled box does not fit inside the image
+            if w > img_w or h > img_h:
+                continue
+            i = random.randint(0, img_h - h)
+            j = random.randint(0, img_w - w)
+            return i, j, h, w
+
+        # Fallback to central crop
+        in_ratio = img_w / img_h
+        lowest, highest = min(ratio), max(ratio)
+        w, h = img_w, img_h  # whole image unless the image ratio is out of range
+        if in_ratio < lowest:
+            h = int(round(w / lowest))
+        elif in_ratio > highest:
+            w = int(round(h * highest))
+        i = (img_h - h) // 2
+        j = (img_w - w) // 2
+        return i, j, h, w
+
+    def __call__(self, img):
+        """
+        Args:
+            img (PIL Image): Image to be cropped and resized.
+        Returns:
+            PIL Image: Randomly cropped and resized image.
+        """
+        top, left, crop_h, crop_w = self.get_params(img, self.scale, self.ratio)
+        mode = self.interpolation
+        if isinstance(mode, (tuple, list)):
+            # a pool of modes was configured: pick one at random for this image
+            mode = random.choice(mode)
+        return F.resized_crop(img, top, left, crop_h, crop_w, self.size, mode)
+
+    def __repr__(self):
+        modes = self.interpolation if isinstance(self.interpolation, (tuple, list)) else [self.interpolation]
+        interpolate_str = " ".join(_pil_interpolation_to_str[mode] for mode in modes)
+        pieces = [
+            self.__class__.__name__ + "(size={0}".format(self.size),
+            ", scale={0}".format(tuple(round(s, 4) for s in self.scale)),
+            ", ratio={0}".format(tuple(round(r, 4) for r in self.ratio)),
+            ", interpolation={0})".format(interpolate_str),
+        ]
+        return "".join(pieces)
+
+
+class Compose(object):
+    """Chain several transforms, feeding the output of each one to the next
+    Args:
+    transforms (list of ``Transform`` objects): transforms to apply, in list order
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, clip):
+        result = clip
+        for transform in self.transforms:
+            result = transform(result)
+        return result
+
+
+class RandomHorizontalFlip(object):
+    """Horizontally flip the list of given images randomly
+    with a probability 0.5
+    """
+
+    def __call__(self, clip):
+        """
+        Args:
+        clip (list of PIL.Image or numpy.ndarray): List of images to be flipped
+        in format (h, w, c) in numpy.ndarray
+        Returns:
+        list of PIL.Image or numpy.ndarray: Randomly flipped clip
+        """
+        if not random.random() < 0.5:
+            return clip
+        first = clip[0]
+        if isinstance(first, np.ndarray):
+            return [np.fliplr(img) for img in clip]
+        if isinstance(first, PIL.Image.Image):
+            return [img.transpose(PIL.Image.FLIP_LEFT_RIGHT) for img in clip]
+        raise TypeError("Expected numpy.ndarray or PIL.Image" + " but got list of {0}".format(type(clip[0])))
+
+
+class RandomResize(object):
+    """Rescale a clip (list of H x W x C numpy.ndarray or PIL.Image) by one random factor.
+    The same factor, drawn uniformly from `ratio`, scales both width and height.
+    Args:
+    ratio (tuple): (min, max) range of the scaling factor
+    interpolation (str): Can be one of 'nearest', 'bilinear'
+    defaults to nearest
+    """
+
+    def __init__(self, ratio=(3.0 / 4.0, 4.0 / 3.0), interpolation="nearest"):
+        self.ratio = ratio
+        self.interpolation = interpolation
+
+    def __call__(self, clip):
+        scaling_factor = random.uniform(self.ratio[0], self.ratio[1])
+
+        if isinstance(clip[0], np.ndarray):
+            im_h, im_w, _ = clip[0].shape
+        elif isinstance(clip[0], PIL.Image.Image):
+            im_w, im_h = clip[0].size
+        else:
+            # explicit TypeError instead of an UnboundLocalError on im_w further down
+            raise TypeError("Expected numpy.ndarray or PIL.Image" + " but got list of {0}".format(type(clip[0])))
+
+        new_size = (int(im_w * scaling_factor), int(im_h * scaling_factor))
+        # the target is handed over in (width, height) order
+        return FF.resize_clip(clip, new_size, interpolation=self.interpolation)
+
+
+class Resize(object):
+    """Resizes a list of (H x W x C) numpy.ndarray to the final size
+    The larger the original image is, the longer it takes to
+    interpolate
+    Args:
+    interpolation (str): Can be one of 'nearest', 'bilinear'
+    defaults to nearest
+    size (tuple): (width, height)
+    """
+
+    def __init__(self, size, interpolation="nearest"):
+        self.interpolation = interpolation
+        self.size = size
+
+    def __call__(self, clip):
+        # the whole clip is delegated to FF.resize_clip in a single call
+        return FF.resize_clip(clip, self.size, interpolation=self.interpolation)
+
+
+class RandomCrop(object):
+    """Extract random crop at the same location for a list of images
+    Args:
+    size (sequence or int): Desired output size for the
+    crop in format (h, w)
+    """
+
+    def __init__(self, size):
+        if isinstance(size, numbers.Number):
+            size = (size, size)
+
+        self.size = size
+
+    def __call__(self, clip):
+        """
+        Args:
+        clip (list of PIL.Image or numpy.ndarray): List of images to be cropped
+        in format (h, w, c) in numpy.ndarray
+        Returns:
+        list of PIL.Image or numpy.ndarray: Cropped list of images
+        Raises:
+        TypeError, ValueError: unsupported image type / crop larger than the images
+        """
+        h, w = self.size
+        if isinstance(clip[0], np.ndarray):
+            im_h, im_w, _ = clip[0].shape
+        elif isinstance(clip[0], PIL.Image.Image):
+            im_w, im_h = clip[0].size
+        else:
+            raise TypeError("Expected numpy.ndarray or PIL.Image" + " but got list of {0}".format(type(clip[0])))
+        if w > im_w or h > im_h:
+            error_msg = (
+                "Initial image size should be larger than "
+                "cropped size but got cropped sizes : ({w}, {h}) while "
+                "initial image is ({im_w}, {im_h})".format(im_w=im_w, im_h=im_h, w=w, h=h)
+            )
+            raise ValueError(error_msg)
+
+        x1 = random.randint(0, im_w - w)
+        y1 = random.randint(0, im_h - h)
+        return FF.crop_clip(clip, y1, x1, h, w)
+
+
+class ThreeCrop(object):
+    """Extract three crops per frame, spaced evenly along one axis
+    Args:
+    size (sequence or int): Desired output size for each
+    crop in format (h, w)
+    """
+
+    def __init__(self, size):
+        if isinstance(size, numbers.Number):
+            size = (size, size)
+
+        self.size = size
+
+    def __call__(self, clip):
+        """
+        Args:
+        clip (list of PIL.Image or numpy.ndarray): Frames to be cropped,
+        in format (h, w, c) for numpy.ndarray
+        Returns:
+        list: crops taken at the three positions, concatenated in order
+        """
+        h, w = self.size
+        if isinstance(clip[0], np.ndarray):
+            im_h, im_w, im_c = clip[0].shape
+        elif isinstance(clip[0], PIL.Image.Image):
+            im_w, im_h = clip[0].size
+        else:
+            raise TypeError("Expected numpy.ndarray or PIL.Image" + "but got list of {0}".format(type(clip[0])))
+        if w != im_w and h != im_h:
+            clip = FF.resize_clip(clip, self.size, interpolation="bilinear")
+            # np.asarray reads the size of ndarray and PIL frames alike
+            im_h, im_w = np.asarray(clip[0]).shape[:2]
+
+        # Built-in max clamps at 0; np.max(x, 0) treated the 0 as an axis.
+        step = max((max(im_w, im_h) - self.size[0]) // 2, 0)
+        cropped = []
+        for i in range(3):
+            # Slide down tall frames, otherwise slide to the right.
+            if im_h > self.size[0]:
+                x1, y1 = 0, i * step
+            else:
+                x1, y1 = i * step, 0
+            cropped.extend(FF.crop_clip(clip, y1, x1, h, w))
+        return cropped
+
+
+class RandomRotation(object):
+    """Rotate entire clip randomly by a random angle within
+    given bounds
+    Args:
+    degrees (sequence or int): Range of degrees to select from
+    If degrees is a number instead of sequence like (min, max),
+    the range of degrees, will be (-degrees, +degrees).
+    """
+
+    def __init__(self, degrees):
+        if isinstance(degrees, numbers.Number):
+            if degrees < 0:
+                raise ValueError("If degrees is a single number," "must be positive")
+            degrees = (-degrees, degrees)
+        elif len(degrees) != 2:
+            raise ValueError("If degrees is a sequence," "it must be of len 2.")
+
+        self.degrees = degrees
+
+    def __call__(self, clip):
+        """
+        Args:
+        clip (list of PIL.Image or numpy.ndarray): Frames to be rotated,
+        in format (h, w, c) for numpy.ndarray
+        Returns:
+        list of PIL.Image or numpy.ndarray: Frames rotated by one shared angle
+        """
+        # Import the submodule itself: a bare `import skimage` does not
+        # guarantee that `skimage.transform` is loaded on every release.
+        import skimage.transform
+        angle = random.uniform(self.degrees[0], self.degrees[1])
+        if isinstance(clip[0], np.ndarray):
+            # NOTE: skimage rescales to float unless preserve_range=True is passed.
+            rotated = [skimage.transform.rotate(img, angle) for img in clip]
+        elif isinstance(clip[0], PIL.Image.Image):
+            rotated = [img.rotate(angle) for img in clip]
+        else:
+            raise TypeError("Expected numpy.ndarray or PIL.Image" + "but got list of {0}".format(type(clip[0])))
+        return rotated
+
+
+class CenterCrop(object):
+    """Crop the central region out of every frame of a clip
+    Args:
+    size (sequence or int): Output crop size as (h, w); a single
+    number n is expanded to the square (n, n)
+    """
+
+    def __init__(self, size):
+        # A bare number means a square crop.
+        self.size = (size, size) if isinstance(size, numbers.Number) else size
+
+    def __call__(self, clip):
+        """
+        Args:
+        clip (list): Frames as PIL.Image, numpy.ndarray or torch.Tensor;
+        arrays and tensors are read as (h, w, 3) or (3, h, w)
+        Returns:
+        Cropped clip as returned by ``FF.crop_clip``
+        Raises:
+        TypeError: if the frames have an unsupported type
+        ValueError: if the crop is larger than the frames
+        """
+        crop_h, crop_w = self.size
+        first = clip[0]
+        if isinstance(first, (np.ndarray, torch.Tensor)):
+            if first.shape[-1] == 3:
+                im_h, im_w, im_c = first.shape
+            else:
+                assert first.shape[0] == 3
+                im_c, im_h, im_w = first.shape
+        elif isinstance(first, PIL.Image.Image):
+            im_w, im_h = first.size
+        else:
+            raise TypeError(
+                "Expected numpy.ndarray or PIL.Image or torch.Tensor" + "but got list of {0}".format(type(clip[0]))
+            )
+        if crop_w > im_w or crop_h > im_h:
+            error_msg = (
+                "Initial image size should be larger then "
+                "cropped size but got cropped sizes : ({w}, {h}) while "
+                "initial image is ({im_w}, {im_h})".format(im_w=im_w, im_h=im_h, w=crop_w, h=crop_h)
+            )
+            raise ValueError(error_msg)
+        # Split the leftover margin evenly between the two sides.
+        left = int(round((im_w - crop_w) / 2.0))
+        top = int(round((im_h - crop_h) / 2.0))
+        return FF.crop_clip(clip, top, left, crop_h, crop_w)
+
+
+class ColorJitter(object):
+    """
+    Randomly change the brightness, contrast and saturation and hue of the clip
+
+    Args:
+    brightness (float): How much to jitter brightness. brightness_factor
+    is chosen uniformly from [max(0, 1 - brightness), 1 + brightness].
+    contrast (float): How much to jitter contrast. contrast_factor
+    is chosen uniformly from [max(0, 1 - contrast), 1 + contrast].
+    saturation (float): How much to jitter saturation. saturation_factor
+    is chosen uniformly from [max(0, 1 - saturation), 1 + saturation].
+    hue(float): How much to jitter hue. hue_factor is chosen uniformly from
+    [-hue, hue]. Should be >=0 and <= 0.5.
+    """
+
+    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
+        self.brightness = brightness
+        self.contrast = contrast
+        self.saturation = saturation
+        self.hue = hue
+
+    def get_params(self, brightness, contrast, saturation, hue):
+        """Draw one factor per property; a property whose amount is <= 0 yields None."""
+        if brightness > 0:
+            brightness_factor = random.uniform(max(0, 1 - brightness), 1 + brightness)
+        else:
+            brightness_factor = None
+
+        if contrast > 0:
+            contrast_factor = random.uniform(max(0, 1 - contrast), 1 + contrast)
+        else:
+            contrast_factor = None
+
+        if saturation > 0:
+            saturation_factor = random.uniform(max(0, 1 - saturation), 1 + saturation)
+        else:
+            saturation_factor = None
+
+        if hue > 0:
+            hue_factor = random.uniform(-hue, hue)
+        else:
+            hue_factor = None
+        return brightness_factor, contrast_factor, saturation_factor, hue_factor
+
+    def __call__(self, clip):
+        """
+        Args:
+        clip (list): list of PIL.Image
+        Returns:
+        list PIL.Image : frames jittered with one shared set of factors
+        """
+        if isinstance(clip[0], np.ndarray):
+            raise TypeError("Color jitter not yet implemented for numpy arrays")
+        elif isinstance(clip[0], PIL.Image.Image):
+            brightness, contrast, saturation, hue = self.get_params(
+                self.brightness, self.contrast, self.saturation, self.hue
+            )
+            # Create img transform function sequence
+            img_transforms = []
+            if brightness is not None:
+                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_brightness(img, brightness))
+            if saturation is not None:
+                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_saturation(img, saturation))
+            if hue is not None:
+                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_hue(img, hue))
+            if contrast is not None:
+                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_contrast(img, contrast))
+            random.shuffle(img_transforms)
+
+            # Feed each transform the previous one's output so that all of them take effect.
+            jittered_clip = []
+            for img in clip:
+                jittered_img = img
+                for func in img_transforms:
+                    jittered_img = func(jittered_img)
+                jittered_clip.append(jittered_img)
+        else:
+            raise TypeError("Expected numpy.ndarray or PIL.Image" + "but got list of {0}".format(type(clip[0])))
+        return jittered_clip
+
+
+class Normalize(object):
+    """Normalize a clip channel-wise with a mean and a standard deviation.
+    For ``n`` channels with mean ``(M1,...,Mn)`` and std ``(S1,..,Sn)`` the
+    documented result is
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+    .. note::
+        The arithmetic itself is delegated to ``FF.normalize``.
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+    """
+
+    def __init__(self, mean, std):
+        self.mean, self.std = mean, std
+        # Stored untouched; FF.normalize receives them on every call.
+
+    def __call__(self, clip):
+        """
+        Args:
+            clip (Tensor): Tensor clip of size (T, C, H, W) to be normalized.
+        Returns:
+            Tensor: Normalized Tensor clip.
+        """
+        return FF.normalize(clip, self.mean, self.std)
+
+    def __repr__(self):
+        return "{0}(mean={1}, std={2})".format(self.__class__.__name__, self.mean, self.std)
diff --git a/src/datasets/utils/video/transforms_builder.py b/src/datasets/utils/video/transforms_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..be745d27c277edced6e84add4037115ad2071979
--- /dev/null
+++ b/src/datasets/utils/video/transforms_builder.py
@@ -0,0 +1,165 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional
+
+import torch
+import torchvision.transforms as transforms
+
+import src.datasets.utils.video.transforms as video_transforms
+from src.datasets.utils.video.randerase import RandomErasing
+
+
+def make_transforms(
+    random_horizontal_flip=True,
+    random_resize_aspect_ratio=(3 / 4, 4 / 3),
+    random_resize_scale=(0.3, 1.0),
+    reprob=0.0,
+    auto_augment=False,
+    motion_shift=False,
+    crop_size=224,
+    normalize=((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    pad_frame_count: Optional[int] = None,
+    pad_frame_method: str = "circulant",
+):
+    """Build a ``VideoTransform`` configured with exactly the given options."""
+    return VideoTransform(
+        random_horizontal_flip=random_horizontal_flip,
+        random_resize_aspect_ratio=random_resize_aspect_ratio,
+        random_resize_scale=random_resize_scale,
+        reprob=reprob,
+        auto_augment=auto_augment,
+        motion_shift=motion_shift,
+        crop_size=crop_size,
+        normalize=normalize,
+        pad_frame_count=pad_frame_count,
+        pad_frame_method=pad_frame_method,
+    )
+
+
+class VideoTransform(object):
+    """Callable augmentation pipeline applied to the frames of one video clip.
+
+    Converts the frames to a float tensor (optionally running auto-augment
+    first), applies a random resized crop, an optional horizontal flip,
+    normalization, optional random erasing and optional frame padding.
+    """
+
+    def __init__(
+        self,
+        random_horizontal_flip=True,
+        random_resize_aspect_ratio=(3 / 4, 4 / 3),
+        random_resize_scale=(0.3, 1.0),
+        reprob=0.0,
+        auto_augment=False,
+        motion_shift=False,
+        crop_size=224,
+        normalize=((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        pad_frame_count: Optional[int] = None,
+        pad_frame_method: str = "circulant",
+    ):
+        self.random_horizontal_flip = random_horizontal_flip
+        self.random_resize_aspect_ratio = random_resize_aspect_ratio
+        self.random_resize_scale = random_resize_scale
+        self.auto_augment = auto_augment
+        self.motion_shift = motion_shift
+        self.crop_size = crop_size
+        self.pad_frame_count = pad_frame_count
+        self.pad_frame_method = pad_frame_method
+        self.reprob = reprob
+
+        # Without auto-augment, PIL and tensor conversions simply scale uint8 space by 255.
+        stat_scale = 1.0 if self.auto_augment else 255.0
+        self.mean = torch.tensor(normalize[0], dtype=torch.float32) * stat_scale
+        self.std = torch.tensor(normalize[1], dtype=torch.float32) * stat_scale
+
+        self.autoaug_transform = video_transforms.create_random_augment(
+            input_size=(crop_size, crop_size),
+            auto_augment="rand-m7-n4-mstd0.5-inc1",
+            interpolation="bicubic",
+        )
+
+        if motion_shift:
+            self.spatial_transform = video_transforms.random_resized_crop_with_shift
+        else:
+            self.spatial_transform = video_transforms.random_resized_crop
+
+        self.erase_transform = RandomErasing(
+            reprob,
+            mode="pixel",
+            max_count=1,
+            num_splits=1,
+            device="cpu",
+        )
+
+    def __call__(self, buffer):
+        if self.auto_augment:
+            pil_frames = [transforms.ToPILImage()(frame) for frame in buffer]
+            pil_frames = self.autoaug_transform(pil_frames)
+            buffer = torch.stack([transforms.ToTensor()(img) for img in pil_frames])  # T C H W
+            buffer = buffer.permute(0, 2, 3, 1)  # T H W C
+        elif torch.is_tensor(buffer):
+            # TODO: ensure input is always a tensor?
+            buffer = buffer.to(torch.float32)
+        else:
+            buffer = torch.tensor(buffer, dtype=torch.float32)
+
+        buffer = buffer.permute(3, 0, 1, 2)  # T H W C -> C T H W
+        buffer = self.spatial_transform(
+            images=buffer,
+            target_height=self.crop_size,
+            target_width=self.crop_size,
+            scale=self.random_resize_scale,
+            ratio=self.random_resize_aspect_ratio,
+        )
+        if self.random_horizontal_flip:
+            buffer, _ = video_transforms.horizontal_flip(0.5, buffer)
+
+        buffer = _tensor_normalize_inplace(buffer, self.mean, self.std)
+        if self.reprob > 0:
+            # Swap the first two axes for the eraser, then swap them back.
+            buffer = self.erase_transform(buffer.permute(1, 0, 2, 3)).permute(1, 0, 2, 3)
+
+        if self.pad_frame_count is not None:
+            buffer = video_transforms.frame_pad(buffer, self.pad_frame_count, self.pad_frame_method)
+        return buffer
+
+
+def tensor_normalize(tensor, mean, std):
+    """
+    Normalize a given tensor by subtracting the mean and dividing the std.
+    uint8 input is first converted to float and scaled to [0, 1].
+    Args:
+        tensor (tensor): tensor to normalize.
+        mean (tensor, list or tuple): mean value to subtract.
+        std (tensor, list or tuple): std to divide.
+    """
+    if tensor.dtype == torch.uint8:
+        tensor = tensor.float()
+        tensor = tensor / 255.0
+    # Tensor arithmetic does not accept plain sequences, so convert them first.
+    if isinstance(mean, (list, tuple)):
+        mean = torch.tensor(mean)
+    if isinstance(std, (list, tuple)):
+        std = torch.tensor(std)
+    return (tensor - mean) / std
+
+
+def _tensor_normalize_inplace(tensor, mean, std):
+    """
+    Normalize a given tensor by subtracting the mean and dividing the std.
+    Non-uint8 input is modified in place; uint8 input is converted to a float copy first.
+    Args:
+        tensor (tensor): tensor to normalize (with dimensions C, T, H, W).
+        mean (tensor): mean value to subtract, one entry per channel.
+        std (tensor): std to divide, one entry per channel.
+    """
+    if tensor.dtype == torch.uint8:
+        tensor = tensor.float()
+
+    # Broadcast the per-channel statistics over (T, H, W). Unlike the former
+    # view/permute round trip this also works for non-contiguous input.
+    tensor.sub_(mean.reshape(-1, 1, 1, 1)).div_(std.reshape(-1, 1, 1, 1))
+    return tensor
diff --git a/src/datasets/utils/video/volume_transforms.py b/src/datasets/utils/video/volume_transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..208d34c44ab399d25bb938e69125348faed38248
--- /dev/null
+++ b/src/datasets/utils/video/volume_transforms.py
@@ -0,0 +1,159 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from PIL import Image
+
+
+def convert_img(img):
+    """Converts (H, W, C) numpy.ndarray to (C, H, W); a 2-D (H, W) image becomes (1, H, W)"""
+    rank = len(img.shape)
+    if rank == 3:
+        return img.transpose(2, 0, 1)
+    # Grayscale frames gain a leading channel axis; anything else passes through.
+    return np.expand_dims(img, 0) if rank == 2 else img
+
+
+class ClipToTensor(object):
+    """Convert a list of m (H x W x C) numpy.ndarrays or PIL images in the range [0, 255]
+    to a torch.FloatTensor of shape (C x m x H x W), in the range [0, 1.0] when div_255 is set
+    """
+
+    def __init__(self, channel_nb=3, div_255=True, numpy=False):
+        self.channel_nb = channel_nb
+        self.div_255 = div_255
+        self.numpy = numpy
+
+    def __call__(self, clip):
+        """
+        Args: clip (list of numpy.ndarray, PIL.Image or torch.Tensor): clip
+        (list of images) to be converted to tensor.
+        """
+        # Retrieve shape
+        if isinstance(clip[0], np.ndarray):
+            h, w, ch = clip[0].shape
+            assert ch == self.channel_nb, "Got {0} instead of 3 channels".format(ch)
+        elif isinstance(clip[0], Image.Image):
+            w, h = clip[0].size
+        elif isinstance(clip[0], torch.Tensor):
+            tensor_clip = torch.stack(clip)
+            # Converting (T, C, H, W) -> (C, T, H, W) to match what `convert_img` followed by
+            # `np_clip[:, img_idx, :, :] = img` does for other data types.
+            tensor_clip = tensor_clip.permute(1, 0, 2, 3)
+            if not isinstance(tensor_clip, torch.FloatTensor):
+                tensor_clip = tensor_clip.float()
+            if self.div_255:
+                tensor_clip = torch.div(tensor_clip, 255)
+            return tensor_clip
+        else:
+            raise TypeError(
+                "Expected numpy.ndarray or PIL.Image or torch.Tensor\
+            but got list of {0}".format(
+                    type(clip[0])
+                )
+            )
+
+        np_clip = np.zeros([self.channel_nb, len(clip), int(h), int(w)])
+
+        # Convert
+        for img_idx, img in enumerate(clip):
+            if isinstance(img, np.ndarray):
+                pass
+            elif isinstance(img, Image.Image):
+                # np.asarray copies only if needed; np.array(copy=False) raises on NumPy 2 when it must copy.
+                img = np.asarray(img)
+            else:
+                raise TypeError(
+                    "Expected numpy.ndarray or PIL.Image\
+                but got list of {0}".format(
+                        type(clip[0])
+                    )
+                )
+            img = convert_img(img)
+            np_clip[:, img_idx, :, :] = img
+
+        if self.numpy:
+            if self.div_255:
+                np_clip = np_clip / 255.0
+            return np_clip
+
+        else:
+            tensor_clip = torch.from_numpy(np_clip)
+            if not isinstance(tensor_clip, torch.FloatTensor):
+                tensor_clip = tensor_clip.float()
+            if self.div_255:
+                tensor_clip = torch.div(tensor_clip, 255)
+            return tensor_clip
+
+
+# Note this norms data to -1/1
+class ClipToTensor_K(object):
+    """Convert a list of m (H x W x C) numpy.ndarrays or PIL images in the range [0, 255]
+    to a torch.FloatTensor of shape (C x m x H x W), in the range [-1.0, 1.0] when div_255 is set
+    """
+
+    def __init__(self, channel_nb=3, div_255=True, numpy=False):
+        self.channel_nb = channel_nb
+        self.div_255 = div_255
+        self.numpy = numpy
+
+    def __call__(self, clip):
+        """
+        Args: clip (list of numpy.ndarray or PIL.Image): clip (list of images)
+        to be converted to tensor.
+        """
+        # Retrieve shape
+        if isinstance(clip[0], np.ndarray):
+            h, w, ch = clip[0].shape
+            assert ch == self.channel_nb, "Got {0} instead of 3 channels".format(ch)
+        elif isinstance(clip[0], Image.Image):
+            w, h = clip[0].size
+        else:
+            raise TypeError(
+                "Expected numpy.ndarray or PIL.Image\
+            but got list of {0}".format(
+                    type(clip[0])
+                )
+            )
+
+        np_clip = np.zeros([self.channel_nb, len(clip), int(h), int(w)])
+
+        # Convert
+        for img_idx, img in enumerate(clip):
+            if isinstance(img, np.ndarray):
+                pass
+            elif isinstance(img, Image.Image):
+                # np.asarray copies only if needed; np.array(copy=False) raises on NumPy 2 when it must copy.
+                img = np.asarray(img)
+            else:
+                raise TypeError(
+                    "Expected numpy.ndarray or PIL.Image\
+                but got list of {0}".format(
+                        type(clip[0])
+                    )
+                )
+            img = convert_img(img)
+            np_clip[:, img_idx, :, :] = img
+        if self.numpy:
+            if self.div_255:
+                np_clip = (np_clip - 127.5) / 127.5
+            return np_clip
+
+        else:
+            tensor_clip = torch.from_numpy(np_clip)
+            if not isinstance(tensor_clip, torch.FloatTensor):
+                tensor_clip = tensor_clip.float()
+            if self.div_255:
+                tensor_clip = torch.div(torch.sub(tensor_clip, 127.5), 127.5)
+            return tensor_clip
+
+
+class ToTensor(object):
+    """Wrap a numpy array in a tensor via ``torch.from_numpy``"""
+
+    def __call__(self, array):
+        # from_numpy shares memory with `array` rather than copying it.
+        return torch.from_numpy(array)
diff --git a/src/datasets/utils/weighted_sampler.py b/src/datasets/utils/weighted_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b57dd33a1e34b9c2c1585b9edd453f1fe7104f0
--- /dev/null
+++ b/src/datasets/utils/weighted_sampler.py
@@ -0,0 +1,336 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Iterator, Optional
+
+import numpy as np
+import torch
+from torch.utils.data import DistributedSampler, RandomSampler
+
+from src.utils.logging import get_logger
+
+logger = get_logger("WeightedSampler")
+
+
+class DistributedWeightedSampler(DistributedSampler):
+    """
+    This class implements a weighted sampler for distributed training.
+    See https://pytorch.org/docs/stable/_modules/torch/utils/data/distributed.html#DistributedSampler for more details.
+
+    It shares the same interface as `torch.utils.data.DistributedSampler`.
+    The effective change is replacing `DistributedSampler`'s `torch.randperm` for generating the sequence
+    of indices with `numpy.random.Generator.choice`, with replacement. This allows weighted sampling and
+    avoiding issue with `torch.randperm` when the number of samples is larger than 2^24 samples.
+    NOTE: `shuffle` is accepted for interface compatibility; `__iter__` does not read it and
+    always draws at random.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+        seed: int = 0,
+        drop_last: bool = False,
+    ):
+        logger.info(f"Using DistributedWeightedSampler with rank {rank} / {num_replicas}")
+        assert hasattr(
+            dataset, "sample_weights"
+        ), "Dataset must have sample_weights property for using DistributedWeightedSampler"
+        super().__init__(
+            dataset,
+            num_replicas=num_replicas,
+            rank=rank,
+            shuffle=shuffle,
+            seed=seed,
+            drop_last=drop_last,
+        )
+
+    @property
+    def sample_probabilities(self) -> np.ndarray:
+        """Per-sample weights of the dataset, normalized to sum to 1."""
+        sample_weights = self.dataset.sample_weights
+        # Tensors and lists are converted so the normalization always runs on a numpy array.
+        if isinstance(sample_weights, torch.Tensor):
+            sample_weights = sample_weights.cpu().numpy()
+        elif isinstance(sample_weights, list):
+            sample_weights = np.array(sample_weights)
+        assert isinstance(
+            sample_weights, np.ndarray
+        ), f"sample_weights must be a numpy array, torch.Tensor, or python list; got {type(sample_weights)}"
+        return sample_weights / np.sum(sample_weights)
+
+    def __iter__(self) -> Iterator:
+        """Yield this rank's share of a weighted draw (with replacement).
+
+        The generator is seeded with ``seed + epoch`` only, so every rank
+        draws the same ``total_size`` indices; this rank keeps positions
+        ``rank, rank + num_replicas, ...``.
+        """
+        # deterministically sample based on epoch and seed
+        rng = np.random.default_rng(self.seed + self.epoch)
+        # An int population is equivalent to range(n) but avoids materializing it.
+        indices = rng.choice(
+            len(self.dataset),
+            size=self.total_size,
+            p=self.sample_probabilities,
+            replace=True,
+        ).tolist()
+        # Exactly total_size indices are drawn, so no padding or truncation is
+        # needed to make the list evenly divisible across replicas.
+        assert len(indices) == self.total_size
+
+        # subsample
+        indices = indices[self.rank : self.total_size : self.num_replicas]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+
+class MemoryEfficientDistributedWeightedSampler(DistributedSampler):
+    """
+    This class implements a memory efficient version of `DistributedWeightedSampler`.
+    It shares the same interface as `DistributedWeightedSampler`.
+    The effective change is just-in-time sampling of the indices, instead of pre-computing them.
+    `__next__` never raises `StopIteration`, so the consumer decides when to stop drawing.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+        seed: int = 0,
+    ):
+        logger.info(f"Using MemoryEfficientDistributedWeightedSampler with rank {rank} / {num_replicas}")
+        assert hasattr(
+            dataset, "dataset_weights"
+        ), "Dataset must have dataset_weights property for using MemoryEfficientDistributedWeightedSampler"
+        super().__init__(
+            dataset,
+            num_replicas=num_replicas,
+            rank=rank,
+            shuffle=shuffle,
+            seed=seed,
+        )
+
+        self.dataset_weights = dataset.dataset_weights
+        self.dataset_sizes = [len(d) for d in dataset.datasets]
+        if len(self.dataset_sizes) != len(self.dataset_weights):
+            raise ValueError(
+                f"Number of datasets ({len(self.dataset_sizes)}) "
+                f"does not match number of dataset weights ({len(self.dataset_weights)})"
+            )
+
+        if self.shuffle:
+            # NOTE: the generator is seeded only here, at construction time.
+            self.rng = np.random.default_rng(self.seed + self.rank + self.epoch)
+            total_weights = sum(self.dataset_weights)
+            self.dataset_probablities = np.array([w / total_weights for w in self.dataset_weights])
+        else:
+            if any(not isinstance(w, int) for w in self.dataset_weights):
+                raise ValueError("Dataset weights must be integers when shuffle is False")
+
+            # Integer weights become repetition counts in a fixed round-robin schedule.
+            self.dataset_orders = []
+            for i, w in enumerate(self.dataset_weights):
+                self.dataset_orders.extend([i] * w)
+
+            self.drawn_samples = 0
+
+    def __iter__(self) -> Iterator:
+        return self
+
+    def __next__(self) -> int:
+        """Return the next index into the concatenated dataset for this rank."""
+        if self.shuffle:
+            selected_dataset_idx = self.rng.choice(range(len(self.dataset_weights)), p=self.dataset_probablities)
+
+            # In order to avoid sampling the same example multiple times between the ranks,
+            # we limit each rank to a subset of the total number of samples in the dataset.
+            # For example if our dataset is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], and we have 2 ranks,
+            # then rank 0 will ONLY sample from [0, 2, 4, 6, 8], and rank 1 from [1, 3, 5, 7, 9].
+            # In each iteration we first produce `in_rank_sample` which is the sample index in the rank,
+            # based on the size of the subset which that rank can sample from.
+            # Then we compute `sample_idx_in_dataset` for the index of the sample in the whole dataset.
+            # For the above example if we are sampling for rank 1, we have `self.rng.integers(5)`.
+            # Let's assume the result is 2, then `in_rank_sample` is 2 (number "5" in the subset),
+            # so the sample index in the whole dataset is
+            # `in_rank_sample * self.num_replicas + self.rank`: 2 * 2 + 1 = 5.
+
+            selected_dataset_size = self.dataset_sizes[selected_dataset_idx]
+            # 1) Getting sample index in the rank.
+            # NOTE: this may effectively drops the last batch,
+            # but given the sample sizes that we use this sampler with, it should not be an issue.
+            num_samples_in_rank = selected_dataset_size // self.num_replicas
+            if num_samples_in_rank == 0:
+                raise ValueError("Selected dataset has fewer samples than num_replicas")
+            in_rank_sample = self.rng.integers(num_samples_in_rank)
+
+            # 2) Getting sample index in the dataset.
+            sample_idx_in_dataset = in_rank_sample * self.num_replicas + self.rank
+
+        else:
+            # Iterate through the dataset orders in a round-robin fashion, offset by the rank
+            dataset_orders_idx = (self.rank + self.drawn_samples) % len(self.dataset_orders)
+            selected_dataset_idx = self.dataset_orders[dataset_orders_idx]
+            # Get the sample index in the selected dataset by skipping with the num_replicas * drawn_samples
+            sample_idx_in_dataset = (self.drawn_samples * self.num_replicas + self.rank) % self.dataset_sizes[
+                selected_dataset_idx
+            ]
+            self.drawn_samples += 1
+
+        # Getting the index of the sample in the whole dataset
+        # For example if the total dataset has 4 datasets with sizes [10, 20, 30, 5],
+        # and our selected_dataset_idx=2 (0-based) and sample_idx_in_dataset=5,
+        # then the index of the sample in the whole dataset is
+        #   10 (for dataset 0) + 20 (for dataset 1) + 5 (for sample_idx_in_dataset) = 35,
+        # i.e. the sizes of all datasets before the selected one plus the in-dataset index.
+        sample_idx = sum(self.dataset_sizes[:selected_dataset_idx]) + sample_idx_in_dataset
+        # rng draws are numpy integers; return a plain int as annotated.
+        return int(sample_idx)
+
+
+def safe_next(iterator):
+    """Return the next item of `iterator`, or None once it is exhausted."""
+    # The two-argument form of next() swallows StopIteration for us.
+    exhausted = None
+    return next(iterator, exhausted)
+
+
+class MemoryEfficientDistributedWeightedSamplerLessRepeat(DistributedSampler):
+    """
+    This class implements a memory efficient version of `DistributedWeightedSampler`.
+    It shares the same interface as `DistributedWeightedSampler`.
+
+    When shuffling, each rank only draws from a subset of every dataset's indices: the ones reached by
+    stepping through the dataset with a step size equal to the world size, starting at the rank.
+    For example, if world size is 12 and rank is 2, for a dataset of size N,
+    this sampler only produces indices from range(2, N, 12).
+    For every draw a dataset is picked at random according to the normalized dataset weights, and a
+    per-dataset `RandomSampler` iterator (re-created once exhausted) picks the position inside that subset.
+
+    When not shuffling, datasets are visited in rotation, each one appearing `weight` times in the rotation.
+
+    Compared with MemoryEfficientDistributedWeightedSampler, this will reduce the effective number of repeat.
+    See discussions here: https://github.com/fairinternal/jepa-internal/pull/254
+
+    The sampler is its own iterator and `__next__` never raises `StopIteration`.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+        seed: int = 0,
+    ):
+        """
+        Args:
+            dataset: concatenated dataset exposing `dataset_weights` (one weight per sub-dataset)
+                and `datasets` (the sub-datasets, each supporting `len`).
+            num_replicas: world size, forwarded to `DistributedSampler`.
+            rank: rank of the current process, forwarded to `DistributedSampler`.
+            shuffle: draw datasets/samples at random if True, otherwise iterate deterministically.
+            seed: seed for the torch generator and (combined with rank and epoch) the numpy generator.
+
+        Raises:
+            ValueError: if the number of weights differs from the number of datasets, or if
+                `shuffle` is False and a weight is not an integer.
+        """
+        logger.info(f"Using MemoryEfficientDistributedWeightedSamplerLessRepeat with rank {rank} / {num_replicas}")
+        assert hasattr(
+            dataset, "dataset_weights"
+        ), "Dataset must have dataset_weights property for using MemoryEfficientDistributedWeightedSamplerLessRepeat"
+        super().__init__(
+            dataset,
+            num_replicas=num_replicas,
+            rank=rank,
+            shuffle=shuffle,
+            seed=seed,
+        )
+
+        # Generator shared by all per-dataset RandomSamplers created in `_new_sampler`.
+        self._generator = torch.Generator()
+        self._generator.manual_seed(seed)
+
+        self.dataset_weights = dataset.dataset_weights
+        self.dataset_sizes = [len(d) for d in dataset.datasets]
+        if len(self.dataset_sizes) != len(self.dataset_weights):
+            raise ValueError(
+                f"Number of datasets ({len(self.dataset_sizes)}) "
+                f"does not match number of dataset weights ({len(self.dataset_weights)})"
+            )
+
+        if self.shuffle:
+            # Numpy generator used only to pick which dataset the next sample comes from.
+            self.rng = np.random.default_rng(self.seed + self.rank + self.epoch)
+            total_weights = sum(self.dataset_weights)
+            self.dataset_probablities = np.array([w / total_weights for w in self.dataset_weights])
+
+            # For each dataset we keep a sampler over the indices that will be processed by this rank.
+            # This is the subset of indices selected by stepping with the world size.
+            logger.info(f"Generating dataset indices for rank {self.rank} / {self.num_replicas}")
+
+            # Getting a RandomSampler iterator for the indices assigned to each dataset.
+            self.individual_dataset_sampler = []
+            for dataset_size in self.dataset_sizes:
+
+                # NOTE: the floor division may effectively drop the last few samples of a dataset,
+                # but given the sample sizes that we use this sampler with, it should not be an issue.
+                num_samples_in_rank = dataset_size // self.num_replicas
+                self.individual_dataset_sampler.append(self._new_sampler(num_samples_in_rank))
+
+        else:
+            if any(not isinstance(w, int) for w in self.dataset_weights):
+                raise ValueError("Dataset weights must be integers when shuffle is False")
+
+            # Rotation of dataset ids, e.g. weights [2, 1] give [0, 0, 1].
+            self.dataset_orders = []
+            for i, w in enumerate(self.dataset_weights):
+                self.dataset_orders.extend([i] * w)
+
+            self.drawn_samples = 0
+
+    def __iter__(self) -> Iterator:
+        """Return the sampler itself; all iteration state lives on the instance."""
+        return self
+
+    def _new_sampler(self, sample_size: int) -> Iterator[int]:
+        """Return a fresh iterator over a random ordering of `range(sample_size)` (shuffle mode only)."""
+        assert self.shuffle
+
+        return iter(
+            RandomSampler(
+                range(sample_size),
+                generator=self._generator,
+            )
+        )
+
+    def _in_rank_next_index_for_dataset(self, dataset_idx: int) -> int:
+        """Return the next in-rank position for `dataset_idx`, restarting its sampler when it runs out."""
+        assert self.shuffle
+
+        next_sampler_idx = safe_next(self.individual_dataset_sampler[dataset_idx])
+        if next_sampler_idx is None:
+            # We have reached the end of the dataset, we need to reset the sampler.
+            num_samples_in_rank = self.dataset_sizes[dataset_idx] // self.num_replicas
+            self.individual_dataset_sampler[dataset_idx] = self._new_sampler(num_samples_in_rank)
+            next_sampler_idx = safe_next(self.individual_dataset_sampler[dataset_idx])
+            assert next_sampler_idx is not None
+
+        return next_sampler_idx
+
+    def __next__(self) -> int:
+        """Return the index, in the concatenated dataset, of the next sample for this rank."""
+        if self.shuffle:
+            # 1) Picking the dataset according to the dataset probabilities.
+            selected_dataset_idx = self.rng.choice(range(len(self.dataset_weights)), p=self.dataset_probablities)
+            in_rank_sample = self._in_rank_next_index_for_dataset(selected_dataset_idx)
+
+            # 2) Getting sample index in the dataset.
+            sample_idx_in_dataset = in_rank_sample * self.num_replicas + self.rank
+
+        else:
+            # Iterate through the dataset orders in a round-robin fashion, offset by the rank
+            dataset_orders_idx = (self.rank + self.drawn_samples) % len(self.dataset_orders)
+            selected_dataset_idx = self.dataset_orders[dataset_orders_idx]
+            # Get the sample index in the selected dataset by skipping with the num_replicas * drawn_samples
+            sample_idx_in_dataset = (self.drawn_samples * self.num_replicas + self.rank) % self.dataset_sizes[
+                selected_dataset_idx
+            ]
+            self.drawn_samples += 1
+
+        # Getting the index of the sample in the whole dataset.
+        # For example if the total dataset has 4 datasets with sizes [10, 20, 30, 5],
+        # and our selected_dataset_idx=2 (0-based, i.e. the 3rd dataset) and sample_idx_in_dataset=5,
+        # then the index of the sample in the whole dataset is
+        #   10 (for dataset 0) + 20 (for dataset 1) + 5 (for sample_idx_in_dataset) = 35
+        # This is because the first 10 samples are from the first dataset, the next 20 are from the second
+        # dataset, and then we reach the 3rd dataset, which is the selected one.
+        dataset_offset = sum(self.dataset_sizes[:selected_dataset_idx])
+
+        return dataset_offset + sample_idx_in_dataset
diff --git a/src/datasets/utils/worker_init_fn.py b/src/datasets/utils/worker_init_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..134bddbccf8c7889a9f08bb60fbfb79a2daac6c1
--- /dev/null
+++ b/src/datasets/utils/worker_init_fn.py
@@ -0,0 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# Copyright The Lightning AI team.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This code originally comes from PyTorch Lightning with some light modifications:
+# https://github.com/Lightning-AI/pytorch-lightning/blob/a944e7744e57a5a2c13f3c73b9735edf2f71e329/src/lightning/fabric/utilities/seed.py
+
+
+import os
+import random
+from typing import Optional
+
+import numpy as np
+import torch
+
+from src.utils.logging import get_logger
+
+logger = get_logger("worker_init_fn")
+
+
+def _generate_seed_sequence(base_seed: int, worker_id: int, global_rank: int, count: int) -> list[int]:
+    """Derive ``count`` seeds from a base seed, worker id and rank with a linear congruential generator (LCG)."""
+    multiplier = 6364136223846793005  # D. Knuth's constant
+    mask_64 = (1 << 64) - 1  # keeps the state modulo 2^64
+
+    # Starting state: base seed shifted by 32 bits, worker id shifted by 16 bits, rank in the low bits.
+    state = (base_seed << 32) | (worker_id << 16) | global_rank
+    sequence = []
+    for _ in range(count):
+        # x_(n+1) = (a * x_n + c) mod m, with c=1 and m=2^64
+        state = (state * multiplier + 1) & mask_64
+        sequence.append(state)
+    return sequence
+
+
+def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None:  # pragma: no cover
+    r"""Seed the global ``torch``, ``random`` and ``numpy`` generators of a dataloader worker.
+
+    Intended to be passed as ``worker_init_fn`` to a dataloader. When ``rank`` is not given it is read
+    from the ``SLURM_PROCID`` environment variable, falling back to 0 (with a warning) if that is unset.
+
+    See also the PyTorch documentation on
+    `randomness in DataLoaders <https://pytorch.org/docs/stable/notes/randomness.html#dataloader>`_.
+
+    """
+    # implementation notes: https://github.com/pytorch/pytorch/issues/5059#issuecomment-817392562
+    if rank is None:
+        slurm_procid = os.environ.get("SLURM_PROCID")
+        if slurm_procid is not None:
+            rank = int(slurm_procid)
+        else:
+            logger.warning("SLURM_PROCID is not set, setting rank to 0")
+            rank = 0
+
+    # back out the base seed so we can use all the bits
+    base_seed = torch.initial_seed() - worker_id
+    logger.debug(
+        f"Initializing random number generators of process {rank} worker {worker_id} with base seed {base_seed}"
+    )
+    torch_seed, py_seed_high, py_seed_low, _ = _generate_seed_sequence(base_seed, worker_id, rank, count=4)
+    torch.manual_seed(torch_seed)  # torch takes a 64-bit seed
+    random.seed((py_seed_high << 32) | py_seed_low)  # combine two 64-bit seeds
+
+    # numpy is seeded from its own SeedSequence rather than from the LCG output above.
+    numpy_seed_words = np.random.SeedSequence([base_seed, worker_id, rank]).generate_state(4)
+    np.random.seed(numpy_seed_words)
diff --git a/src/datasets/video_dataset.py b/src/datasets/video_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0c47f66dd65158af2b3c05608cd6474811a4c29
--- /dev/null
+++ b/src/datasets/video_dataset.py
@@ -0,0 +1,373 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import os
+import pathlib
+import warnings
+from logging import getLogger
+
+import numpy as np
+import pandas as pd
+import torch
+import torchvision
+from decord import VideoReader, cpu
+
+from src.datasets.utils.dataloader import ConcatIndices, MonitoredDataset, NondeterministicDataLoader
+from src.datasets.utils.weighted_sampler import DistributedWeightedSampler
+
+_GLOBAL_SEED = 0
+logger = getLogger()
+
+
+def make_videodataset(
+    data_paths,
+    batch_size,
+    frames_per_clip=8,
+    dataset_fpcs=None,
+    frame_step=4,
+    duration=None,
+    fps=None,
+    num_clips=1,
+    random_clip_sampling=True,
+    allow_clip_overlap=False,
+    filter_short_videos=False,
+    filter_long_videos=int(10**9),
+    transform=None,
+    shared_transform=None,
+    rank=0,
+    world_size=1,
+    datasets_weights=None,
+    collator=None,
+    drop_last=True,
+    num_workers=10,
+    pin_mem=True,
+    persistent_workers=True,
+    deterministic=True,
+    log_dir=None,
+):
+    """Create a `VideoDataset` together with its distributed sampler and data loader.
+
+    The dataset is wrapped in a `MonitoredDataset` when `log_dir` is given. A
+    `DistributedWeightedSampler` is used when `datasets_weights` is provided, otherwise torch's
+    `DistributedSampler`; both shuffle. `deterministic` selects between torch's `DataLoader` and
+    `NondeterministicDataLoader`.
+
+    Returns:
+        The tuple `(dataset, data_loader, dist_sampler)`.
+    """
+    dataset = VideoDataset(
+        data_paths=data_paths,
+        datasets_weights=datasets_weights,
+        frames_per_clip=frames_per_clip,
+        dataset_fpcs=dataset_fpcs,
+        duration=duration,
+        fps=fps,
+        frame_step=frame_step,
+        num_clips=num_clips,
+        random_clip_sampling=random_clip_sampling,
+        allow_clip_overlap=allow_clip_overlap,
+        filter_short_videos=filter_short_videos,
+        filter_long_videos=filter_long_videos,
+        shared_transform=shared_transform,
+        transform=transform,
+    )
+
+    if log_dir:
+        log_dir = pathlib.Path(log_dir)
+        log_dir.mkdir(parents=True, exist_ok=True)
+        # Worker ID will replace '%w'
+        resource_log_filename = log_dir / f"resource_file_{rank}_%w.csv"
+        dataset = MonitoredDataset(
+            dataset=dataset,
+            log_filename=str(resource_log_filename),
+            log_interval=10.0,
+            monitor_interval=5.0,
+        )
+    else:
+        log_dir = None
+
+    logger.info("VideoDataset dataset created")
+    if datasets_weights is None:
+        dist_sampler = torch.utils.data.distributed.DistributedSampler(
+            dataset, num_replicas=world_size, rank=rank, shuffle=True
+        )
+    else:
+        dist_sampler = DistributedWeightedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True)
+
+    # Both loader classes are constructed with exactly the same arguments.
+    loader_cls = torch.utils.data.DataLoader if deterministic else NondeterministicDataLoader
+    data_loader = loader_cls(
+        dataset,
+        collate_fn=collator,
+        sampler=dist_sampler,
+        batch_size=batch_size,
+        drop_last=drop_last,
+        pin_memory=pin_mem,
+        num_workers=num_workers,
+        persistent_workers=(num_workers > 0) and persistent_workers,
+    )
+    logger.info("VideoDataset unsupervised data loader created")
+
+    return dataset, data_loader, dist_sampler
+
+
+class VideoDataset(torch.utils.data.Dataset):
+    """Video classification dataset.
+
+    Samples are listed in one or more index files: ``.csv`` files (first column is the sample path,
+    second column the label) or ``.npy`` files (paths only, every label is 0). Sample paths ending in
+    jpg/png/jpeg are loaded as images and repeated along time; everything else is decoded with decord.
+    Each item is a ``(clips, label, clip_indices)`` tuple.
+    """
+
+    def __init__(
+        self,
+        data_paths,
+        datasets_weights=None,
+        frames_per_clip=16,
+        fps=None,
+        dataset_fpcs=None,
+        frame_step=4,
+        num_clips=1,
+        transform=None,
+        shared_transform=None,
+        random_clip_sampling=True,
+        allow_clip_overlap=False,
+        filter_short_videos=False,
+        filter_long_videos=int(10**9),
+        duration=None,  # duration in seconds
+    ):
+        """
+        Args:
+            data_paths: path (or list of paths) of the ``.csv`` / ``.npy`` index files.
+            datasets_weights: optional per-dataset weights, turned into per-sample weights.
+            frames_per_clip: frames per clip used for every dataset when `dataset_fpcs` is None.
+            fps, duration, frame_step: exactly one must be non-None; it determines the frame stride.
+            dataset_fpcs: optional per-dataset frames per clip, one entry per data path.
+            num_clips: number of clips sampled from each video.
+            transform: optional callable applied to each clip.
+            shared_transform: optional callable applied to the whole frame buffer before splitting.
+            random_clip_sampling: sample the clip window at a random position inside its segment.
+            allow_clip_overlap: let clips overlap when a segment is shorter than a clip.
+            filter_short_videos: skip videos that have fewer frames than one clip spans.
+            filter_long_videos: skip video files larger than this many bytes.
+
+        Raises:
+            ValueError: if not exactly one of fps/duration/frame_step is given, or if
+                `dataset_fpcs` does not have one entry per data path.
+        """
+        self.datasets_weights = datasets_weights
+        self.frame_step = frame_step
+        self.num_clips = num_clips
+        self.transform = transform
+        self.shared_transform = shared_transform
+        self.random_clip_sampling = random_clip_sampling
+        self.allow_clip_overlap = allow_clip_overlap
+        self.filter_short_videos = filter_short_videos
+        self.filter_long_videos = filter_long_videos
+        self.duration = duration
+        self.fps = fps
+
+        if sum([v is not None for v in (fps, duration, frame_step)]) != 1:
+            raise ValueError(f"Must specify exactly one of either {fps=}, {duration=}, or {frame_step=}.")
+
+        if isinstance(data_paths, str):
+            data_paths = [data_paths]
+        # Keep the normalized list: looping over a bare string would walk its characters and load nothing.
+        self.data_paths = data_paths
+
+        if dataset_fpcs is None:
+            self.dataset_fpcs = [frames_per_clip for _ in data_paths]
+        else:
+            if len(dataset_fpcs) != len(data_paths):
+                raise ValueError("Frames per clip not properly specified for NFS data paths")
+            self.dataset_fpcs = dataset_fpcs
+
+        if VideoReader is None:
+            raise ImportError('Unable to import "decord" which is required to read videos.')
+
+        # Load video paths and labels
+        samples, labels = [], []
+        self.num_samples_per_dataset = []
+        for data_path in self.data_paths:
+
+            if data_path.endswith(".csv"):
+                try:
+                    data = pd.read_csv(data_path, header=None, delimiter=" ")
+                except pd.errors.ParserError:
+                    # In image captioning datasets where we have space, we use :: as delimiter.
+                    # Multi-character separators are only supported by the python engine.
+                    data = pd.read_csv(data_path, header=None, delimiter="::", engine="python")
+                samples += list(data.values[:, 0])
+                labels += list(data.values[:, 1])
+                self.num_samples_per_dataset.append(len(data))
+
+            elif data_path.endswith(".npy"):
+                # NOTE(review): allow_pickle=True can execute code embedded in the file;
+                # only load index files from trusted sources.
+                data = np.load(data_path, allow_pickle=True)
+                # repr()[1:-1] strips the first and last character of each entry's repr.
+                data = [repr(x)[1:-1] for x in data]
+                samples += data
+                labels += [0] * len(data)
+                self.num_samples_per_dataset.append(len(data))
+
+        # Maps a global sample index to the dataset it belongs to.
+        self.per_dataset_indices = ConcatIndices(self.num_samples_per_dataset)
+
+        # [Optional] Weights for each sample to be used by downstream
+        # weighted video sampler
+        self.sample_weights = None
+        if self.datasets_weights is not None:
+            self.sample_weights = []
+            for dw, ns in zip(self.datasets_weights, self.num_samples_per_dataset):
+                self.sample_weights += [dw / ns] * ns
+
+        self.samples = samples
+        self.labels = labels
+
+    def __getitem__(self, index):
+        """Return the item at `index`, or a randomly chosen other item if it cannot be loaded."""
+        sample = self.samples[index]
+        loaded_sample = False
+        # Keep trying to load videos until you find a valid sample
+        while not loaded_sample:
+            if not isinstance(sample, str):
+                logger.warning("Invalid sample.")
+            else:
+                if sample.split(".")[-1].lower() in ("jpg", "png", "jpeg"):
+                    loaded_sample = self.get_item_image(index)
+                else:
+                    loaded_sample = self.get_item_video(index)
+
+            if not loaded_sample:
+                # Loading failed: retry with a random replacement index.
+                index = np.random.randint(len(self))
+                sample = self.samples[index]
+
+        return loaded_sample
+
+    def get_item_video(self, index):
+        """Load the video at `index`; return `(clips, label, clip_indices)` or None on failure."""
+        sample = self.samples[index]
+        dataset_idx, _ = self.per_dataset_indices[index]
+        frames_per_clip = self.dataset_fpcs[dataset_idx]
+
+        buffer, clip_indices = self.loadvideo_decord(sample, frames_per_clip)  # [T H W 3]
+        loaded_video = len(buffer) > 0
+        if not loaded_video:
+            return
+
+        # Label/annotations for video
+        label = self.labels[index]
+
+        def split_into_clips(video):
+            """Split video into a list of clips"""
+            fpc = frames_per_clip
+            nc = self.num_clips
+            return [video[i * fpc : (i + 1) * fpc] for i in range(nc)]
+
+        # Parse video into frames & apply data augmentations
+        if self.shared_transform is not None:
+            buffer = self.shared_transform(buffer)
+        buffer = split_into_clips(buffer)
+        if self.transform is not None:
+            buffer = [self.transform(clip) for clip in buffer]
+
+        return buffer, label, clip_indices
+
+    def get_item_image(self, index):
+        """Load the image at `index` as a still clip; return `(clips, label, clip_indices)` or None."""
+        sample = self.samples[index]
+        dataset_idx, _ = self.per_dataset_indices[index]
+        fpc = self.dataset_fpcs[dataset_idx]
+
+        try:
+            image_tensor = torchvision.io.read_image(path=sample, mode=torchvision.io.ImageReadMode.RGB)
+        except Exception:
+            # Best effort: an unreadable image is reported as "not loaded" so the caller can retry.
+            return
+        label = self.labels[index]
+        clip_indices = [np.arange(start=0, stop=fpc, dtype=np.int32)]
+
+        # Expanding the input image [3, H, W] ==> [T, 3, H, W]
+        buffer = image_tensor.unsqueeze(dim=0).repeat((fpc, 1, 1, 1))
+        buffer = buffer.permute((0, 2, 3, 1))  # [T, 3, H, W] ==> [T H W 3]
+
+        if self.shared_transform is not None:
+            # Technically we can have only transform, doing this just for the sake of consistency with videos.
+            buffer = self.shared_transform(buffer)
+
+        if self.transform is not None:
+            buffer = [self.transform(buffer)]
+
+        return buffer, label, clip_indices
+
+    def loadvideo_decord(self, sample, fpc):
+        """Load video content using Decord.
+
+        Args:
+            sample: path of the video file.
+            fpc: number of frames per clip.
+
+        Returns:
+            `(buffer, clip_indices)` where `buffer` holds the frames of all clips and `clip_indices`
+            is the list of per-clip frame indices, or `([], None)` when the video is missing,
+            filtered out or cannot be read.
+        """
+
+        fname = sample
+        if not os.path.exists(fname):
+            warnings.warn(f"video path not found {fname=}")
+            return [], None
+
+        _fsize = os.path.getsize(fname)
+        if _fsize > self.filter_long_videos:
+            warnings.warn(f"skipping long video of size {_fsize=} (bytes)")
+            return [], None
+
+        try:
+            vr = VideoReader(fname, num_threads=-1, ctx=cpu(0))
+        except Exception:
+            return [], None
+
+        fstp = self.frame_step
+        if self.duration is not None or self.fps is not None:
+            try:
+                video_fps = math.ceil(vr.get_avg_fps())
+            except Exception as e:
+                logger.warning(e)
+                # Without the frame rate the stride cannot be derived; skip the video like any
+                # other unreadable one instead of failing later on an unbound `video_fps`.
+                return [], None
+
+            if self.duration is not None:
+                assert self.fps is None
+                fstp = int(self.duration * video_fps / fpc)
+            else:
+                assert self.duration is None
+                fstp = video_fps // self.fps
+
+        assert fstp is not None and fstp > 0
+        clip_len = int(fpc * fstp)
+
+        if self.filter_short_videos and len(vr) < clip_len:
+            warnings.warn(f"skipping video of length {len(vr)}")
+            return [], None
+
+        vr.seek(0)  # Go to start of video before sampling frames
+
+        # Partition video into equal sized segments and sample each clip
+        # from a different segment
+        partition_len = len(vr) // self.num_clips
+
+        all_indices, clip_indices = [], []
+        for i in range(self.num_clips):
+
+            if partition_len > clip_len:
+                # If partition_len > clip len, then sample a random window of
+                # clip_len frames within the segment
+                end_indx = clip_len
+                if self.random_clip_sampling:
+                    end_indx = np.random.randint(clip_len, partition_len)
+                start_indx = end_indx - clip_len
+                indices = np.linspace(start_indx, end_indx, num=fpc)
+                indices = np.clip(indices, start_indx, end_indx - 1).astype(np.int64)
+                # --
+                indices = indices + i * partition_len
+            else:
+                # If partition overlap not allowed and partition_len < clip_len
+                # then repeatedly append the last frame in the segment until
+                # we reach the desired clip length
+                if not self.allow_clip_overlap:
+                    indices = np.linspace(0, partition_len, num=partition_len // fstp)
+                    indices = np.concatenate(
+                        (
+                            indices,
+                            np.ones(fpc - partition_len // fstp) * partition_len,
+                        )
+                    )
+                    indices = np.clip(indices, 0, partition_len - 1).astype(np.int64)
+                    # --
+                    indices = indices + i * partition_len
+
+                # If partition overlap is allowed and partition_len < clip_len
+                # then start_indx of segment i+1 will lie within segment i
+                else:
+                    sample_len = min(clip_len, len(vr)) - 1
+                    indices = np.linspace(0, sample_len, num=sample_len // fstp)
+                    indices = np.concatenate(
+                        (
+                            indices,
+                            np.ones(fpc - sample_len // fstp) * sample_len,
+                        )
+                    )
+                    indices = np.clip(indices, 0, sample_len - 1).astype(np.int64)
+                    # --
+                    clip_step = 0
+                    if len(vr) > clip_len:
+                        clip_step = (len(vr) - clip_len) // (self.num_clips - 1)
+                    indices = indices + i * clip_step
+
+            clip_indices.append(indices)
+            all_indices.extend(list(indices))
+
+        buffer = vr.get_batch(all_indices).asnumpy()
+        return buffer, clip_indices
+
+    def __len__(self):
+        """Return the total number of samples across all datasets."""
+        return len(self.samples)
diff --git a/src/hub/__init__.py b/src/hub/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/hub/backbones.py b/src/hub/backbones.py
new file mode 100644
index 0000000000000000000000000000000000000000..35308f8666b8988295b513d163d0395a31874735
--- /dev/null
+++ b/src/hub/backbones.py
@@ -0,0 +1,177 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+# Base URL of the pretrained checkpoints; files are fetched as "<VJEPA_BASE_URL>/<checkpoint name>.pt".
+VJEPA_BASE_URL = "https://dl.fbaipublicfiles.com/vjepa2"
+
+# for testing
+# VJEPA_BASE_URL = "http://localhost:8300"
+
+# Maps a public model name to a pair:
+#   (name of the encoder constructor looked up in the vision_transformer module,
+#    checkpoint file name, without extension, appended to VJEPA_BASE_URL).
+ARCH_NAME_MAP = {
+    "vit_large": ("vit_large", "vitl"),
+    "vit_huge": ("vit_huge", "vith"),
+    "vit_giant": ("vit_giant_xformers", "vitg"),
+    "vit_ac_giant": ("vit_giant_xformers", "vjepa2-ac-vitg"),
+    "vit_giant_384": ("vit_giant_xformers", "vitg-384"),
+}
+
+
+def _clean_backbone_key(state_dict):
+    """Remove every ``module.`` and ``backbone.`` substring from the keys of ``state_dict``.
+
+    The mapping is modified in place and also returned.
+    """
+    # Snapshot the entries first so the mapping can be mutated while walking them.
+    for old_key, value in list(state_dict.items()):
+        del state_dict[old_key]
+        new_key = old_key.replace("module.", "").replace("backbone.", "")
+        state_dict[new_key] = value
+    return state_dict
+
+
+def _make_vjepa2_ac_model(
+    *,
+    model_name: str = "vit_ac_giant",
+    img_size=256,
+    patch_size=16,
+    tubelet_size=2,
+    num_frames=64,
+    pretrained: bool = True,
+    **kwargs,
+):
+    """Build a V-JEPA 2-AC encoder and its AC predictor, optionally loading pretrained weights.
+
+    Extra keyword arguments override the defaults of both the encoder and the predictor.
+    With `pretrained=True` the checkpoint is downloaded from `VJEPA_BASE_URL`; the encoder weights
+    are loaded non-strictly and the predictor weights strictly.
+
+    Returns:
+        The tuple `(encoder, predictor)`.
+    """
+    from ..models import ac_predictor as vit_ac_predictor
+    from ..models import vision_transformer as vit_encoder
+
+    resolution = (img_size, img_size)
+
+    encoder_kwargs = {
+        "patch_size": patch_size,
+        "img_size": resolution,
+        "num_frames": num_frames,
+        "tubelet_size": tubelet_size,
+        "use_sdpa": True,
+        "use_SiLU": False,
+        "wide_SiLU": True,
+        "uniform_power": False,
+        "use_rope": True,
+        **kwargs,
+    }
+    encoder_arch = ARCH_NAME_MAP[model_name][0]
+    encoder = vit_encoder.__dict__[encoder_arch](**encoder_kwargs)
+
+    predictor_kwargs = {
+        "img_size": resolution,
+        "patch_size": patch_size,
+        "num_frames": num_frames,
+        "tubelet_size": tubelet_size,
+        "embed_dim": encoder.embed_dim,
+        **kwargs,
+    }
+    predictor = vit_ac_predictor.__dict__["vit_ac_predictor"](**predictor_kwargs)
+
+    if not pretrained:
+        return encoder, predictor
+
+    checkpoint_name = ARCH_NAME_MAP[model_name][-1]
+    checkpoint = torch.hub.load_state_dict_from_url(f"{VJEPA_BASE_URL}/{checkpoint_name}.pt", map_location="cpu")
+    encoder.load_state_dict(_clean_backbone_key(checkpoint["encoder"]), strict=False)
+    predictor.load_state_dict(_clean_backbone_key(checkpoint["predictor"]), strict=True)
+
+    return encoder, predictor
+
+
+def _make_vjepa2_model(
+    *,
+    model_name: str = "vit_large",
+    img_size=256,
+    patch_size=16,
+    tubelet_size=2,
+    num_frames=64,
+    pretrained: bool = True,
+    **kwargs,
+):
+    """Build a V-JEPA 2 encoder and its predictor, optionally loading pretrained weights.
+
+    Extra keyword arguments override the defaults of both the encoder and the predictor.
+    With `pretrained=True` the checkpoint is downloaded from `VJEPA_BASE_URL` and both sets of
+    weights are loaded non-strictly.
+
+    Returns:
+        The tuple `(encoder, predictor)`.
+    """
+    from ..models import predictor as vit_predictor
+    from ..models import vision_transformer as vit_encoder
+
+    resolution = (img_size, img_size)
+
+    encoder_kwargs = {
+        "patch_size": patch_size,
+        "img_size": resolution,
+        "num_frames": num_frames,
+        "tubelet_size": tubelet_size,
+        "use_sdpa": True,
+        "use_SiLU": False,
+        "wide_SiLU": True,
+        "uniform_power": False,
+        "use_rope": True,
+        **kwargs,
+    }
+    encoder_arch = ARCH_NAME_MAP[model_name][0]
+    encoder = vit_encoder.__dict__[encoder_arch](**encoder_kwargs)
+
+    predictor_kwargs = {
+        "img_size": resolution,
+        "patch_size": patch_size,
+        "use_mask_tokens": True,
+        "embed_dim": encoder.embed_dim,
+        "predictor_embed_dim": 384,
+        "num_frames": num_frames,
+        "tubelet_size": tubelet_size,
+        "depth": 12,
+        "num_heads": 12,
+        "num_mask_tokens": 10,
+        "use_rope": True,
+        "uniform_power": False,
+        "use_sdpa": True,
+        "use_silu": False,
+        "wide_silu": True,
+        **kwargs,
+    }
+    predictor = vit_predictor.__dict__["vit_predictor"](**predictor_kwargs)
+
+    if not pretrained:
+        return encoder, predictor
+
+    checkpoint_name = ARCH_NAME_MAP[model_name][-1]
+    checkpoint = torch.hub.load_state_dict_from_url(f"{VJEPA_BASE_URL}/{checkpoint_name}.pt", map_location="cpu")
+    # strict=False for both: state_dict has pos_embed but we use RoPE
+    encoder.load_state_dict(_clean_backbone_key(checkpoint["encoder"]), strict=False)
+    predictor.load_state_dict(_clean_backbone_key(checkpoint["predictor"]), strict=False)
+
+    return encoder, predictor
+
+
+def vjepa2_vit_large(*, pretrained: bool = True, **kwargs):
+    """
+    Build the VJEPA 2 ViT-Large model (img_size=256); returns the result of `_make_vjepa2_model`.
+    """
+    factory_args = {"model_name": "vit_large", "img_size": 256, "pretrained": pretrained}
+    return _make_vjepa2_model(**factory_args, **kwargs)
+
+
+def vjepa2_vit_huge(*, pretrained: bool = True, **kwargs):
+    """
+    Build the VJEPA 2 ViT-Huge model (img_size=256); returns the result of `_make_vjepa2_model`.
+    """
+    factory_args = {"model_name": "vit_huge", "img_size": 256, "pretrained": pretrained}
+    return _make_vjepa2_model(**factory_args, **kwargs)
+
+
+def vjepa2_vit_giant(*, pretrained: bool = True, **kwargs):
+    """
+    Build the VJEPA 2 ViT-giant model (img_size=256); returns the result of `_make_vjepa2_model`.
+    """
+    factory_args = {"model_name": "vit_giant", "img_size": 256, "pretrained": pretrained}
+    return _make_vjepa2_model(**factory_args, **kwargs)
+
+
+def vjepa2_vit_giant_384(*, pretrained: bool = True, **kwargs):
+    """
+    Build the VJEPA 2 ViT-giant-384 model (img_size=384); returns the result of `_make_vjepa2_model`.
+    """
+    factory_args = {"model_name": "vit_giant_384", "img_size": 384, "pretrained": pretrained}
+    return _make_vjepa2_model(**factory_args, **kwargs)
+
+
+def vjepa2_ac_vit_giant(*, pretrained: bool = True, **kwargs):
+    """
+    Build the VJEPA 2-AC ViT-giant model (img_size=256); returns the result of `_make_vjepa2_ac_model`.
+    """
+    factory_args = {"model_name": "vit_ac_giant", "img_size": 256, "pretrained": pretrained}
+    return _make_vjepa2_ac_model(**factory_args, **kwargs)
diff --git a/src/masks/__pycache__/utils.cpython-312.pyc b/src/masks/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..241a9b19f57c9bc39e59b4ddbc93054c2e2b6329
Binary files /dev/null and b/src/masks/__pycache__/utils.cpython-312.pyc differ
diff --git a/src/masks/default.py b/src/masks/default.py
new file mode 100644
index 0000000000000000000000000000000000000000..11bd3de0a124ba12aba5bb8e27fbe46f44895122
--- /dev/null
+++ b/src/masks/default.py
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from logging import getLogger
+
+import torch
+
+_GLOBAL_SEED = 0
+logger = getLogger()
+
+
+class DefaultCollator(object):
+    """Collator that applies torch's default collation and returns two ``None`` placeholders."""
+
+    def __call__(self, batch):
+        # NOTE(review): the two trailing None values presumably stand in for the masks that the
+        # other collators in this package return -- confirm against the callers.
+        return torch.utils.data.default_collate(batch), None, None
diff --git a/src/masks/multiseq_multiblock3d.py b/src/masks/multiseq_multiblock3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..85d9b049a79e4c9db647f76fc2406462fe4db0a6
--- /dev/null
+++ b/src/masks/multiseq_multiblock3d.py
@@ -0,0 +1,239 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from logging import getLogger
+from multiprocessing import Value
+
+import torch
+
+_GLOBAL_SEED = 0
+logger = getLogger()
+
+
+class MaskCollator(object):
+
+    def __init__(
+        self,
+        cfgs_mask,
+        dataset_fpcs,
+        crop_size=(224, 224),
+        patch_size=(16, 16),
+        tubelet_size=2,
+    ):
+        super(MaskCollator, self).__init__()
+
+        self.mask_generators = dict()
+        for fpc in dataset_fpcs:
+            self.mask_generators[fpc] = []
+            for m in cfgs_mask:
+                mask_generator = _MaskGenerator(
+                    crop_size=crop_size,
+                    num_frames=fpc,
+                    spatial_patch_size=patch_size,
+                    temporal_patch_size=tubelet_size,
+                    spatial_pred_mask_scale=m.get("spatial_scale"),
+                    temporal_pred_mask_scale=m.get("temporal_scale"),
+                    aspect_ratio=m.get("aspect_ratio"),
+                    npred=m.get("num_blocks"),
+                    max_context_frames_ratio=m.get("max_temporal_keep", 1.0),
+                    max_keep=m.get("max_keep", None),
+                    full_complement=m.get("full_complement", False),
+                    pred_full_complement=m.get("pred_full_complement", False),
+                    inv_block=m.get("inv_block", False),
+                )
+                self.mask_generators[fpc].append(mask_generator)
+
+    def step(self):
+        for fpc in self.mask_generators:
+            for mask_generator in self.mask_generators[fpc]:
+                mask_generator.step()
+
+    def __call__(self, batch):
+
+        # Batch: [buffer, label, clip_indices]
+        filtered_batches = {fpc: [] for fpc in self.mask_generators}
+        for sample in batch:
+            fpc = len(sample[-1][-1])
+            filtered_batches[fpc] += [sample]
+
+        fpc_collations = []
+        for fpc in filtered_batches:
+            fpc_batch = filtered_batches[fpc]
+            batch_size = len(fpc_batch)
+            if batch_size == 0:
+                continue
+            collated_batch = torch.utils.data.default_collate(fpc_batch)
+            collated_masks_pred, collated_masks_enc = [], []
+            for i, mask_generator in enumerate(self.mask_generators[fpc]):
+                masks_enc, masks_pred = mask_generator(batch_size)
+                collated_masks_enc.append(masks_enc)
+                collated_masks_pred.append(masks_pred)
+            fpc_collations += [(collated_batch, collated_masks_enc, collated_masks_pred)]
+
+        return fpc_collations
+
+
+class _MaskGenerator(object):
+
+    def __init__(
+        self,
+        crop_size=(224, 224),
+        num_frames=16,
+        spatial_patch_size=(16, 16),
+        temporal_patch_size=2,
+        spatial_pred_mask_scale=(0.2, 0.8),
+        temporal_pred_mask_scale=(1.0, 1.0),
+        aspect_ratio=(0.3, 3.0),
+        npred=1,
+        max_context_frames_ratio=1.0,
+        max_keep=None,
+        inv_block=False,
+        full_complement=False,
+        pred_full_complement=False,
+    ):
+        super(_MaskGenerator, self).__init__()
+        if not isinstance(crop_size, tuple):
+            crop_size = (crop_size,) * 2
+        if not isinstance(spatial_patch_size, tuple):
+            spatial_patch_size = (spatial_patch_size,) * 2
+        self.crop_size = crop_size
+        self.height, self.width = [crop_size[i] // spatial_patch_size[i] for i in (0, 1)]
+        self.duration = num_frames // temporal_patch_size
+        self.full_complement = full_complement
+        self.pred_full_complement = pred_full_complement
+
+        self.spatial_patch_size = spatial_patch_size
+        self.temporal_patch_size = temporal_patch_size
+
+        self.aspect_ratio = aspect_ratio
+        self.spatial_pred_mask_scale = spatial_pred_mask_scale
+        self.temporal_pred_mask_scale = temporal_pred_mask_scale
+        self.npred = npred
+        self.max_context_duration = max(
+            1, int(self.duration * max_context_frames_ratio)
+        )  # maximum number of time-steps (frames) spanned by context mask
+        self.max_keep = max_keep  # maximum number of patches to keep in context
+        self._itr_counter = Value("i", -1)  # collator is shared across worker processes
+        self.inv_block = inv_block
+
+    def step(self):
+        i = self._itr_counter
+        with i.get_lock():
+            i.value += 1
+            v = i.value
+        return v
+
+    def _sample_block_size(self, generator, temporal_scale, spatial_scale, aspect_ratio_scale):
+        # -- Sample temporal block mask scale
+        _rand = torch.rand(1, generator=generator).item()
+        min_t, max_t = temporal_scale
+        temporal_mask_scale = min_t + _rand * (max_t - min_t)
+        t = max(1, int(self.duration * temporal_mask_scale))
+
+        # -- Sample spatial block mask scale
+        _rand = torch.rand(1, generator=generator).item()
+        min_s, max_s = spatial_scale
+        spatial_mask_scale = min_s + _rand * (max_s - min_s)
+        spatial_num_keep = int(self.height * self.width * spatial_mask_scale)
+
+        # -- Sample block aspect-ratio
+        _rand = torch.rand(1, generator=generator).item()
+        min_ar, max_ar = aspect_ratio_scale
+        aspect_ratio = min_ar + _rand * (max_ar - min_ar)
+
+        # -- Compute block height and width (given scale and aspect-ratio)
+        h = int(round(math.sqrt(spatial_num_keep * aspect_ratio)))
+        w = int(round(math.sqrt(spatial_num_keep / aspect_ratio)))
+        h = min(h, self.height)
+        w = min(w, self.width)
+
+        return (t, h, w)
+
+    def _sample_block_mask(self, b_size):
+        t, h, w = b_size
+        top = torch.randint(0, self.height - h + 1, (1,))
+        left = torch.randint(0, self.width - w + 1, (1,))
+        start = torch.randint(0, self.duration - t + 1, (1,))
+
+        mask = torch.ones((self.duration, self.height, self.width), dtype=torch.int32)
+        mask[start : start + t, top : top + h, left : left + w] = 0
+
+        # Context mask will only span the first X frames
+        # (X=self.max_context_frames)
+        if self.max_context_duration < self.duration:
+            mask[self.max_context_duration :, :, :] = 0
+
+        # --
+        return mask
+
+    def __call__(self, batch_size):
+        """
+        Create encoder and predictor masks when collating imgs into a batch
+        # 1. sample pred block size using seed
+        # 2. sample several pred block locations for each image (w/o seed)
+        # 3. return pred masks and complement (enc mask)
+        """
+        seed = self.step()
+        g = torch.Generator()
+        g.manual_seed(seed)
+        p_size = self._sample_block_size(
+            generator=g,
+            temporal_scale=self.temporal_pred_mask_scale,
+            spatial_scale=self.spatial_pred_mask_scale,
+            aspect_ratio_scale=self.aspect_ratio,
+        )
+
+        collated_masks_pred, collated_masks_enc = [], []
+        min_keep_enc = min_keep_pred = self.duration * self.height * self.width
+        for _ in range(batch_size):
+
+            empty_context = True
+            while empty_context:
+
+                mask_e = torch.ones((self.duration, self.height, self.width), dtype=torch.int32)
+                for _ in range(self.npred):
+                    mask_e *= self._sample_block_mask(p_size)
+                mask_e = mask_e.flatten()
+
+                mask_p = torch.argwhere(mask_e == 0).squeeze()
+                mask_e = torch.nonzero(mask_e).squeeze()
+
+                empty_context = len(mask_e) == 0
+                if not empty_context:
+                    min_keep_pred = min(min_keep_pred, len(mask_p))
+                    min_keep_enc = min(min_keep_enc, len(mask_e))
+                    collated_masks_pred.append(mask_p)
+                    collated_masks_enc.append(mask_e)
+
+        if self.max_keep is not None:
+            min_keep_enc = min(min_keep_enc, self.max_keep)
+
+        collated_masks_enc = [cm[:min_keep_enc] for cm in collated_masks_enc]
+        collated_masks_pred = [cm[:min_keep_pred] for cm in collated_masks_pred]
+        if self.full_complement:  # predictor mask is just complement of encoder mask
+            collated_masks_pred = [
+                torch.tensor(
+                    sorted(list(set(range(int(self.duration * self.height * self.width))) - set(cm.tolist()))),
+                    dtype=cm.dtype,
+                )
+                for cm in collated_masks_enc
+            ]
+        elif self.pred_full_complement:
+            collated_masks_enc = [
+                torch.tensor(
+                    sorted(list(set(range(int(self.duration * self.height * self.width))) - set(cm.tolist()))),
+                    dtype=cm.dtype,
+                )
+                for cm in collated_masks_pred
+            ]
+
+        collated_masks_enc = torch.utils.data.default_collate(collated_masks_enc)
+        collated_masks_pred = torch.utils.data.default_collate(collated_masks_pred)
+
+        if self.inv_block:
+            return collated_masks_pred, collated_masks_enc  # predict context from block
+        else:
+            return collated_masks_enc, collated_masks_pred
diff --git a/src/masks/utils.py b/src/masks/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1950be58bd0a95e7ab99113fd7abd40f92af4c8
--- /dev/null
+++ b/src/masks/utils.py
@@ -0,0 +1,21 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+def apply_masks(x, masks, concat=True):
+    """
+    :param x: tensor of shape [B (batch-size), N (num-patches), D (feature-dim)]
+    :param masks: list of tensors of shape [B, K] containing indices of K patches in [N] to keep
+    """
+    all_x = []
+    for m in masks:
+        mask_keep = m.unsqueeze(-1).repeat(1, 1, x.size(-1))
+        all_x += [torch.gather(x, dim=1, index=mask_keep)]
+    if not concat:
+        return all_x
+
+    return torch.cat(all_x, dim=0)
diff --git a/src/models/__pycache__/attentive_pooler.cpython-312.pyc b/src/models/__pycache__/attentive_pooler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d44ccb3b188bca2bda979c07417d951b75125d3d
Binary files /dev/null and b/src/models/__pycache__/attentive_pooler.cpython-312.pyc differ
diff --git a/src/models/__pycache__/vision_transformer.cpython-312.pyc b/src/models/__pycache__/vision_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7abfff43d598018a8ba5473df62433b30160d20b
Binary files /dev/null and b/src/models/__pycache__/vision_transformer.cpython-312.pyc differ
diff --git a/src/models/ac_predictor.py b/src/models/ac_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c2d13ecfcab2ddf54fdef586bc69230c4ed1788
--- /dev/null
+++ b/src/models/ac_predictor.py
@@ -0,0 +1,200 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from src.models.utils.modules import ACBlock as Block
+from src.models.utils.modules import build_action_block_causal_attention_mask
+from src.utils.tensors import trunc_normal_
+
+
+class VisionTransformerPredictorAC(nn.Module):
+    """Action Conditioned Vision Transformer Predictor"""
+
+    def __init__(
+        self,
+        img_size=(224, 224),
+        patch_size=16,
+        num_frames=1,
+        tubelet_size=2,
+        embed_dim=768,
+        predictor_embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.0,
+        norm_layer=nn.LayerNorm,
+        init_std=0.02,
+        uniform_power=True,
+        use_silu=False,
+        wide_silu=True,
+        is_frame_causal=True,
+        use_activation_checkpointing=False,
+        use_rope=True,
+        action_embed_dim=7,
+        use_extrinsics=False,
+        **kwargs
+    ):
+        super().__init__()
+        self.is_frame_causal = is_frame_causal
+        self.use_extrinsics = use_extrinsics
+
+        # Map input to predictor dimension
+        self.predictor_embed = nn.Linear(embed_dim, predictor_embed_dim, bias=True)
+        self.action_encoder = nn.Linear(action_embed_dim, predictor_embed_dim, bias=True)
+        self.state_encoder = nn.Linear(action_embed_dim, predictor_embed_dim, bias=True)
+        self.extrinsics_encoder = nn.Linear(action_embed_dim - 1, predictor_embed_dim, bias=True)
+
+        # Determine positional embedding
+        if type(img_size) is int:
+            img_size = (img_size, img_size)
+        self.img_height, self.img_width = img_size
+        self.patch_size = patch_size
+        # --
+        self.num_frames = num_frames
+        self.tubelet_size = tubelet_size
+        self.is_video = num_frames > 1
+
+        self.grid_height = img_size[0] // self.patch_size
+        self.grid_width = img_size[1] // self.patch_size
+        self.use_activation_checkpointing = use_activation_checkpointing
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+
+        # Position embedding
+        self.uniform_power = uniform_power
+
+        # Attention Blocks
+        self.use_rope = use_rope
+        self.predictor_blocks = nn.ModuleList(
+            [
+                Block(
+                    use_rope=use_rope,
+                    grid_size=self.grid_height,
+                    dim=predictor_embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    act_layer=nn.SiLU if use_silu else nn.GELU,
+                    wide_silu=wide_silu,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+
+        # Normalize & project back to input dimension
+        self.predictor_norm = norm_layer(predictor_embed_dim)
+        self.predictor_proj = nn.Linear(predictor_embed_dim, embed_dim, bias=True)
+
+        # ------ initialize weights
+        self.init_std = init_std
+        self.apply(self._init_weights)
+        self._rescale_blocks()
+
+        attn_mask = None
+        if self.is_frame_causal:
+            grid_depth = self.num_frames // self.tubelet_size
+            grid_height = self.img_height // self.patch_size
+            grid_width = self.img_width // self.patch_size
+            attn_mask = build_action_block_causal_attention_mask(
+                grid_depth, grid_height, grid_width, add_tokens=3 if use_extrinsics else 2
+            )
+        self.attn_mask = attn_mask
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=self.init_std)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def _rescale_blocks(self):
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+
+        for layer_id, layer in enumerate(self.predictor_blocks):
+            rescale(layer.attn.proj.weight.data, layer_id + 1)
+            rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+    def forward(self, x, actions, states, extrinsics=None):
+        """
+        :param x: context tokens
+        """
+        # Map tokens to pedictor dimensions
+        x = self.predictor_embed(x)
+        B, N_ctxt, D = x.size()
+        T = N_ctxt // (self.grid_height * self.grid_width)
+
+        # Interleave action tokens
+        s = self.state_encoder(states).unsqueeze(2)
+        a = self.action_encoder(actions).unsqueeze(2)
+        x = x.view(B, T, self.grid_height * self.grid_width, D)  # [B, T, H*W, D]
+        if self.use_extrinsics:
+            e = self.extrinsics_encoder(extrinsics).unsqueeze(2)
+            x = torch.cat([a, s, e, x], dim=2).flatten(1, 2)  # [B, T*(H*W+3), D]
+        else:
+            x = torch.cat([a, s, x], dim=2).flatten(1, 2)  # [B, T*(H*W+2), D]
+
+        cond_tokens = 3 if self.use_extrinsics else 2
+        attn_mask = self.attn_mask[: x.size(1), : x.size(1)].to(x.device, non_blocking=True)
+
+        # Fwd prop
+        for i, blk in enumerate(self.predictor_blocks):
+            if self.use_activation_checkpointing:
+                x = torch.utils.checkpoint.checkpoint(
+                    blk,
+                    x,
+                    mask=None,
+                    attn_mask=attn_mask,
+                    T=T,
+                    H=self.grid_height,
+                    W=self.grid_width,
+                    action_tokens=cond_tokens,
+                    use_reentrant=False,
+                )
+            else:
+                x = blk(
+                    x,
+                    mask=None,
+                    attn_mask=attn_mask,
+                    T=T,
+                    H=self.grid_height,
+                    W=self.grid_width,
+                    action_tokens=cond_tokens,
+                )
+
+        # Split out action and frame tokens
+        x = x.view(B, T, cond_tokens + self.grid_height * self.grid_width, D)  # [B, T, K+H*W, D]
+        x = x[:, :, cond_tokens:, :].flatten(1, 2)
+
+        x = self.predictor_norm(x)
+        x = self.predictor_proj(x)
+
+        return x
+
+
+def vit_ac_predictor(**kwargs):
+    model = VisionTransformerPredictorAC(
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs
+    )
+    return model
diff --git a/src/models/attentive_pooler.py b/src/models/attentive_pooler.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bc2802c26e2f4a6af9d5d930deb8e488f887e25
--- /dev/null
+++ b/src/models/attentive_pooler.py
@@ -0,0 +1,137 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import math
+
+import torch
+import torch.nn as nn
+
+from src.models.utils.modules import Block, CrossAttention, CrossAttentionBlock
+from src.utils.tensors import trunc_normal_
+
+
+class AttentivePooler(nn.Module):
+    """Attentive Pooler"""
+
+    def __init__(
+        self,
+        num_queries=1,
+        embed_dim=768,
+        num_heads=12,
+        mlp_ratio=4.0,
+        depth=1,
+        norm_layer=nn.LayerNorm,
+        init_std=0.02,
+        qkv_bias=True,
+        complete_block=True,
+        use_activation_checkpointing=False,
+    ):
+        super().__init__()
+        self.use_activation_checkpointing = use_activation_checkpointing
+        self.query_tokens = nn.Parameter(torch.zeros(1, num_queries, embed_dim))
+
+        self.complete_block = complete_block
+        if complete_block:
+            self.cross_attention_block = CrossAttentionBlock(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, norm_layer=norm_layer
+            )
+        else:
+            self.cross_attention_block = CrossAttention(dim=embed_dim, num_heads=num_heads, qkv_bias=qkv_bias)
+
+        self.blocks = None
+        if depth > 1:
+            self.blocks = nn.ModuleList(
+                [
+                    Block(
+                        dim=embed_dim,
+                        num_heads=num_heads,
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        qk_scale=False,
+                        norm_layer=norm_layer,
+                    )
+                    for i in range(depth - 1)
+                ]
+            )
+
+        self.init_std = init_std
+        trunc_normal_(self.query_tokens, std=self.init_std)
+        self.apply(self._init_weights)
+        self._rescale_blocks()
+
+    def _rescale_blocks(self):
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+
+        layer_id = 0
+        if self.blocks is not None:
+            for layer_id, layer in enumerate(self.blocks):
+                rescale(layer.attn.proj.weight.data, layer_id + 1)
+                rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+        if self.complete_block:
+            rescale(self.cross_attention_block.mlp.fc2.weight.data, layer_id + 1)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=self.init_std)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            trunc_normal_(m.weight, std=self.init_std)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        if self.blocks is not None:
+            for blk in self.blocks:
+                if self.use_activation_checkpointing:
+                    x = torch.utils.checkpoint.checkpoint(blk, x, False, None, use_reentrant=False)
+                else:
+                    x = blk(x)
+        q = self.query_tokens.repeat(len(x), 1, 1)
+        q = self.cross_attention_block(q, x)
+        return q
+
+
+class AttentiveClassifier(nn.Module):
+    """Attentive Classifier"""
+
+    def __init__(
+        self,
+        embed_dim=768,
+        num_heads=12,
+        mlp_ratio=4.0,
+        depth=1,
+        norm_layer=nn.LayerNorm,
+        init_std=0.02,
+        qkv_bias=True,
+        num_classes=1000,
+        complete_block=True,
+        use_activation_checkpointing=False,
+    ):
+        super().__init__()
+        self.pooler = AttentivePooler(
+            num_queries=1,
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            depth=depth,
+            norm_layer=norm_layer,
+            init_std=init_std,
+            qkv_bias=qkv_bias,
+            complete_block=complete_block,
+            use_activation_checkpointing=use_activation_checkpointing,
+        )
+        self.linear = nn.Linear(embed_dim, num_classes, bias=True)
+
+    def forward(self, x):
+        x = self.pooler(x).squeeze(1)
+        x = self.linear(x)
+        return x
diff --git a/src/models/predictor.py b/src/models/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..30313838d715bf631e6bb9ab46f5aebdca6ab02a
--- /dev/null
+++ b/src/models/predictor.py
@@ -0,0 +1,253 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from src.masks.utils import apply_masks
+from src.models.utils.modules import Block
+from src.models.utils.pos_embs import get_2d_sincos_pos_embed, get_3d_sincos_pos_embed
+from src.utils.tensors import repeat_interleave_batch, trunc_normal_
+
+
+class VisionTransformerPredictor(nn.Module):
+    """Vision Transformer"""
+
+    def __init__(
+        self,
+        img_size=(224, 224),
+        patch_size=16,
+        num_frames=1,
+        tubelet_size=2,
+        embed_dim=768,
+        predictor_embed_dim=384,
+        depth=6,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.0,
+        norm_layer=nn.LayerNorm,
+        init_std=0.02,
+        uniform_power=False,
+        use_mask_tokens=False,
+        num_mask_tokens=2,
+        zero_init_mask_tokens=True,
+        use_silu=False,
+        wide_silu=True,
+        use_activation_checkpointing=False,
+        return_all_tokens=False,
+        chop_last_n_tokens=0,
+        use_rope=False,
+        **kwargs
+    ):
+        super().__init__()
+        self.return_all_tokens = return_all_tokens
+        self.chop_last_n_tokens = chop_last_n_tokens
+
+        # Map input to predictor dimension
+        self.predictor_embed = nn.Linear(embed_dim, predictor_embed_dim, bias=True)
+
+        # Mask tokens
+        self.mask_tokens = None
+        self.num_mask_tokens = 0
+        if use_mask_tokens:
+            self.num_mask_tokens = num_mask_tokens
+            self.mask_tokens = nn.ParameterList(
+                [nn.Parameter(torch.zeros(1, 1, predictor_embed_dim)) for i in range(num_mask_tokens)]
+            )
+
+        # Determine positional embedding
+        if type(img_size) is int:
+            img_size = (img_size, img_size)
+        self.img_height, self.img_width = img_size
+        self.patch_size = patch_size
+        # --
+        self.num_frames = num_frames
+        self.tubelet_size = tubelet_size
+        self.is_video = num_frames > 1
+
+        self.grid_height = img_size[0] // self.patch_size
+        self.grid_width = img_size[1] // self.patch_size
+        self.grid_depth = num_frames // self.tubelet_size
+        self.use_activation_checkpointing = use_activation_checkpointing
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+
+        if self.is_video:
+            self.num_patches = num_patches = (
+                (num_frames // tubelet_size) * (img_size[0] // patch_size) * (img_size[1] // patch_size)
+            )
+        else:
+            self.num_patches = num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)
+        # Position embedding
+        self.uniform_power = uniform_power
+
+        self.predictor_pos_embed = None
+        if not use_rope:
+            self.predictor_pos_embed = nn.Parameter(
+                torch.zeros(1, num_patches, predictor_embed_dim), requires_grad=False
+            )
+
+        # Attention Blocks
+        self.use_rope = use_rope
+        self.predictor_blocks = nn.ModuleList(
+            [
+                Block(
+                    use_rope=use_rope,
+                    grid_size=self.grid_height,
+                    grid_depth=self.grid_depth,
+                    dim=predictor_embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    act_layer=nn.SiLU if use_silu else nn.GELU,
+                    wide_silu=wide_silu,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+
+        # Normalize & project back to input dimension
+        self.predictor_norm = norm_layer(predictor_embed_dim)
+        self.predictor_proj = nn.Linear(predictor_embed_dim, embed_dim, bias=True)
+
+        # ------ initialize weights
+        if self.predictor_pos_embed is not None:
+            self._init_pos_embed(self.predictor_pos_embed.data)  # sincos pos-embed
+        self.init_std = init_std
+        if not zero_init_mask_tokens:
+            for mt in self.mask_tokens:
+                trunc_normal_(mt, std=init_std)
+        self.apply(self._init_weights)
+        self._rescale_blocks()
+
+    def _init_pos_embed(self, pos_embed):
+        embed_dim = pos_embed.size(-1)
+        grid_size = self.img_height // self.patch_size  # TODO: update; currently assumes square input
+        if self.is_video:
+            grid_depth = self.num_frames // self.tubelet_size
+            sincos = get_3d_sincos_pos_embed(
+                embed_dim, grid_size, grid_depth, cls_token=False, uniform_power=self.uniform_power
+            )
+        else:
+            sincos = get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False)
+        pos_embed.copy_(torch.from_numpy(sincos).float().unsqueeze(0))
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=self.init_std)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def _rescale_blocks(self):
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+
+        for layer_id, layer in enumerate(self.predictor_blocks):
+            rescale(layer.attn.proj.weight.data, layer_id + 1)
+            rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+    def forward(self, x, masks_x, masks_y, mask_index=1, has_cls=False):
+        """
+        :param x: context tokens
+        :param masks_x: indices of context tokens in input
+        :params masks_y: indices of target tokens in input
+        """
+        assert (masks_x is not None) and (masks_y is not None), "Cannot run predictor without mask indices"
+        if not isinstance(masks_x, list):
+            masks_x = [masks_x]
+        if not isinstance(masks_y, list):
+            masks_y = [masks_y]
+
+        # Batch Size
+        B = len(x) // len(masks_x)
+
+        # Map context tokens to pedictor dimensions
+        x = self.predictor_embed(x)
+        if has_cls:
+            x_cls = x[:, :1, :]
+            x = x[:, 1:, :]
+        _, N_ctxt, D = x.shape
+
+        # Add positional embedding to ctxt tokens
+        if not self.use_rope:
+            x_pos_embed = self.predictor_pos_embed.repeat(B, 1, 1)
+            x += apply_masks(x_pos_embed, masks_x)
+
+        # Make target tokens
+        mask_index = mask_index % self.num_mask_tokens
+        pred_tokens = self.mask_tokens[mask_index]
+        pred_tokens = pred_tokens.repeat(B, self.num_patches, 1)
+        pred_tokens = apply_masks(pred_tokens, masks_y)
+        # -- add pos embed
+        if not self.use_rope:
+            pos_embs = self.predictor_pos_embed.repeat(B, 1, 1)
+            pos_embs = apply_masks(pos_embs, masks_y)
+            pos_embs = repeat_interleave_batch(pos_embs, B, repeat=len(masks_x))
+            pred_tokens += pos_embs
+
+        # Concatenate context & target tokens
+        x = x.repeat(len(masks_x), 1, 1)
+        x = torch.cat([x, pred_tokens], dim=1)
+
+        # Positions of context & target tokens
+        masks_x = torch.cat(masks_x, dim=0)
+        masks_y = torch.cat(masks_y, dim=0)
+        masks = torch.cat([masks_x, masks_y], dim=1)
+
+        # Put tokens in sorted order
+        argsort = torch.argsort(masks, dim=1)  # [B, N]
+        masks = torch.stack([masks[i, row] for i, row in enumerate(argsort)], dim=0)
+        x = torch.stack([x[i, row, :] for i, row in enumerate(argsort)], dim=0)
+
+        # Remove the last n tokens of sorted sequence before processing
+        if self.chop_last_n_tokens > 0:
+            x = x[:, : -self.chop_last_n_tokens]
+            masks = masks[:, : -self.chop_last_n_tokens]
+
+        if has_cls:
+            x = torch.cat([x_cls, x], dim=1)
+
+        # Fwd prop
+        for i, blk in enumerate(self.predictor_blocks):
+            if self.use_activation_checkpointing:
+                x = torch.utils.checkpoint.checkpoint(blk, x, masks, None, use_reentrant=False)
+            else:
+                x = blk(x, mask=masks, attn_mask=None)
+        x = self.predictor_norm(x)
+
+        if has_cls:
+            x = x[:, 1:, :]
+
+        # Return output corresponding to target tokens
+        if not self.return_all_tokens:
+            reverse_argsort = torch.argsort(argsort, dim=1)  # [B, N]
+            x = torch.stack([x[i, row, :] for i, row in enumerate(reverse_argsort)], dim=0)
+            x = x[:, N_ctxt:]
+
+        x = self.predictor_proj(x)
+
+        return x
+
+
+def vit_predictor(**kwargs):
+    model = VisionTransformerPredictor(
+        mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs
+    )
+    return model
diff --git a/src/models/utils/__pycache__/modules.cpython-312.pyc b/src/models/utils/__pycache__/modules.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f3c3e97ea90db5975c3bb7dbfc50fb8ce3a9951
Binary files /dev/null and b/src/models/utils/__pycache__/modules.cpython-312.pyc differ
diff --git a/src/models/utils/__pycache__/patch_embed.cpython-312.pyc b/src/models/utils/__pycache__/patch_embed.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e481b5739bc4c6d29c7203b14f6ca75c180dd580
Binary files /dev/null and b/src/models/utils/__pycache__/patch_embed.cpython-312.pyc differ
diff --git a/src/models/utils/__pycache__/pos_embs.cpython-312.pyc b/src/models/utils/__pycache__/pos_embs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a70ae7d26a0e6981dac43d91f8764452deb75b9f
Binary files /dev/null and b/src/models/utils/__pycache__/pos_embs.cpython-312.pyc differ
diff --git a/src/models/utils/modules.py b/src/models/utils/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..d21aba03bf180dbdcd8ec5e03f8fe30110267f5b
--- /dev/null
+++ b/src/models/utils/modules.py
@@ -0,0 +1,610 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.models.layers import drop_path
+
+
+def build_action_block_causal_attention_mask(T, H, W, add_tokens=1):
+    """Build a block-causal boolean mask over ``T`` time steps.
+
+    Each time step owns ``add_tokens + H * W`` consecutive tokens. Entry
+    ``[i, j]`` is ``True`` when token ``j`` belongs to the same time step as
+    token ``i`` or to an earlier one inside the temporal window (the window is
+    set to ``T``, i.e. the full history).
+
+    Returns:
+        Bool tensor of shape ``[T * (add_tokens + H * W), T * (add_tokens + H * W)]``.
+    """
+    tokens_per_step = add_tokens + (H * W)
+    total_tokens = T * tokens_per_step
+    window = T
+
+    mask = torch.zeros(total_tokens, total_tokens).bool()
+    for step in range(T):
+        oldest_visible = max(0, step - window + 1)
+        rows = slice(step * tokens_per_step, (step + 1) * tokens_per_step)
+        # All visible steps are contiguous, so one rectangular write covers them.
+        cols = slice(oldest_visible * tokens_per_step, (step + 1) * tokens_per_step)
+        mask[rows, cols] = True
+
+    return mask
+
+
+def rotate_queries_or_keys(x, pos):
+    """Apply a rotary position embedding to a query or key tensor.
+
+    Args:
+        x: 4-D tensor unpacked as ``[B, num_heads, N, D]``; ``D`` must be even.
+        pos: positions to encode. An outer product with the ``D/2`` frequencies
+            appends one trailing axis; the result must broadcast against ``x``.
+
+    Returns:
+        ``x * cos + rot(x) * sin``, where ``rot`` maps every adjacent channel
+        pair ``(a, b)`` to ``(-b, a)``.
+    """
+    _, _, _, D = x.size()
+    assert D % 2 == 0, "Embedding dimension must be a multiple of 2 for block matrix rotation"
+
+    # -- inverse frequencies 1 / 10000^(i / (D/2)) for i in [0, D/2)
+    exponent = torch.arange(D // 2, dtype=x.dtype, device=x.device)
+    exponent /= D / 2.0
+    inv_freq = 1.0 / 10000**exponent  # (D/2,)
+    angles = torch.einsum("..., f -> ... f", pos, inv_freq)  # (..., N, D/2), outer product
+
+    # -- the D/2 sin/cos values are tiled twice along the last axis (not
+    #    interleaved), so channel j uses frequency j mod D/2 while the rotation
+    #    below pairs channels (2k, 2k+1).
+    # NOTE(review): presumably existing checkpoints depend on this exact layout;
+    # confirm before changing it.
+    sin_table = angles.sin().squeeze(-1).repeat(1, 1, 1, 2)
+    cos_table = angles.cos().squeeze(-1).repeat(1, 1, 1, 2)
+
+    # -- rotate each adjacent pair (a, b) -> (-b, a)
+    first, second = x.unflatten(-1, (-1, 2)).unbind(dim=-1)
+    rotated = torch.stack((-second, first), dim=-1).flatten(-2)
+    return (x * cos_table) + (rotated * sin_table)
+
+
+class DropPath(nn.Module):
+    """Stochastic depth: drop paths per sample (for the main path of residual blocks)."""
+
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        # Stored as given and handed to ``drop_path`` on every forward call.
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        """Call ``drop_path`` with the stored probability and the module's training flag."""
+        return drop_path(x, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        """Show the drop probability in the module's printed representation."""
+        return f"p={self.drop_prob}"
+
+
+class MLP(nn.Module):
+    """Feed-forward network: linear -> activation -> dropout -> linear -> dropout."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
+        super().__init__()
+        # Falsy widths (``None`` or ``0``) fall back to the input width.
+        output_width = out_features or in_features
+        hidden_width = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_width)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_width, output_width)
+        # One dropout module is shared by both dropout positions.
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+class SwiGLUFFN(nn.Module):
+    """Gated feed-forward layer computing ``fc3(silu(fc1(x)) * fc2(x))``.
+
+    With ``wide_silu`` the gate width is ``int(2 * hidden_features / 3)``
+    rounded up to a multiple of 8; otherwise it equals ``hidden_features``.
+
+    Note: ``self.act`` is instantiated from ``act_layer`` but ``forward`` calls
+    ``F.silu`` directly, and ``drop`` is accepted but not used.
+    """
+
+    def __init__(
+        self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.0, wide_silu=True
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        if wide_silu:
+            align_as = 8
+            gate_width = int(2 * hidden_features / 3)
+            # Ceil-divide, then scale back up: round up to a multiple of ``align_as``.
+            gate_width = -(-gate_width // align_as) * align_as
+        else:
+            gate_width = hidden_features
+
+        self.fc1 = nn.Linear(in_features, gate_width)
+        self.fc2 = nn.Linear(in_features, gate_width)
+        self.act = act_layer()
+        self.fc3 = nn.Linear(gate_width, out_features)
+
+    def forward(self, x):
+        gate = F.silu(self.fc1(x))
+        value = self.fc2(x)
+        return self.fc3(gate * value)
+
+
+class ACRoPEAttention(nn.Module):
+    """Multi-head self-attention with rotary embeddings over (frame, row, column).
+
+    Each head's channels are split into three equally sized slices (``d_dim``,
+    ``h_dim``, ``w_dim``) that are rotated by the token's frame, row and column
+    position; any remaining channels are left unrotated. Each frame may be
+    prefixed by ``action_tokens`` extra tokens, which are rotated by frame
+    index only.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        use_sdpa=True,
+        is_causal=False,
+        grid_size=16,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim = dim // num_heads
+        # Only the manual (non-SDPA) branch of ``forward`` applies this scale.
+        self.scale = qk_scale or head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        # Kept for backward compatibility; ``forward`` no longer reads it.
+        self.proj_drop_prob = proj_drop
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.use_sdpa = use_sdpa
+        # -- even-sized channel slices rotated by depth / height / width
+        self.d_dim = int(2 * ((head_dim // 3) // 2))
+        self.h_dim = int(2 * ((head_dim // 3) // 2))
+        self.w_dim = int(2 * ((head_dim // 3) // 2))
+        self.grid_size = grid_size
+        self.is_causal = is_causal
+
+    def _get_frame_pos(self, ids, H_patches, W_patches):
+        """Return the frame index of each flat token id."""
+        tokens_per_frame = int(H_patches * W_patches)
+        return ids // tokens_per_frame
+
+    def _get_height_pos(self, ids, H_patches, W_patches):
+        """Return the row index of each flat token id within its frame."""
+        # Remove frame component from ids
+        tokens_per_frame = int(H_patches * W_patches)
+        tokens_per_row = W_patches
+        frame_ids = self._get_frame_pos(ids, H_patches, W_patches)
+        ids = ids - tokens_per_frame * frame_ids
+        # --
+        return ids // tokens_per_row
+
+    def separate_positions(self, ids, H_patches, W_patches):
+        """Split flat token ids into (frame, row, column) positions.
+
+        The three results are multiplied by ``1.0`` so they are fresh tensors,
+        which lets ``forward`` rescale them in place.
+        """
+        tokens_per_frame = int(H_patches * W_patches)
+        tokens_per_row = W_patches
+        frame_ids = self._get_frame_pos(ids, H_patches, W_patches)
+        # --
+        height_ids = self._get_height_pos(ids, H_patches, W_patches)
+        # --
+        # Remove frame component from ids (1st term) and height component (2nd term)
+        width_ids = (ids - tokens_per_frame * frame_ids) - tokens_per_row * height_ids
+        return 1.0 * frame_ids, 1.0 * height_ids, 1.0 * width_ids
+
+    def forward(self, x, mask=None, attn_mask=None, T=None, H=None, W=None, action_tokens=0):
+        """Run rotary self-attention over ``x``.
+
+        Args:
+            x: tensor unpacked as ``[B, N, C]``.
+            mask: optional flat token ids; a head axis is inserted at dim 1
+                (assumes shape ``[B, N_kept]`` -- TODO confirm against callers).
+                When ``None``, ids ``0 .. T*H*W - 1`` are used.
+            attn_mask: optional mask forwarded to scaled-dot-product attention.
+            T, H, W: frame count and patch-grid height/width. ``H`` and ``W``
+                are always required; ``T`` is required when ``mask`` is ``None``
+                or ``action_tokens > 0``.
+            action_tokens: number of extra tokens preceding the ``H * W`` patch
+                tokens of every frame.
+
+        Returns:
+            Tensor of shape ``[B, N, C]``.
+        """
+        B, N, C = x.size()
+
+        # -- compute position of each frame token
+        if mask is not None:
+            mask = mask.unsqueeze(1).repeat(1, self.num_heads, 1)
+            d_mask, h_mask, w_mask = self.separate_positions(mask, H, W)
+        else:
+            mask = torch.arange(int(T * H * W), device=x.device)
+            d_mask, h_mask, w_mask = self.separate_positions(mask, H, W)
+
+        # -- snap spatial positions to grid size
+        h_mask *= self.grid_size / H
+        w_mask *= self.grid_size / W
+
+        # -- split out action tokens from sequence
+        if action_tokens > 0:
+            x = x.view(B, -1, action_tokens + H * W, C)  # [B, T, A+H*W, C]
+
+            action_q, action_k, action_v = [], [], []
+            for i in range(action_tokens):
+                a = x[:, :, i : i + 1, :].flatten(1, 2)
+                # Note action tokens do not work with masking
+                # -- compute qkv for action tokens and rotate
+                qkv = self.qkv(a).unflatten(-1, (3, self.num_heads, -1)).permute(2, 0, 3, 1, 4)
+                q, k, v = qkv[0], qkv[1], qkv[2]  # [B, num_heads, T, D]
+                # -- only the depth slice is rotated (by frame index)
+                qd = rotate_queries_or_keys(q[..., : self.d_dim], pos=torch.arange(T, device=x.device))
+                kd = rotate_queries_or_keys(k[..., : self.d_dim], pos=torch.arange(T, device=x.device))
+                qr = q[..., self.d_dim :]
+                kr = k[..., self.d_dim :]
+                action_q += [torch.cat([qd, qr], dim=-1).view(B, self.num_heads, T, 1, -1)]
+                action_k += [torch.cat([kd, kr], dim=-1).view(B, self.num_heads, T, 1, -1)]
+                action_v += [v.view(B, self.num_heads, T, 1, -1)]
+
+            action_q = torch.cat(action_q, dim=3).flatten(2, 3)
+            action_k = torch.cat(action_k, dim=3).flatten(2, 3)
+            action_v = torch.cat(action_v, dim=3).flatten(2, 3)
+            x = x[:, :, action_tokens:, :].flatten(1, 2)
+
+        # -- compute qkv for frame tokens and rotate
+        qkv = self.qkv(x).unflatten(-1, (3, self.num_heads, -1)).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # [B, num_heads, N, D]
+
+        s = 0
+        # Rotate depth
+        qd = rotate_queries_or_keys(q[..., s : s + self.d_dim], pos=d_mask)
+        kd = rotate_queries_or_keys(k[..., s : s + self.d_dim], pos=d_mask)
+        s += self.d_dim
+        # Rotate height dim
+        qh = rotate_queries_or_keys(q[..., s : s + self.h_dim], pos=h_mask)
+        kh = rotate_queries_or_keys(k[..., s : s + self.h_dim], pos=h_mask)
+        s += self.h_dim
+        # Rotate width dim
+        qw = rotate_queries_or_keys(q[..., s : s + self.w_dim], pos=w_mask)
+        kw = rotate_queries_or_keys(k[..., s : s + self.w_dim], pos=w_mask)
+        s += self.w_dim
+
+        # Combine rotated dimension; leftover channels pass through unrotated
+        if s < self.head_dim:
+            qr = q[..., s:]
+            kr = k[..., s:]
+            q = torch.cat([qd, qh, qw, qr], dim=-1)
+            k = torch.cat([kd, kh, kw, kr], dim=-1)
+        else:
+            q = torch.cat([qd, qh, qw], dim=-1)
+            k = torch.cat([kd, kh, kw], dim=-1)
+
+        if action_tokens > 0:
+
+            def merge_(tx, ta):
+                """tx, ta in [B, num_heads, N, D]"""
+                tx = tx.view(B, self.num_heads, T, H * W, -1)  # [B, num_heads, T, H*W, D]
+                ta = ta.view(B, self.num_heads, T, action_tokens, -1)  # [B, num_heads, T, A, D]
+                return torch.cat([ta, tx], dim=3).flatten(2, 3)
+
+            q = merge_(q, action_q)
+            k = merge_(k, action_k)
+            v = merge_(v, action_v)
+
+        if attn_mask is not None or self.use_sdpa:
+            # The manual branch below has no mask support, so any ``attn_mask``
+            # also routes here. SDPA applies dropout whenever ``dropout_p > 0``
+            # regardless of module mode, so it is gated on ``self.training`` and
+            # uses the attention-dropout probability, matching the manual branch.
+            x = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                dropout_p=self.attn_drop.p if self.training else 0.0,
+                is_causal=self.is_causal,
+                attn_mask=attn_mask,
+            )
+        else:
+            attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, num_heads, N, N]
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class RoPEAttention(nn.Module):
+    """Multi-head self-attention with rotary embeddings over (frame, row, column).
+
+    Each head's channels are split into three equally sized slices (``d_dim``,
+    ``h_dim``, ``w_dim``) that are rotated by the token's frame, row and column
+    position; any remaining channels are left unrotated. When the patch-grid
+    size is not passed to ``forward``, a square ``grid_size x grid_size`` frame
+    is assumed.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        use_sdpa=True,
+        grid_size=14,
+        is_causal=False,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim = dim // num_heads
+        # Only the manual (non-SDPA) branch of ``forward`` applies this scale.
+        self.scale = qk_scale or head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        # Kept for backward compatibility; ``forward`` no longer reads it.
+        self.proj_drop_prob = proj_drop
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.use_sdpa = use_sdpa
+        # -- even-sized channel slices rotated by depth / height / width
+        self.d_dim = int(2 * ((head_dim // 3) // 2))
+        self.h_dim = int(2 * ((head_dim // 3) // 2))
+        self.w_dim = int(2 * ((head_dim // 3) // 2))
+        self.grid_size = grid_size
+        self.is_causal = is_causal
+
+    def _get_frame_pos(self, ids, H_patches=None, W_patches=None):
+        """Return the frame index of each flat token id."""
+        if H_patches is None or W_patches is None:
+            tokens_per_frame = int(self.grid_size * self.grid_size)
+        else:
+            tokens_per_frame = int(H_patches * W_patches)
+        return ids // tokens_per_frame
+
+    def _get_height_pos(self, ids, H_patches=None, W_patches=None):
+        """Return the row index of each flat token id within its frame."""
+        # Remove frame component from ids
+        if H_patches is None or W_patches is None:
+            tokens_per_frame = int(self.grid_size * self.grid_size)
+            tokens_per_row = self.grid_size
+        else:
+            tokens_per_frame = int(H_patches * W_patches)
+            tokens_per_row = W_patches
+        frame_ids = self._get_frame_pos(ids, H_patches, W_patches)
+        ids = ids - tokens_per_frame * frame_ids
+        # --
+        return ids // tokens_per_row
+
+    def separate_positions(self, ids, H_patches=None, W_patches=None):
+        """Split flat token ids into (frame, row, column) positions."""
+        if H_patches is None or W_patches is None:
+            tokens_per_frame = int(self.grid_size * self.grid_size)
+            tokens_per_row = self.grid_size
+        else:
+            tokens_per_frame = int(H_patches * W_patches)
+            tokens_per_row = W_patches
+        frame_ids = self._get_frame_pos(ids, H_patches, W_patches)
+        # --
+        height_ids = self._get_height_pos(ids, H_patches, W_patches)
+        # --
+        # Remove frame component from ids (1st term) and height component (2nd term)
+        width_ids = (ids - tokens_per_frame * frame_ids) - tokens_per_row * height_ids
+        return frame_ids, height_ids, width_ids
+
+    def forward(self, x, mask=None, attn_mask=None, T=None, H_patches=None, W_patches=None):
+        """Run rotary self-attention over ``x``.
+
+        Args:
+            x: tensor unpacked as ``[B, N, C]``.
+            mask: optional flat token ids; a head axis is inserted at dim 1
+                (assumes shape ``[B, N_kept]`` -- TODO confirm against callers).
+                When ``None``, consecutive ids starting at 0 are used.
+            attn_mask: optional mask forwarded to scaled-dot-product attention.
+            T, H_patches, W_patches: frame count and patch-grid height/width.
+                If any is ``None`` (and ``mask`` is ``None``), ids cover
+                ``(N // grid_size**2) * grid_size**2`` tokens instead.
+
+        Returns:
+            Tensor of shape ``[B, N, C]``.
+        """
+        B, N, C = x.size()
+        grid_depth = int(N // (self.grid_size * self.grid_size))
+
+        qkv = self.qkv(x).unflatten(-1, (3, self.num_heads, -1)).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # [B, num_heads, N, D]
+
+        if mask is not None:
+            mask = mask.unsqueeze(1).repeat(1, self.num_heads, 1)
+            d_mask, h_mask, w_mask = self.separate_positions(mask, H_patches, W_patches)
+        else:
+            if T is None or H_patches is None or W_patches is None:
+                mask = torch.arange(int(grid_depth * self.grid_size * self.grid_size), device=x.device)
+            else:
+                mask = torch.arange(int(T * H_patches * W_patches), device=x.device)
+            d_mask, h_mask, w_mask = self.separate_positions(mask, H_patches, W_patches)
+
+        s = 0
+        # Rotate depth
+        qd = rotate_queries_or_keys(q[..., s : s + self.d_dim], pos=d_mask)
+        kd = rotate_queries_or_keys(k[..., s : s + self.d_dim], pos=d_mask)
+        s += self.d_dim
+        # Rotate height dim
+        qh = rotate_queries_or_keys(q[..., s : s + self.h_dim], pos=h_mask)
+        kh = rotate_queries_or_keys(k[..., s : s + self.h_dim], pos=h_mask)
+        s += self.h_dim
+        # Rotate width dim
+        qw = rotate_queries_or_keys(q[..., s : s + self.w_dim], pos=w_mask)
+        kw = rotate_queries_or_keys(k[..., s : s + self.w_dim], pos=w_mask)
+        s += self.w_dim
+
+        # Combine rotated dimension; leftover channels pass through unrotated
+        if s < self.head_dim:
+            qr = q[..., s:]
+            kr = k[..., s:]
+            q = torch.cat([qd, qh, qw, qr], dim=-1)
+            k = torch.cat([kd, kh, kw, kr], dim=-1)
+        else:
+            q = torch.cat([qd, qh, qw], dim=-1)
+            k = torch.cat([kd, kh, kw], dim=-1)
+
+        if attn_mask is not None or self.use_sdpa:
+            # The manual branch below has no mask support, so any ``attn_mask``
+            # also routes here. SDPA applies dropout whenever ``dropout_p > 0``
+            # regardless of module mode, so it is gated on ``self.training`` and
+            # uses the attention-dropout probability, matching the manual branch.
+            x = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                dropout_p=self.attn_drop.p if self.training else 0.0,
+                is_causal=self.is_causal,
+                attn_mask=attn_mask,
+            )
+        else:
+            attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, num_heads, N, N]
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Attention(nn.Module):
+    """Standard multi-head self-attention with an output projection.
+
+    Uses ``F.scaled_dot_product_attention`` when ``use_sdpa`` is set or an
+    ``attn_mask`` is supplied, and an explicit softmax(QK^T)V otherwise.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        use_sdpa=True,
+        is_causal=False,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # Only the manual (non-SDPA) branch of ``forward`` applies this scale.
+        self.scale = qk_scale or head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        # Kept for backward compatibility; ``forward`` no longer reads it.
+        self.proj_drop_prob = proj_drop
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.use_sdpa = use_sdpa
+        self.is_causal = is_causal
+
+    def forward(self, x, mask=None, attn_mask=None):
+        """Attend over ``x``.
+
+        Args:
+            x: tensor of shape ``[B, N, C]``.
+            mask: accepted for signature compatibility; not used here.
+            attn_mask: optional mask forwarded to scaled-dot-product attention.
+
+        Returns:
+            Tensor of shape ``[B, N, C]``.
+        """
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # [B, num_heads, N, D]
+
+        if attn_mask is not None or self.use_sdpa:
+            # The manual branch below has no mask support, so any ``attn_mask``
+            # also routes here. SDPA applies dropout whenever ``dropout_p > 0``
+            # regardless of module mode, so it is gated on ``self.training`` and
+            # uses the attention-dropout probability, matching the manual branch.
+            x = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                dropout_p=self.attn_drop.p if self.training else 0.0,
+                is_causal=self.is_causal,
+                attn_mask=attn_mask,
+            )
+        else:
+            attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, num_heads, N, N]
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class ACBlock(nn.Module):
+    """Pre-norm transformer block whose attention can handle action tokens.
+
+    Attention is ``ACRoPEAttention`` when ``use_rope`` is set and plain
+    ``Attention`` otherwise. The feed-forward part is ``SwiGLUFFN`` when
+    ``act_layer`` is ``nn.SiLU`` and ``MLP`` otherwise. Extra ``**kwargs`` are
+    accepted and ignored.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        wide_silu=True,
+        norm_layer=nn.LayerNorm,
+        use_sdpa=True,
+        is_causal=False,
+        grid_size=16,
+        use_rope=False,
+        **kwargs,
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+
+        # Arguments shared by both attention variants.
+        attn_args = dict(
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            use_sdpa=use_sdpa,
+            is_causal=is_causal,
+            proj_drop=drop,
+        )
+        if use_rope:
+            self.attn = ACRoPEAttention(dim, grid_size=grid_size, **attn_args)
+        else:
+            self.attn = Attention(dim, **attn_args)
+
+        if drop_path > 0.0:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        if act_layer is nn.SiLU:
+            self.mlp = SwiGLUFFN(
+                in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, wide_silu=wide_silu, drop=drop
+            )
+        else:
+            self.mlp = MLP(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+    def forward(self, x, mask=None, attn_mask=None, T=None, H=None, W=None, action_tokens=0):
+        """Apply the attention and feed-forward sub-layers, each with a residual connection.
+
+        The grid arguments (``T``, ``H``, ``W``) and ``action_tokens`` are only
+        passed on when the attention module is ``ACRoPEAttention``.
+        """
+        normed = self.norm1(x)
+        if isinstance(self.attn, ACRoPEAttention):
+            attended = self.attn(normed, mask=mask, attn_mask=attn_mask, T=T, H=H, W=W, action_tokens=action_tokens)
+        else:
+            attended = self.attn(normed, mask=mask, attn_mask=attn_mask)
+        x = x + self.drop_path(attended)
+        return x + self.drop_path(self.mlp(self.norm2(x)))
+
+
+class Block(nn.Module):
+    """Pre-norm transformer block: self-attention followed by a feed-forward layer.
+
+    Attention is ``RoPEAttention`` when ``use_rope`` is set and plain
+    ``Attention`` otherwise. The feed-forward part is ``SwiGLUFFN`` when
+    ``act_layer`` is ``nn.SiLU`` and ``MLP`` otherwise. Extra ``**kwargs`` are
+    accepted and ignored.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        wide_silu=True,
+        norm_layer=nn.LayerNorm,
+        use_sdpa=True,
+        is_causal=False,
+        grid_size=16,
+        use_rope=False,
+        **kwargs,
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+
+        # Arguments shared by both attention variants.
+        attn_args = dict(
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            use_sdpa=use_sdpa,
+            is_causal=is_causal,
+            proj_drop=drop,
+        )
+        if use_rope:
+            self.attn = RoPEAttention(dim, grid_size=grid_size, **attn_args)
+        else:
+            self.attn = Attention(dim, **attn_args)
+
+        if drop_path > 0.0:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        if act_layer is nn.SiLU:
+            self.mlp = SwiGLUFFN(
+                in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, wide_silu=wide_silu, drop=drop
+            )
+        else:
+            self.mlp = MLP(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+    def forward(self, x, mask=None, attn_mask=None, T=None, H_patches=None, W_patches=None):
+        """Apply the attention and feed-forward sub-layers, each with a residual connection.
+
+        ``T``, ``H_patches`` and ``W_patches`` are only passed on when the
+        attention module is ``RoPEAttention``.
+        """
+        rope_kwargs = {}
+        if isinstance(self.attn, RoPEAttention):
+            rope_kwargs = dict(T=T, H_patches=H_patches, W_patches=W_patches)
+        attended = self.attn(self.norm1(x), mask=mask, attn_mask=attn_mask, **rope_kwargs)
+        x = x + self.drop_path(attended)
+        return x + self.drop_path(self.mlp(self.norm2(x)))
+
+
+class CrossAttention(nn.Module):
+    """Multi-head cross-attention: queries come from ``q``, keys/values from ``x``.
+
+    No output projection is applied; the concatenated head outputs are
+    returned directly.
+    """
+
+    def __init__(self, dim, num_heads=12, qkv_bias=False, use_sdpa=True):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # Only the manual (non-SDPA) branch of ``forward`` applies this scale.
+        self.scale = head_dim**-0.5
+        self.q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.kv = nn.Linear(dim, int(dim * 2), bias=qkv_bias)
+        self.use_sdpa = use_sdpa
+
+    def forward(self, q, x):
+        """Attend from ``q`` (``[B, n, C]``) to ``x`` (``[B, N, C]``); returns ``[B, n, C]``."""
+        B, n, C = q.shape
+        q = self.q(q).reshape(B, n, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+
+        B, N, C = x.shape
+        kv = self.kv(x).reshape(B, N, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        k, v = kv[0], kv[1]  # (batch_size, num_heads, seq_len, feature_dim_per_head)
+
+        if self.use_sdpa:
+            # Called directly: the deprecated ``torch.backends.cuda.sdp_kernel()``
+            # wrapper only re-enabled every backend, which is already the default.
+            q = F.scaled_dot_product_attention(q, k, v)
+        else:
+            xattn = (q @ k.transpose(-2, -1)) * self.scale
+            xattn = xattn.softmax(dim=-1)  # (batch_size, num_heads, query_len, seq_len)
+            q = xattn @ v
+
+        q = q.transpose(1, 2).reshape(B, n, C)
+        return q
+
+
+class CrossAttentionBlock(nn.Module):
+    """Cross-attention followed by an MLP, each added residually to the queries.
+
+    ``norm1`` is applied to the context ``x`` (the queries enter the attention
+    un-normalized); ``norm2`` is applied to the updated queries before the MLP.
+    """
+
+    def __init__(self, dim, num_heads, mlp_ratio=4.0, qkv_bias=False, act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.xattn = CrossAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias)
+        self.norm2 = norm_layer(dim)
+        hidden_width = int(dim * mlp_ratio)
+        self.mlp = MLP(in_features=dim, hidden_features=hidden_width, act_layer=act_layer)
+
+    def forward(self, q, x):
+        context = self.norm1(x)
+        q = q + self.xattn(q, context)
+        return q + self.mlp(self.norm2(q))
diff --git a/src/models/utils/patch_embed.py b/src/models/utils/patch_embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6405f0e804a6278b712ae255570700cfa7445b7
--- /dev/null
+++ b/src/models/utils/patch_embed.py
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+from einops import rearrange
+
+
+class PatchEmbed(nn.Module):
+    """
+    Image to patch embedding via a 2D convolution whose kernel and stride both
+    equal ``patch_size``.
+    """
+
+    def __init__(self, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        self.patch_size = patch_size
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, x):
+        """Embed ``x`` (``[B, C, H, W]``) into a token sequence ``[B, num_patches, embed_dim]``."""
+        # Unpacking four names rejects inputs that are not 4-D.
+        _batch, _chans, _height, _width = x.shape
+        feature_map = self.proj(x)
+        tokens = feature_map.flatten(2)
+        return tokens.transpose(1, 2)
+
+
+class PatchEmbed3D(nn.Module):
+    """
+    Video to patch embedding via a 3D convolution whose kernel and stride both
+    equal ``(tubelet_size, patch_size, patch_size)``.
+    """
+
+    def __init__(
+        self,
+        patch_size=16,
+        tubelet_size=2,
+        in_chans=3,
+        embed_dim=768,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.tubelet_size = tubelet_size
+
+        tubelet_shape = (tubelet_size, patch_size, patch_size)
+        self.proj = nn.Conv3d(
+            in_channels=in_chans,
+            out_channels=embed_dim,
+            kernel_size=tubelet_shape,
+            stride=tubelet_shape,
+        )
+
+    def forward(self, x, **kwargs):
+        """Embed ``x`` (``[B, C, T, H, W]``) into ``[B, num_tubelets, embed_dim]``; ``kwargs`` are ignored."""
+        # Unpacking five names rejects inputs that are not 5-D.
+        _batch, _chans, _frames, _height, _width = x.shape
+        feature_volume = self.proj(x)
+        tokens = feature_volume.flatten(2)
+        return tokens.transpose(1, 2)
diff --git a/src/models/utils/pos_embs.py b/src/models/utils/pos_embs.py
new file mode 100644
index 0000000000000000000000000000000000000000..8630c14c844344316dc38b1f14b9adfc51e81f9d
--- /dev/null
+++ b/src/models/utils/pos_embs.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+
+def get_3d_sincos_pos_embed(embed_dim, grid_size, grid_depth, cls_token=False, uniform_power=False):
+    """
+    Build fixed sine/cosine position embeddings for a depth x height x width grid.
+
+    grid_size: int of the grid height and width
+    grid_depth: int of the grid depth
+    uniform_power: if set, depth/height/width each get ``ceil(embed_dim / 6) * 2``
+        channels; otherwise depth gets ``embed_dim // 2`` and height/width get
+        ``embed_dim // 4`` each
+    returns:
+        pos_embed: [grid_depth*grid_size*grid_size, embed_dim] (w/o cls_token)
+                or [1+grid_depth*grid_size*grid_size, embed_dim] (w/ cls_token)
+    """
+    depth_axis = np.arange(grid_depth, dtype=float)
+    side_axis = np.arange(grid_size, dtype=float)
+    # "ij" indexing yields (depth, height, width)-shaped arrays, so flattening
+    # them enumerates positions in [d, h, w] order.
+    grid_d, grid_h, grid_w = np.meshgrid(depth_axis, side_axis, side_axis, indexing="ij")
+
+    if uniform_power:
+        h_embed_dim = w_embed_dim = d_embed_dim = int(np.ceil(embed_dim / 6) * 2)
+    else:
+        d_embed_dim = embed_dim // 2
+        h_embed_dim = w_embed_dim = embed_dim // 4
+
+    emb_h = get_1d_sincos_pos_embed_from_grid(h_embed_dim, grid_h)  # (T*H*W, D1)
+    emb_w = get_1d_sincos_pos_embed_from_grid(w_embed_dim, grid_w)  # (T*H*W, D2)
+    emb_d = get_1d_sincos_pos_embed_from_grid(d_embed_dim, grid_d)  # (T*H*W, D3)
+
+    # Depth channels first, then height, then width; drop any surplus channels.
+    pos_embed = np.concatenate([emb_d, emb_h, emb_w], axis=1)[:, :embed_dim]
+    if not cls_token:
+        return pos_embed
+    # Prepend an all-zero row for the cls token.
+    return np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+
+
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    Build fixed sine/cosine position embeddings for a square 2D grid.
+
+    grid_size: int of the grid height and width
+    returns:
+        pos_embed: [grid_size*grid_size, embed_dim] (w/o cls_token)
+                or [1+grid_size*grid_size, embed_dim] (w/ cls_token)
+    """
+    side_axis = np.arange(grid_size, dtype=float)
+    # "ij" indexing yields (height, width)-shaped arrays, so flattening them
+    # enumerates positions in [h, w] order.
+    grid_h, grid_w = np.meshgrid(side_axis, side_axis, indexing="ij")
+
+    half_dim = embed_dim // 2
+    emb_h = get_1d_sincos_pos_embed_from_grid(half_dim, grid_h)  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(half_dim, grid_w)  # (H*W, D/2)
+    pos_embed = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    if not cls_token:
+        return pos_embed
+    # Prepend an all-zero row for the cls token.
+    return np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+
+
+def get_1d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    Build fixed sine/cosine position embeddings for positions ``0 .. grid_size - 1``.
+
+    embed_dim: output dimension for each position
+    grid_size: int of the grid length
+    returns:
+        pos_embed: [grid_size, embed_dim] (w/o cls_token)
+                or [1+grid_size, embed_dim] (w/ cls_token)
+    """
+    positions = np.arange(grid_size, dtype=float)
+    pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, positions)
+    if not cls_token:
+        return pos_embed
+    # Prepend an all-zero row for the cls token.
+    return np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Encode positions as concatenated sine and cosine features.
+
+    embed_dim: output dimension for each position (must be even)
+    pos: array of positions to be encoded; flattened to size (M,)
+    returns: (M, D), sine features in the first D/2 columns, cosine in the rest
+    """
+    assert embed_dim % 2 == 0
+    half_dim = embed_dim // 2
+
+    # Inverse frequencies 1 / 10000^(i / (D/2)) for i in [0, D/2).
+    exponent = np.arange(half_dim, dtype=float)
+    exponent /= embed_dim / 2.0
+    inv_freq = 1.0 / 10000**exponent  # (D/2,)
+
+    flat_pos = pos.reshape(-1)  # (M,)
+    angles = flat_pos[:, None] * inv_freq[None, :]  # (M, D/2), outer product
+
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
diff --git a/src/models/vision_transformer.py b/src/models/vision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..51d155283ba53a47fc2466b4915405c42e66932a
--- /dev/null
+++ b/src/models/vision_transformer.py
@@ -0,0 +1,487 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from src.masks.utils import apply_masks
+from src.models.utils.modules import Block
+from src.models.utils.patch_embed import PatchEmbed, PatchEmbed3D
+from src.models.utils.pos_embs import get_2d_sincos_pos_embed, get_3d_sincos_pos_embed
+from src.utils.tensors import trunc_normal_
+
+
+class VisionTransformer(nn.Module):
+    """Vision Transformer encoder for images (4D input) or videos (5D input).
+
+    The input is tokenized by a convolutional patch embedding (``PatchEmbed`` for
+    images, ``PatchEmbed3D`` when ``num_frames > 1``), optionally combined with a
+    fixed (non-trainable) sin-cos positional embedding, passed through ``depth``
+    ``Block`` layers and finally normalized.
+
+    Notes on constructor arguments:
+        img_size: int or (height, width) the positional embedding is built for.
+        num_frames / tubelet_size: the model is treated as a video model when
+            ``num_frames > 1``; the temporal grid has ``num_frames // tubelet_size`` steps.
+        out_layers: optional collection of block indices; when given, ``forward``
+            returns the list of normalized outputs of those blocks instead of a tensor.
+        use_rope: when True no additive positional embedding is created
+            (``use_rope`` is also forwarded to every ``Block``).
+        handle_nonsquare_inputs: when False, ``T``/``H_patches``/``W_patches`` are
+            passed to the blocks as ``None``.
+        **kwargs: accepted and otherwise ignored, so a misspelled keyword is
+            silently dropped rather than rejected.
+    """
+
+    def __init__(
+        self,
+        img_size=(224, 224),
+        patch_size=16,
+        num_frames=1,
+        tubelet_size=2,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.0,
+        norm_layer=nn.LayerNorm,
+        init_std=0.02,
+        out_layers=None,
+        uniform_power=False,
+        use_silu=False,
+        wide_silu=True,
+        use_sdpa=True,
+        use_activation_checkpointing=False,
+        use_rope=False,
+        handle_nonsquare_inputs=True,
+        **kwargs
+    ):
+        super().__init__()
+        self.num_features = self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.out_layers = out_layers
+        self.handle_nonsquare_inputs = handle_nonsquare_inputs
+
+        # A single int means a square input
+        if isinstance(img_size, int):
+            img_size = (img_size, img_size)
+        self.img_height, self.img_width = img_size
+        self.patch_size = patch_size
+        self.num_frames = num_frames
+        self.tubelet_size = tubelet_size
+        self.is_video = num_frames > 1
+
+        self.use_activation_checkpointing = use_activation_checkpointing
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+
+        # Tokenize pixels with convolution
+        if self.is_video:
+            self.patch_embed = PatchEmbed3D(
+                patch_size=patch_size, tubelet_size=tubelet_size, in_chans=in_chans, embed_dim=embed_dim
+            )
+            self.num_patches = (num_frames // tubelet_size) * (img_size[0] // patch_size) * (img_size[1] // patch_size)
+        else:
+            self.patch_embed = PatchEmbed(patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+            self.num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)
+
+        # Position embedding (fixed, not trained); absent when RoPE is used
+        self.uniform_power = uniform_power
+        self.use_rope = use_rope
+        if self.use_rope:
+            self.pos_embed = None
+        else:
+            self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, embed_dim), requires_grad=False)
+
+        # Attention Blocks
+        self.blocks = nn.ModuleList(
+            [
+                Block(
+                    use_rope=use_rope,
+                    grid_size=img_size[0] // patch_size,
+                    grid_depth=num_frames // tubelet_size,
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    use_sdpa=use_sdpa,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    act_layer=nn.SiLU if use_silu else nn.GELU,
+                    wide_silu=wide_silu,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+        self.norm = norm_layer(embed_dim)
+
+        # ------ initialize weights
+        if self.pos_embed is not None:
+            self._init_pos_embed(self.pos_embed.data)  # sincos pos-embed
+        self.init_std = init_std
+        self.apply(self._init_weights)
+        self._rescale_blocks()
+
+    def _init_pos_embed(self, pos_embed):
+        """Fill ``pos_embed`` in place with a 2D (image) or 3D (video) sin-cos table."""
+        embed_dim = pos_embed.size(-1)
+        grid_size = self.img_height // self.patch_size  # TODO: update; currently assumes square input
+        if self.is_video:
+            grid_depth = self.num_frames // self.tubelet_size
+            sincos = get_3d_sincos_pos_embed(
+                embed_dim, grid_size, grid_depth, cls_token=False, uniform_power=self.uniform_power
+            )
+        else:
+            sincos = get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False)
+        pos_embed.copy_(torch.from_numpy(sincos).float().unsqueeze(0))
+
+    def _init_weights(self, m):
+        """Per-module initializer used with ``self.apply``.
+
+        Linear/Conv2d/Conv3d: truncated-normal weights (std=``init_std``), zero bias.
+        LayerNorm: zero bias, unit weight.
+        """
+        if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv3d)):
+            trunc_normal_(m.weight, std=self.init_std)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def _rescale_blocks(self):
+        """Divide each block's output projections by sqrt(2 * layer_index), 1-based."""
+
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight.data, layer_id + 1)
+            rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+    def get_num_layers(self):
+        """Return the number of transformer blocks."""
+        return len(self.blocks)
+
+    def no_weight_decay(self):
+        """Return the (empty) collection of parameter names excluded from weight decay."""
+        return {}
+
+    def forward(self, x, masks=None):
+        """
+        :param x: input image (4D tensor) or video (5D tensor); the last two
+            dimensions are read as height and width, and for 5D input the third
+            dimension is read as the number of frames
+        :param masks: indices of patch tokens to mask (remove); a single mask or a list
+        :returns: normalized token tensor, or a list of normalized per-layer
+            outputs when ``out_layers`` is set
+        :raises ValueError: if ``x`` is neither 4D nor 5D
+        """
+        if masks is not None and not isinstance(masks, list):
+            masks = [masks]
+
+        # Tokenize input
+        # Image
+        if x.ndim == 4:
+            _, _, H, W = x.shape
+            T = 1
+        # Video
+        elif x.ndim == 5:
+            _, _, T, H, W = x.shape
+            T = T // self.tubelet_size
+        else:
+            # Without this guard H/W/T stay unbound and the failure is an opaque UnboundLocalError
+            raise ValueError(f"Expected a 4D image or 5D video tensor, got {x.ndim} dimensions")
+        H_patches = H // self.patch_size
+        W_patches = W // self.patch_size
+        if not self.handle_nonsquare_inputs:
+            T = H_patches = W_patches = None
+
+        if not self.use_rope:
+            # Positional table is resized to the actual input before being added to the tokens
+            pos_embed = self.interpolate_pos_encoding(x, self.pos_embed)
+            x = self.patch_embed(x)
+            x += pos_embed
+        else:
+            x = self.patch_embed(x)
+
+        # Mask away unwanted tokens (if masks provided)
+        if masks is not None:
+            x = apply_masks(x, masks)
+            masks = torch.cat(masks, dim=0)
+
+        # Fwd prop
+        outs = []
+        for i, blk in enumerate(self.blocks):
+            if self.use_activation_checkpointing:
+                x = torch.utils.checkpoint.checkpoint(
+                    blk, x, masks, None, T=T, H_patches=H_patches, W_patches=W_patches, use_reentrant=False
+                )
+            else:
+                x = blk(x, mask=masks, attn_mask=None, T=T, H_patches=H_patches, W_patches=W_patches)
+            if self.out_layers is not None and i in self.out_layers:
+                outs.append(self.norm(x))
+
+        if self.out_layers is not None:
+            return outs
+
+        if self.norm is not None:
+            x = self.norm(x)
+
+        return x
+
+    def interpolate_pos_encoding(self, x, pos_embed):
+        """Return ``pos_embed`` adapted to the spatial/temporal size of ``x``.
+
+        The table is returned unchanged when the input matches the configured
+        size, truncated when a video only has fewer frames, and otherwise
+        interpolated (trilinear for video, bicubic for images).
+        """
+
+        _, N, dim = pos_embed.shape
+
+        if self.is_video:
+
+            # If pos_embed already correct size, just return
+            _, _, T, H, W = x.shape
+            if H == self.img_height and W == self.img_width and T == self.num_frames:
+                return pos_embed
+
+            # Just chop off last N tokens of positional embedding
+            elif H == self.img_height and W == self.img_width and T < self.num_frames:
+                new_N = int((T // self.tubelet_size) * (H // self.patch_size) * (W // self.patch_size))
+                return pos_embed[:, :new_N, :]
+
+            # Convert depth, height, width of input to be measured in patches
+            # instead of pixels/frames
+            T = T // self.tubelet_size
+            H = H // self.patch_size
+            W = W // self.patch_size
+
+            # Compute the initialized shape of the positional embedding measured
+            # in patches
+            N_t = self.num_frames // self.tubelet_size
+            N_h = self.img_height // self.patch_size
+            N_w = self.img_width // self.patch_size
+            assert N_h * N_w * N_t == N, "Positional embedding initialized incorrectly"
+
+            # Compute scale factor for spatio-temporal interpolation
+            scale_factor = (T / N_t, H / N_h, W / N_w)
+
+            pos_embed = nn.functional.interpolate(
+                pos_embed.reshape(1, N_t, N_h, N_w, dim).permute(0, 4, 1, 2, 3),
+                scale_factor=scale_factor,
+                mode="trilinear",
+            )
+            pos_embed = pos_embed.permute(0, 2, 3, 4, 1).view(1, -1, dim)
+            return pos_embed
+
+        else:
+
+            # If pos_embed already correct size, just return
+            _, _, H, W = x.shape
+            if H == self.img_height and W == self.img_width:
+                return pos_embed
+
+            # Compute scale factor for spatial interpolation
+            # (the reshape below treats the stored table as a square sqrt(N) x sqrt(N) grid)
+            npatch = (H // self.patch_size) * (W // self.patch_size)
+            scale_factor = math.sqrt(npatch / N)
+
+            pos_embed = nn.functional.interpolate(
+                pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
+                scale_factor=scale_factor,
+                mode="bicubic",
+            )
+            pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+            return pos_embed
+
+
+def vit_large(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1024, depth=24, num_heads=16; extra kwargs are forwarded."""
+    settings = dict(
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_huge(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1280, depth=32, num_heads=16; extra kwargs are forwarded."""
+    settings = dict(
+        embed_dim=1280,
+        depth=32,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_giant_xformers(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1408, depth=40, num_heads=22, mlp_ratio=48/11."""
+    settings = dict(
+        embed_dim=1408,
+        depth=40,
+        num_heads=22,
+        mlp_ratio=48 / 11,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+# We do not use any of the following ViT definitions in V-JEPA 2, but retain them for
+# compatibility reasons.
+def vit_synthetic(patch_size=16, **kwargs):
+    """Build a minimal VisionTransformer (embed_dim=1, depth=1, num_heads=1)."""
+    # For performance testing only
+    settings = dict(
+        embed_dim=1,
+        depth=1,
+        num_heads=1,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_tiny(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=192, depth=12, num_heads=3; extra kwargs are forwarded."""
+    settings = dict(
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_small(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=384, depth=12, num_heads=6; extra kwargs are forwarded."""
+    settings = dict(
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_base(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=768, depth=12, num_heads=12; extra kwargs are forwarded."""
+    settings = dict(
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_large_rope(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1024, depth=24, num_heads=16 and use_rope=True."""
+    settings = dict(
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        use_rope=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_huge_rope(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1280, depth=32, num_heads=16 and use_rope=True."""
+    settings = dict(
+        embed_dim=1280,
+        depth=32,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        use_rope=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_giant(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1408, depth=40, num_heads=16, mlp_ratio=48/11."""
+    settings = dict(
+        embed_dim=1408,
+        depth=40,
+        num_heads=16,
+        mlp_ratio=48 / 11,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_giant_rope(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1408, depth=40, num_heads=16, mlp_ratio=48/11 and use_rope=True."""
+    settings = dict(
+        embed_dim=1408,
+        depth=40,
+        num_heads=16,
+        mlp_ratio=48 / 11,
+        qkv_bias=True,
+        use_rope=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_giant_xformers_rope(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1408, depth=40, num_heads=22, mlp_ratio=48/11 and use_rope=True."""
+    settings = dict(
+        embed_dim=1408,
+        depth=40,
+        num_heads=22,
+        mlp_ratio=48 / 11,
+        qkv_bias=True,
+        use_rope=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+    )
+    return VisionTransformer(patch_size=patch_size, **settings, **kwargs)
+
+
+def vit_gigantic(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1664, depth=48, num_heads=16, mlp_ratio=64/13.
+
+    ``patch_size`` and any extra ``kwargs`` are forwarded to the constructor.
+    """
+    model = VisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1664,
+        depth=48,
+        num_heads=16,
+        # Was misspelled `mpl_ratio`, which the constructor's **kwargs swallowed,
+        # leaving mlp_ratio at its default instead of 64/13.
+        # NOTE(review): weights trained with the old default will not match this MLP width.
+        mlp_ratio=64 / 13,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs
+    )
+    return model
+
+
+def vit_gigantic_xformers(patch_size=16, **kwargs):
+    """Build a VisionTransformer with embed_dim=1664, depth=48, num_heads=26, mlp_ratio=64/13.
+
+    ``patch_size`` and any extra ``kwargs`` are forwarded to the constructor.
+    """
+    model = VisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1664,
+        depth=48,
+        num_heads=26,
+        # Was misspelled `mpl_ratio`, which the constructor's **kwargs swallowed,
+        # leaving mlp_ratio at its default instead of 64/13.
+        # NOTE(review): weights trained with the old default will not match this MLP width.
+        mlp_ratio=64 / 13,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs
+    )
+    return model
+
+
+# Embedding width for each ViT size name; values match the embed_dim used by
+# the same-named factory functions in this module.
+VIT_EMBED_DIMS = dict(
+    vit_synthetic=1,
+    vit_tiny=192,
+    vit_small=384,
+    vit_base=768,
+    vit_large=1024,
+    vit_huge=1280,
+    vit_giant=1408,
+    vit_gigantic=1664,
+)
diff --git a/src/utils/__pycache__/tensors.cpython-312.pyc b/src/utils/__pycache__/tensors.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90c5111fc123845e511a66fb9c82fa052145564b
Binary files /dev/null and b/src/utils/__pycache__/tensors.cpython-312.pyc differ
diff --git a/src/utils/checkpoint_loader.py b/src/utils/checkpoint_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9e7b2f055f818528c490183e1b6d1b11a5a6bd1
--- /dev/null
+++ b/src/utils/checkpoint_loader.py
@@ -0,0 +1,37 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import random
+import time
+from typing import Any
+
+import torch
+from torch.serialization import MAP_LOCATION
+
+from src.utils.logging import get_logger
+
+logger = get_logger(os.path.basename(__file__))
+
+
+def robust_checkpoint_loader(r_path: str, map_location: MAP_LOCATION = "cpu", max_retries: int = 3) -> Any:
+    """
+    Load a checkpoint with ``torch.load``, retrying with exponential backoff.
+
+    Any exception raised by ``torch.load`` (not only a missing file) triggers a
+    retry. Between attempts the function sleeps ``2**attempt`` seconds scaled by
+    a random factor in [1.0, 1.1].
+
+    :param r_path: path of the checkpoint to read
+    :param map_location: forwarded to ``torch.load``
+    :param max_retries: total number of attempts; values below 1 are treated as 1
+    :returns: whatever ``torch.load`` returns
+    :raises Exception: the error of the last attempt, once all attempts failed
+    """
+    # Always try at least once: with max_retries <= 0 the loop would otherwise be
+    # skipped and None returned without ever touching the file.
+    attempts = max(1, max_retries)
+
+    for attempt in range(1, attempts + 1):
+        try:
+            # NOTE(security): torch.load relies on unpickling; only load trusted checkpoints.
+            return torch.load(r_path, map_location=map_location)
+        except Exception as e:
+            logger.warning(f"Encountered exception when loading checkpoint {e}")
+            if attempt >= attempts:
+                # Bare raise re-raises the active exception with its traceback untouched
+                raise
+            sleep_time_s = (2**attempt) * random.uniform(1.0, 1.1)
+            logger.warning(f"Sleeping {sleep_time_s}s and trying again, count {attempt}/{attempts}")
+            time.sleep(sleep_time_s)
diff --git a/src/utils/distributed.py b/src/utils/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7cef30fe5a2f31376a6929c0f2e646dbe423052
--- /dev/null
+++ b/src/utils/distributed.py
@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+
+from src.utils.logging import get_logger
+
+logger = get_logger()
+
+
+def init_distributed(port=37129, rank_and_world_size=(None, None)):
+    """Initialize torch.distributed (NCCL backend) and return ``(world_size, rank)``.
+
+    :param port: value written to the ``MASTER_PORT`` environment variable
+    :param rank_and_world_size: ``(rank, world_size)``; when either is None the
+        values are read from ``SLURM_PROCID`` / ``SLURM_NTASKS``
+    :returns: ``(world_size, rank)``; ``(1, 0)`` when the SLURM variables are
+        missing or process-group initialization fails
+    """
+    # try to set all environment variables to avoid triggering a segfault
+    # environment variables can be reallocated during the execution of torch.distributed.init_process_group
+    # the idea is a race condition may trigger if init_process_group is modifying an environment variable at
+    # the same time as Python, so we try to set all environs before initializing distributed
+    if "SLURM_JOB_ID" in os.environ:
+        # Use the slurm_tmpdir (if it exists) instead of /tmp
+        tmpdir = Path(f"/scratch/slurm_tmpdir/{os.environ['SLURM_JOB_ID']}")
+        if tmpdir.exists():
+            os.environ["TMPDIR"] = str(tmpdir)
+
+    # Already initialized: just report the existing group
+    if dist.is_available() and dist.is_initialized():
+        return dist.get_world_size(), dist.get_rank()
+
+    rank, world_size = rank_and_world_size
+    os.environ["MASTER_ADDR"] = "localhost"
+
+    if (rank is None) or (world_size is None):
+        try:
+            world_size = int(os.environ["SLURM_NTASKS"])
+            rank = int(os.environ["SLURM_PROCID"])
+            os.environ["MASTER_ADDR"] = os.environ["HOSTNAME"]
+        except Exception:
+            logger.info("SLURM vars not set (distributed training not available)")
+            world_size, rank = 1, 0
+            return world_size, rank
+
+    try:
+        os.environ["MASTER_PORT"] = str(port)
+        torch.distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank)
+    except Exception as e:
+        # Log before resetting so the message carries the rank that actually failed
+        logger.info(f"Rank: {rank}. Distributed training not available {e}")
+        world_size, rank = 1, 0
+
+    return world_size, rank
+
+
+class AllGather(torch.autograd.Function):
+    """Autograd function that concatenates ``x`` from every process along dim 0.
+
+    Acts as the identity (forward and backward) when torch.distributed is
+    unavailable, uninitialized, or the world size is 1.
+    """
+
+    @staticmethod
+    def forward(ctx, x):
+        if not (dist.is_available() and dist.is_initialized() and (dist.get_world_size() > 1)):
+            return x
+        local = x.contiguous()
+        gathered = [torch.zeros_like(local) for _ in range(dist.get_world_size())]
+        dist.all_gather(gathered, local)
+        return torch.cat(gathered, 0)
+
+    @staticmethod
+    def backward(ctx, grads):
+        if not (dist.is_available() and dist.is_initialized() and (dist.get_world_size() > 1)):
+            return grads
+        # Rows [start, stop) of the gathered gradient belong to this rank's input
+        start = (grads.shape[0] // dist.get_world_size()) * dist.get_rank()
+        stop = (grads.shape[0] // dist.get_world_size()) * (dist.get_rank() + 1)
+        reduced = grads.contiguous()
+        dist.all_reduce(reduced)
+        return reduced[start:stop]
+
+
+class AllReduceSum(torch.autograd.Function):
+    """Autograd function that all-reduces ``x`` across processes in the forward pass.
+
+    The gradient is passed through unchanged. Forward is the identity when
+    torch.distributed is unavailable, uninitialized, or the world size is 1.
+    """
+
+    @staticmethod
+    def forward(ctx, x):
+        multi_process = dist.is_available() and dist.is_initialized() and (dist.get_world_size() > 1)
+        if not multi_process:
+            return x
+        x = x.contiguous()
+        dist.all_reduce(x)  # default reduction op
+        return x
+
+    @staticmethod
+    def backward(ctx, grads):
+        return grads
+
+
+class AllReduce(torch.autograd.Function):
+    """Autograd function that divides ``x`` by the world size and then all-reduces it.
+
+    With a summing all_reduce this yields the cross-process average. The
+    gradient is passed through unchanged. Forward is the identity when
+    torch.distributed is unavailable, uninitialized, or the world size is 1.
+    """
+
+    @staticmethod
+    def forward(ctx, x):
+        multi_process = dist.is_available() and dist.is_initialized() and (dist.get_world_size() > 1)
+        if not multi_process:
+            return x
+        # Pre-scale so the reduction result is already divided by the world size
+        x = x.contiguous() / dist.get_world_size()
+        dist.all_reduce(x)
+        return x
+
+    @staticmethod
+    def backward(ctx, grads):
+        return grads
diff --git a/src/utils/logging.py b/src/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..949c0e5ffbd5f8bcec1209a7f5a9ee829ff1b311
--- /dev/null
+++ b/src/utils/logging.py
@@ -0,0 +1,108 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import subprocess
+import sys
+
+import torch
+
+
+def gpu_timer(closure, log_timings=True):
+    """Run ``closure()`` and measure its GPU time with CUDA events.
+
+    Returns ``(result, elapsed_time)``. ``elapsed_time`` is the value reported
+    by ``torch.cuda.Event.elapsed_time``, or -1.0 when ``log_timings`` is falsy
+    or CUDA is not available.
+    """
+    timing_enabled = log_timings and torch.cuda.is_available()
+    if not timing_enabled:
+        return closure(), -1.0
+
+    start_evt = torch.cuda.Event(enable_timing=True)
+    end_evt = torch.cuda.Event(enable_timing=True)
+    start_evt.record()
+    result = closure()
+    end_evt.record()
+    # Wait for the recorded events to complete before reading the timer
+    torch.cuda.synchronize()
+    return result, start_evt.elapsed_time(end_evt)
+
+
+# Record layout: level, timestamp, logger name, function name, then the message
+LOG_FORMAT = "[%(levelname)-8s][%(asctime)s][%(name)-20s][%(funcName)-25s] %(message)s"
+# Timestamp layout used for %(asctime)s
+DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
+
+
+def get_logger(name=None, force=False):
+    """Configure root logging to stdout at INFO level and return the logger called ``name``.
+
+    ``force`` is forwarded to ``logging.basicConfig``.
+    """
+    root_config = dict(stream=sys.stdout, level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT, force=force)
+    logging.basicConfig(**root_config)
+    return logging.getLogger(name)
+
+
+class CSVLogger(object):
+    """Minimal delimiter-separated log writer.
+
+    Every positional argument after ``fname`` is a ``(format, header)`` pair:
+    the header is written to the file on construction and the %-style format
+    is used later by ``log`` for the value in the same column.
+    Keyword options: ``mode`` (file mode for the header write, default "+a")
+    and ``delim`` (column separator, default ",").
+    """
+
+    def __init__(self, fname, *argv, **kwargs):
+        self.fname = fname
+        self.types = []
+        open_mode = kwargs.get("mode", "+a")
+        self.delim = kwargs.get("delim", ",")
+        # -- write the header row
+        n_cols = len(argv)
+        with open(self.fname, open_mode) as f:
+            for idx, column in enumerate(argv, 1):
+                self.types.append(column[0])
+                terminator = self.delim if idx < n_cols else "\n"
+                print(column[1], end=terminator, file=f)
+
+    def log(self, *argv):
+        """Append one row, formatting each value with its column's format string."""
+        n_vals = len(argv)
+        with open(self.fname, "+a") as f:
+            for idx, (fmt, value) in enumerate(zip(self.types, argv), 1):
+                terminator = self.delim if idx < n_vals else "\n"
+                print(fmt % value, end=terminator, file=f)
+
+
+class AverageMeter(object):
+    """Tracks the latest value together with running sum, count, mean, min and max."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Put every statistic back to its initial state."""
+        self.val = self.avg = 0
+        self.max = float("-inf")
+        self.min = float("inf")
+        self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record ``val`` with weight ``n`` and refresh the running statistics."""
+        self.val = val
+        try:
+            # Best effort: a failing comparison is ignored and the mean is still updated
+            self.max = max(val, self.max)
+            self.min = min(val, self.min)
+        except Exception:
+            pass
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def jepa_rootpath():
+    """Return this file's absolute path with its last three "/"-separated components removed."""
+    parts = os.path.abspath(__file__).split("/")
+    # Drop the file name and the two directories directly above it
+    del parts[-3:]
+    return "/".join(parts)
+
+
+def git_information():
+    """Describe the git checkout located at ``jepa_rootpath()``.
+
+    Returns a two-line string (a "branch: ..." line followed by a
+    "commit: ..." line), or "unknown" if the git call or its parsing fails.
+    """
+    cmd = ["git", "-C", jepa_rootpath(), "rev-parse", "HEAD", "--abbrev-ref", "HEAD"]
+    try:
+        raw = subprocess.check_output(cmd)
+        # Output is expected to hold exactly two lines: commit hash, then branch name
+        commit, branch = raw.decode("ascii").strip().split("\n")
+    except Exception:
+        return "unknown"
+    return f"branch: {branch}\ncommit: {commit}\n"
diff --git a/src/utils/monitoring.py b/src/utils/monitoring.py
new file mode 100644
index 0000000000000000000000000000000000000000..33c2c903858e77fee6c418e6fed51497512d3096
--- /dev/null
+++ b/src/utils/monitoring.py
@@ -0,0 +1,171 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import dataclasses
+import threading
+import time
+from typing import Dict, Tuple
+
+import psutil
+
+
+@dataclasses.dataclass
+class ResourceStatsSample:
+    """One snapshot of per-process resource counters, as filled in by ResourceMonitoringThread."""
+
+    timestamp: float  # time.time() taken right after sampling
+    cpu_percent: float
+    # I/O counters
+    read_count: int
+    write_count: int
+    read_bytes: int
+    write_bytes: int
+    read_chars: int
+    write_chars: int
+    # CPU times
+    cpu_times_user: float
+    cpu_times_system: float
+    cpu_times_children_user: float
+    cpu_times_children_system: float
+    cpu_times_iowait: float
+    cpu_affinity: str  # compressed range form, e.g. "0-3,8"
+    cpu_num: int
+    num_threads: int
+    num_voluntary_ctx_switches: int
+    num_involuntary_ctx_switches: int
+
+    def as_tuple(self) -> Tuple:
+        """Return values mirroring fields."""
+        # dataclasses.astuple returns a tuple (the annotation previously claimed Dict)
+        return dataclasses.astuple(self)
+
+    def fields(self) -> Tuple[dataclasses.Field, ...]:
+        """Return fields in this dataclass."""
+        return dataclasses.fields(self.__class__)
+
+
+class ResourceMonitoringThread(threading.Thread):
+    """Thread that periodically samples one process via psutil and reports each sample to a callback."""
+
+    def __init__(self, pid=None, refresh_interval=None, stats_callback_fn=None):
+        """Prepare a thread that monitors pid every refresh_interval seconds.
+
+        The thread is not started here; call ``start()``. Each sample is passed
+        to the callback as a ResourceStatsSample object.
+
+        :param pid: process id handed to ``psutil.Process``
+        :param refresh_interval: seconds between samples (5 when None)
+        :param stats_callback_fn: callable taking one ResourceStatsSample;
+            defaults to printing the sample
+        :raises ValueError: if ``stats_callback_fn`` is given but not callable
+        """
+        super(ResourceMonitoringThread, self).__init__()
+        if refresh_interval is None:
+            refresh_interval = 5
+        # Despite its name, this event is SET to request the loop to stop
+        self.is_running_event = threading.Event()
+        self.p = psutil.Process(pid)
+        self.refresh_interval = refresh_interval
+        if stats_callback_fn is None:
+            # Default callback
+            def stats_callback_fn(resource_sample: ResourceStatsSample):
+                # ResourceStatsSample has no `resource_stats` attribute (reading it raised
+                # AttributeError inside the thread); print the sample itself instead.
+                print(f"PID {self.p.pid} Stats: {resource_sample}")
+
+        elif not callable(stats_callback_fn):
+            raise ValueError("Callback needs to be callable, got {}".format(type(stats_callback_fn)))
+        self.stats_callback_fn = stats_callback_fn
+
+    def stop(self) -> None:
+        """Ask the sampling loop to exit."""
+        self.is_running_event.set()
+
+    def run(self) -> None:
+        """Sample until stopped, waiting ``refresh_interval`` seconds between samples."""
+        while not self.is_running_event.is_set():
+            self.sample_counters()
+            # wait() doubles as an interruptible sleep: it returns early once stop() is called
+            self.is_running_event.wait(self.refresh_interval)
+
+    def log_sample(self, resource_sample: ResourceStatsSample) -> None:
+        """Forward one sample to the configured callback."""
+        self.stats_callback_fn(resource_sample)
+
+    @staticmethod
+    def _compress_cpu_affinity(cpu_affinity):
+        """Change list representation to interval/range representation."""
+        if not cpu_affinity:
+            return ""
+        cpu_affinity_compressed = []
+        min_x = None
+        max_x = None
+        last_x = None
+
+        # Find contiguous ranges
+        for x in cpu_affinity:
+            if last_x is None:
+                # Start interval
+                min_x = x
+                max_x = x
+                last_x = x
+                continue
+            elif x == (last_x + 1):
+                # Move interval up
+                max_x = x
+            elif max_x is not None:
+                # Interval ended, start again
+                if min_x == max_x:
+                    cpu_affinity_compressed.append("{}".format(min_x))
+                else:
+                    cpu_affinity_compressed.append("{}-{}".format(min_x, max_x))
+                min_x = x
+                max_x = x
+            last_x = x
+        # Terminate last range
+        if max_x is not None:
+            if min_x == max_x:
+                cpu_affinity_compressed.append("{}".format(min_x))
+            else:
+                cpu_affinity_compressed.append("{}-{}".format(min_x, max_x))
+
+        # Concat
+        return ",".join(cpu_affinity_compressed)
+
+    def sample_counters(self) -> None:
+        """Take one sample of the monitored process and report it.
+
+        Stops the thread instead when the process is no longer running.
+        """
+        if not self.p.is_running():
+            self.stop()
+            return
+
+        # NOTE(review): some of these psutil fields (e.g. iowait, read_chars, cpu_affinity)
+        # look platform-specific - confirm against the platforms this must run on.
+        with self.p.oneshot():
+            cpu_percent = self.p.cpu_percent()
+            cpu_times = self.p.cpu_times()
+            io_counters = self.p.io_counters()
+            cpu_affinity = self.p.cpu_affinity()
+            cpu_num = self.p.cpu_num()
+            num_threads = self.p.num_threads()
+            num_ctx_switches = self.p.num_ctx_switches()
+        timestamp = time.time()
+
+        resource_sample = ResourceStatsSample(
+            timestamp=timestamp,
+            cpu_percent=cpu_percent,
+            read_count=io_counters.read_count,
+            write_count=io_counters.write_count,
+            read_bytes=io_counters.read_bytes,
+            write_bytes=io_counters.write_bytes,
+            read_chars=io_counters.read_chars,
+            write_chars=io_counters.write_chars,
+            cpu_times_user=cpu_times.user,
+            cpu_times_system=cpu_times.system,
+            cpu_times_children_user=cpu_times.children_user,
+            cpu_times_children_system=cpu_times.children_system,
+            cpu_times_iowait=cpu_times.iowait,
+            cpu_affinity=self._compress_cpu_affinity(cpu_affinity),
+            cpu_num=cpu_num,
+            num_threads=num_threads,
+            num_voluntary_ctx_switches=num_ctx_switches.voluntary,
+            num_involuntary_ctx_switches=num_ctx_switches.involuntary,
+        )
+        self.log_sample(resource_sample)
+
+
+if __name__ == "__main__":
+    # Demo: monitor the current process once per second for about five seconds, then stop.
+    import multiprocessing
+
+    pid = multiprocessing.current_process().pid
+    monitor_thread = ResourceMonitoringThread(pid, 1)  # second argument is refresh_interval
+    monitor_thread.start()
+    time.sleep(5)
+    print("Shutdown")
+    # stop() only signals the loop; the thread is not joined here
+    monitor_thread.stop()
diff --git a/src/utils/schedulers.py b/src/utils/schedulers.py
new file mode 100644
index 0000000000000000000000000000000000000000..c551b5e83dbc3735bb41429627fae7187b081e60
--- /dev/null
+++ b/src/utils/schedulers.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+
+class WSDSchedule(object):
+    """Warmup-stable-decay LR schedule: linear warmup, constant plateau, then linear anneal to final_lr."""
+
+    def __init__(self, optimizer, warmup_steps, anneal_steps, T_max, start_lr, ref_lr, final_lr=0.0):
+        self.optimizer = optimizer
+        self.start_lr, self.ref_lr, self.final_lr = start_lr, ref_lr, final_lr
+        self.warmup_steps, self.anneal_steps = warmup_steps, anneal_steps
+        # The T_max argument is the total step budget; only the plateau length is stored.
+        self.T_max = T_max - warmup_steps - anneal_steps
+        self._step = 0.0
+
+    def step(self):
+        """Advance one step, write the LR into every param group and return the unscaled LR."""
+        self._step += 1
+        plateau_end = self.T_max + self.warmup_steps
+        if self._step < self.warmup_steps:
+            progress = float(self._step) / float(max(1, self.warmup_steps))
+            new_lr = self.start_lr + progress * (self.ref_lr - self.start_lr)
+        elif self._step < plateau_end:
+            new_lr = self.ref_lr
+        else:
+            # Cap progress at 1 so the LR holds at final_lr instead of overshooting it after the schedule ends.
+            progress = min(1.0, float(self._step - plateau_end) / float(max(1, self.anneal_steps)))
+            new_lr = self.ref_lr + progress * (self.final_lr - self.ref_lr)
+
+        # A group's optional "lr_scale" multiplies its LR; the returned value is the unscaled one.
+        for group in self.optimizer.param_groups:
+            group["lr"] = new_lr * group["lr_scale"] if "lr_scale" in group else new_lr
+
+        return new_lr
+
+
+class WarmupCosineSchedule(object):
+    """LR schedule with a linear warmup from start_lr to ref_lr followed by a cosine decay to final_lr."""
+
+    def __init__(self, optimizer, warmup_steps, start_lr, ref_lr, T_max, last_epoch=-1, final_lr=0.0):
+        # last_epoch is accepted but never read; it is kept so existing call sites keep working.
+        self.optimizer = optimizer
+        self.start_lr, self.ref_lr, self.final_lr = start_lr, ref_lr, final_lr
+        self.warmup_steps = warmup_steps
+        # Only the length of the cosine phase is stored.
+        self.T_max = T_max - warmup_steps
+        self._step = 0.0
+
+    def step(self):
+        """Advance one step, write the new LR into every param group and return it."""
+        self._step += 1
+        if self._step < self.warmup_steps:
+            progress = float(self._step) / float(max(1, self.warmup_steps))
+            new_lr = self.start_lr + progress * (self.ref_lr - self.start_lr)
+        else:
+            # Cap progress at 1: past T_max the cosine would otherwise swing the LR back up towards ref_lr.
+            progress = min(1.0, float(self._step - self.warmup_steps) / float(max(1, self.T_max)))
+            decayed = self.final_lr + (self.ref_lr - self.final_lr) * 0.5 * (1.0 + math.cos(math.pi * progress))
+            new_lr = max(self.final_lr, decayed)
+
+        for group in self.optimizer.param_groups:
+            group["lr"] = new_lr
+
+        return new_lr
+
+
+class CosineWDSchedule(object):
+    """Cosine schedule that moves the optimizer's weight decay from ref_wd to final_wd over T_max steps."""
+
+    def __init__(self, optimizer, ref_wd, T_max, final_wd=0.0):
+        self.optimizer = optimizer
+        self.ref_wd, self.final_wd = ref_wd, final_wd
+        self.T_max = T_max
+        self._step = 0.0
+
+    def step(self):
+        """Advance one step, set weight_decay on every non-excluded param group and return the value."""
+        self._step += 1
+        # max(1, T_max) avoids dividing by zero; the cap at 1 holds final_wd once T_max steps have passed.
+        progress = min(1.0, self._step / max(1, self.T_max))
+        new_wd = self.final_wd + (self.ref_wd - self.final_wd) * 0.5 * (1.0 + math.cos(math.pi * progress))
+
+        bound = max if self.final_wd <= self.ref_wd else min  # never move past final_wd in either direction
+        new_wd = bound(self.final_wd, new_wd)
+
+        for group in self.optimizer.param_groups:
+            if not ("WD_exclude" in group and group["WD_exclude"]):
+                group["weight_decay"] = new_wd
+        return new_wd
diff --git a/src/utils/tensors.py b/src/utils/tensors.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf7bfafc67c69bfd10f663ce259c4fd7bfe6ab45
--- /dev/null
+++ b/src/utils/tensors.py
@@ -0,0 +1,53 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from logging import getLogger
+
+import torch
+
+logger = getLogger()
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    """Fill ``tensor`` in place with draws from N(mean, std**2) truncated to [a, b] and return it.
+
+    Copied from the PyTorch implementation; the method follows
+    https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    """
+
+    def std_normal_cdf(z):
+        # Cumulative distribution function of the standard normal.
+        return (1.0 + math.erf(z / math.sqrt(2.0))) / 2.0
+
+    with torch.no_grad():
+        # CDF values at the two truncation bounds.
+        cdf_lo = std_normal_cdf((a - mean) / std)
+        cdf_hi = std_normal_cdf((b - mean) / std)
+
+        # Uniform fill on [2*cdf_lo - 1, 2*cdf_hi - 1], i.e. the bounds mapped into erf's range.
+        tensor.uniform_(2 * cdf_lo - 1, 2 * cdf_hi - 1)
+
+        # Inverse error function turns the uniform draws into a truncated standard normal.
+        tensor.erfinv_()
+
+        # Scale to the requested std, then shift to the requested mean.
+        tensor.mul_(std * math.sqrt(2.0)).add_(mean)
+
+        # Final clamp guarantees every value lies inside [a, b].
+        tensor.clamp_(min=a, max=b)
+
+    return tensor
+
+
+def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+    """Fill ``tensor`` (a Tensor) in place from a normal truncated to [a, b]; float mean/std/a/b; returns it."""
+    return _no_grad_trunc_normal_(tensor, mean=mean, std=std, a=a, b=b)
+
+
+def repeat_interleave_batch(x, B, repeat):
+    """Cut ``x`` along dim 0 into len(x) // B chunks of B rows and emit each chunk ``repeat`` times in a row."""
+    chunks = [x[i * B : (i + 1) * B] for i in range(len(x) // B) for _ in range(repeat)]
+    return torch.cat(chunks, dim=0)
diff --git a/src/utils/wrappers.py b/src/utils/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e08ec2dedd604350cd66a4f001ca3add7080563
--- /dev/null
+++ b/src/utils/wrappers.py
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+
+
+class MultiSeqWrapper(nn.Module):
+    """Applies one shared backbone to several inputs of different sequence lengths."""
+
+    def __init__(self, backbone):
+        super().__init__()
+        self.backbone = backbone
+
+    def forward(self, x, masks=None):
+        """
+        :param x: [list] List of Tensors of different seq lengths
+        :param masks: [list[list]] Outer index selects the seq length, inner index the multimasks for that seq length
+        :return: [list] backbone(xi) per input when masks is None, otherwise one list of masked outputs per input
+        """
+        if masks is None:
+            return [self.backbone(xi) for xi in x]
+        outs = [[] for _ in x]
+        for idx, (xi, mi) in enumerate(zip(x, masks)):
+            outs[idx].extend(self.backbone(xi, masks=mij) for mij in mi)
+        return outs
+
+
+class PredictorMultiSeqWrapper(nn.Module):
+    """Applies one shared predictor backbone to nested lists of inputs and their paired masks."""
+
+    def __init__(self, backbone):
+        super().__init__()
+        self.backbone = backbone
+
+    def forward(self, x, masks_x, masks_y, has_cls=False):
+        """Call the backbone on each aligned (x, masks_x, masks_y) triple; outs[i] holds the results for x[i]."""
+        outs = [[] for _ in x]
+        for i, (xi, mxi, myi) in enumerate(zip(x, masks_x, masks_y)):
+            for xij, mxij, myij in zip(xi, mxi, myi):
+                outs[i].append(self.backbone(xij, mxij, myij, mask_index=i, has_cls=has_cls))
+        return outs
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..768afcc450e66c1d2dab3a5bc76ca15957227589
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4aa645548d069f0235573e314b319436bc9c7f4a7aa6e2c07f494de56a57b955
+size 17210205
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f806949078f54c23233d6565f4dfded3a7d47d3c
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,2078 @@
+{
+  "added_tokens_decoder": {
+    "128000": {
+      "content": "<|begin_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128002": {
+      "content": "<|reserved_special_token_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128003": {
+      "content": "<|reserved_special_token_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128004": {
+      "content": "<|finetune_right_pad_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128005": {
+      "content": "<|reserved_special_token_2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128006": {
+      "content": "<|start_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128007": {
+      "content": "<|end_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128008": {
+      "content": "<|eom_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128009": {
+      "content": "<|eot_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128010": {
+      "content": "<|python_tag|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128011": {
+      "content": "<|reserved_special_token_3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128012": {
+      "content": "<|reserved_special_token_4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128013": {
+      "content": "<|reserved_special_token_5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128014": {
+      "content": "<|reserved_special_token_6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128015": {
+      "content": "<|reserved_special_token_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128016": {
+      "content": "<|reserved_special_token_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128017": {
+      "content": "<|reserved_special_token_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128018": {
+      "content": "<|reserved_special_token_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128019": {
+      "content": "<|reserved_special_token_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128020": {
+      "content": "<|reserved_special_token_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128021": {
+      "content": "<|reserved_special_token_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128022": {
+      "content": "<|reserved_special_token_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128023": {
+      "content": "<|reserved_special_token_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128024": {
+      "content": "<|reserved_special_token_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128025": {
+      "content": "<|reserved_special_token_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128026": {
+      "content": "<|reserved_special_token_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128027": {
+      "content": "<|reserved_special_token_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128028": {
+      "content": "<|reserved_special_token_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128029": {
+      "content": "<|reserved_special_token_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128030": {
+      "content": "<|reserved_special_token_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128031": {
+      "content": "<|reserved_special_token_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128032": {
+      "content": "<|reserved_special_token_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128033": {
+      "content": "<|reserved_special_token_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128034": {
+      "content": "<|reserved_special_token_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128035": {
+      "content": "<|reserved_special_token_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128036": {
+      "content": "<|reserved_special_token_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128037": {
+      "content": "<|reserved_special_token_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128038": {
+      "content": "<|reserved_special_token_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128039": {
+      "content": "<|reserved_special_token_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128040": {
+      "content": "<|reserved_special_token_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128041": {
+      "content": "<|reserved_special_token_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128042": {
+      "content": "<|reserved_special_token_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128043": {
+      "content": "<|reserved_special_token_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128044": {
+      "content": "<|reserved_special_token_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128045": {
+      "content": "<|reserved_special_token_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128046": {
+      "content": "<|reserved_special_token_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128047": {
+      "content": "<|reserved_special_token_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128048": {
+      "content": "<|reserved_special_token_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128049": {
+      "content": "<|reserved_special_token_41|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128050": {
+      "content": "<|reserved_special_token_42|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128051": {
+      "content": "<|reserved_special_token_43|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128052": {
+      "content": "<|reserved_special_token_44|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128053": {
+      "content": "<|reserved_special_token_45|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128054": {
+      "content": "<|reserved_special_token_46|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128055": {
+      "content": "<|reserved_special_token_47|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128056": {
+      "content": "<|reserved_special_token_48|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128057": {
+      "content": "<|reserved_special_token_49|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128058": {
+      "content": "<|reserved_special_token_50|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128059": {
+      "content": "<|reserved_special_token_51|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128060": {
+      "content": "<|reserved_special_token_52|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128061": {
+      "content": "<|reserved_special_token_53|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128062": {
+      "content": "<|reserved_special_token_54|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128063": {
+      "content": "<|reserved_special_token_55|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128064": {
+      "content": "<|reserved_special_token_56|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128065": {
+      "content": "<|reserved_special_token_57|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128066": {
+      "content": "<|reserved_special_token_58|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128067": {
+      "content": "<|reserved_special_token_59|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128068": {
+      "content": "<|reserved_special_token_60|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128069": {
+      "content": "<|reserved_special_token_61|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128070": {
+      "content": "<|reserved_special_token_62|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128071": {
+      "content": "<|reserved_special_token_63|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128072": {
+      "content": "<|reserved_special_token_64|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128073": {
+      "content": "<|reserved_special_token_65|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128074": {
+      "content": "<|reserved_special_token_66|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128075": {
+      "content": "<|reserved_special_token_67|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128076": {
+      "content": "<|reserved_special_token_68|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128077": {
+      "content": "<|reserved_special_token_69|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128078": {
+      "content": "<|reserved_special_token_70|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128079": {
+      "content": "<|reserved_special_token_71|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128080": {
+      "content": "<|reserved_special_token_72|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128081": {
+      "content": "<|reserved_special_token_73|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128082": {
+      "content": "<|reserved_special_token_74|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128083": {
+      "content": "<|reserved_special_token_75|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128084": {
+      "content": "<|reserved_special_token_76|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128085": {
+      "content": "<|reserved_special_token_77|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128086": {
+      "content": "<|reserved_special_token_78|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128087": {
+      "content": "<|reserved_special_token_79|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128088": {
+      "content": "<|reserved_special_token_80|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128089": {
+      "content": "<|reserved_special_token_81|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128090": {
+      "content": "<|reserved_special_token_82|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128091": {
+      "content": "<|reserved_special_token_83|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128092": {
+      "content": "<|reserved_special_token_84|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128093": {
+      "content": "<|reserved_special_token_85|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128094": {
+      "content": "<|reserved_special_token_86|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128095": {
+      "content": "<|reserved_special_token_87|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128096": {
+      "content": "<|reserved_special_token_88|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128097": {
+      "content": "<|reserved_special_token_89|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128098": {
+      "content": "<|reserved_special_token_90|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128099": {
+      "content": "<|reserved_special_token_91|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128100": {
+      "content": "<|reserved_special_token_92|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128101": {
+      "content": "<|reserved_special_token_93|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128102": {
+      "content": "<|reserved_special_token_94|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128103": {
+      "content": "<|reserved_special_token_95|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128104": {
+      "content": "<|reserved_special_token_96|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128105": {
+      "content": "<|reserved_special_token_97|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128106": {
+      "content": "<|reserved_special_token_98|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128107": {
+      "content": "<|reserved_special_token_99|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128108": {
+      "content": "<|reserved_special_token_100|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128109": {
+      "content": "<|reserved_special_token_101|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128110": {
+      "content": "<|reserved_special_token_102|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128111": {
+      "content": "<|reserved_special_token_103|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128112": {
+      "content": "<|reserved_special_token_104|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128113": {
+      "content": "<|reserved_special_token_105|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128114": {
+      "content": "<|reserved_special_token_106|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128115": {
+      "content": "<|reserved_special_token_107|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128116": {
+      "content": "<|reserved_special_token_108|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128117": {
+      "content": "<|reserved_special_token_109|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128118": {
+      "content": "<|reserved_special_token_110|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128119": {
+      "content": "<|reserved_special_token_111|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128120": {
+      "content": "<|reserved_special_token_112|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128121": {
+      "content": "<|reserved_special_token_113|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128122": {
+      "content": "<|reserved_special_token_114|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128123": {
+      "content": "<|reserved_special_token_115|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128124": {
+      "content": "<|reserved_special_token_116|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128125": {
+      "content": "<|reserved_special_token_117|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128126": {
+      "content": "<|reserved_special_token_118|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128127": {
+      "content": "<|reserved_special_token_119|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128128": {
+      "content": "<|reserved_special_token_120|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128129": {
+      "content": "<|reserved_special_token_121|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128130": {
+      "content": "<|reserved_special_token_122|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128131": {
+      "content": "<|reserved_special_token_123|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128132": {
+      "content": "<|reserved_special_token_124|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128133": {
+      "content": "<|reserved_special_token_125|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128134": {
+      "content": "<|reserved_special_token_126|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128135": {
+      "content": "<|reserved_special_token_127|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128136": {
+      "content": "<|reserved_special_token_128|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128137": {
+      "content": "<|reserved_special_token_129|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128138": {
+      "content": "<|reserved_special_token_130|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128139": {
+      "content": "<|reserved_special_token_131|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128140": {
+      "content": "<|reserved_special_token_132|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128141": {
+      "content": "<|reserved_special_token_133|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128142": {
+      "content": "<|reserved_special_token_134|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128143": {
+      "content": "<|reserved_special_token_135|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128144": {
+      "content": "<|reserved_special_token_136|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128145": {
+      "content": "<|reserved_special_token_137|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128146": {
+      "content": "<|reserved_special_token_138|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128147": {
+      "content": "<|reserved_special_token_139|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128148": {
+      "content": "<|reserved_special_token_140|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128149": {
+      "content": "<|reserved_special_token_141|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128150": {
+      "content": "<|reserved_special_token_142|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128151": {
+      "content": "<|reserved_special_token_143|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128152": {
+      "content": "<|reserved_special_token_144|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128153": {
+      "content": "<|reserved_special_token_145|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128154": {
+      "content": "<|reserved_special_token_146|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128155": {
+      "content": "<|reserved_special_token_147|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128156": {
+      "content": "<|reserved_special_token_148|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128157": {
+      "content": "<|reserved_special_token_149|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128158": {
+      "content": "<|reserved_special_token_150|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128159": {
+      "content": "<|reserved_special_token_151|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128160": {
+      "content": "<|reserved_special_token_152|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128161": {
+      "content": "<|reserved_special_token_153|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128162": {
+      "content": "<|reserved_special_token_154|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128163": {
+      "content": "<|reserved_special_token_155|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128164": {
+      "content": "<|reserved_special_token_156|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128165": {
+      "content": "<|reserved_special_token_157|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128166": {
+      "content": "<|reserved_special_token_158|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128167": {
+      "content": "<|reserved_special_token_159|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128168": {
+      "content": "<|reserved_special_token_160|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128169": {
+      "content": "<|reserved_special_token_161|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128170": {
+      "content": "<|reserved_special_token_162|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128171": {
+      "content": "<|reserved_special_token_163|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128172": {
+      "content": "<|reserved_special_token_164|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128173": {
+      "content": "<|reserved_special_token_165|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128174": {
+      "content": "<|reserved_special_token_166|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128175": {
+      "content": "<|reserved_special_token_167|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128176": {
+      "content": "<|reserved_special_token_168|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128177": {
+      "content": "<|reserved_special_token_169|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128178": {
+      "content": "<|reserved_special_token_170|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128179": {
+      "content": "<|reserved_special_token_171|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128180": {
+      "content": "<|reserved_special_token_172|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128181": {
+      "content": "<|reserved_special_token_173|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128182": {
+      "content": "<|reserved_special_token_174|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128183": {
+      "content": "<|reserved_special_token_175|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128184": {
+      "content": "<|reserved_special_token_176|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128185": {
+      "content": "<|reserved_special_token_177|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128186": {
+      "content": "<|reserved_special_token_178|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128187": {
+      "content": "<|reserved_special_token_179|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128188": {
+      "content": "<|reserved_special_token_180|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128189": {
+      "content": "<|reserved_special_token_181|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128190": {
+      "content": "<|reserved_special_token_182|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128191": {
+      "content": "<|reserved_special_token_183|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128192": {
+      "content": "<|reserved_special_token_184|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128193": {
+      "content": "<|reserved_special_token_185|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128194": {
+      "content": "<|reserved_special_token_186|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128195": {
+      "content": "<|reserved_special_token_187|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128196": {
+      "content": "<|reserved_special_token_188|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128197": {
+      "content": "<|reserved_special_token_189|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128198": {
+      "content": "<|reserved_special_token_190|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128199": {
+      "content": "<|reserved_special_token_191|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128200": {
+      "content": "<|reserved_special_token_192|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128201": {
+      "content": "<|reserved_special_token_193|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128202": {
+      "content": "<|reserved_special_token_194|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128203": {
+      "content": "<|reserved_special_token_195|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128204": {
+      "content": "<|reserved_special_token_196|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128205": {
+      "content": "<|reserved_special_token_197|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128206": {
+      "content": "<|reserved_special_token_198|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128207": {
+      "content": "<|reserved_special_token_199|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128208": {
+      "content": "<|reserved_special_token_200|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128209": {
+      "content": "<|reserved_special_token_201|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128210": {
+      "content": "<|reserved_special_token_202|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128211": {
+      "content": "<|reserved_special_token_203|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128212": {
+      "content": "<|reserved_special_token_204|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128213": {
+      "content": "<|reserved_special_token_205|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128214": {
+      "content": "<|reserved_special_token_206|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128215": {
+      "content": "<|reserved_special_token_207|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128216": {
+      "content": "<|reserved_special_token_208|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128217": {
+      "content": "<|reserved_special_token_209|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128218": {
+      "content": "<|reserved_special_token_210|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128219": {
+      "content": "<|reserved_special_token_211|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128220": {
+      "content": "<|reserved_special_token_212|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128221": {
+      "content": "<|reserved_special_token_213|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128222": {
+      "content": "<|reserved_special_token_214|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128223": {
+      "content": "<|reserved_special_token_215|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128224": {
+      "content": "<|reserved_special_token_216|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128225": {
+      "content": "<|reserved_special_token_217|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128226": {
+      "content": "<|reserved_special_token_218|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128227": {
+      "content": "<|reserved_special_token_219|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128228": {
+      "content": "<|reserved_special_token_220|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128229": {
+      "content": "<|reserved_special_token_221|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128230": {
+      "content": "<|reserved_special_token_222|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128231": {
+      "content": "<|reserved_special_token_223|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128232": {
+      "content": "<|reserved_special_token_224|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128233": {
+      "content": "<|reserved_special_token_225|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128234": {
+      "content": "<|reserved_special_token_226|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128235": {
+      "content": "<|reserved_special_token_227|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128236": {
+      "content": "<|reserved_special_token_228|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128237": {
+      "content": "<|reserved_special_token_229|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128238": {
+      "content": "<|reserved_special_token_230|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128239": {
+      "content": "<|reserved_special_token_231|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128240": {
+      "content": "<|reserved_special_token_232|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128241": {
+      "content": "<|reserved_special_token_233|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128242": {
+      "content": "<|reserved_special_token_234|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128243": {
+      "content": "<|reserved_special_token_235|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128244": {
+      "content": "<|reserved_special_token_236|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128245": {
+      "content": "<|reserved_special_token_237|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128246": {
+      "content": "<|reserved_special_token_238|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128247": {
+      "content": "<|reserved_special_token_239|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128248": {
+      "content": "<|reserved_special_token_240|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128249": {
+      "content": "<|reserved_special_token_241|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128250": {
+      "content": "<|reserved_special_token_242|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128251": {
+      "content": "<|reserved_special_token_243|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128252": {
+      "content": "<|reserved_special_token_244|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128253": {
+      "content": "<|reserved_special_token_245|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128254": {
+      "content": "<|reserved_special_token_246|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128255": {
+      "content": "<|reserved_special_token_247|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128256": {
+      "content": "<video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<video>"
+  ],
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "extra_special_tokens": {},
+  "max_length": 131072,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|end_of_text|>",
+  "stride": 0,
+  "tokenizer_class": "PreTrainedTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first"
+}