---
# Run configuration for the `vjepa` app.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of this file; confirm section boundaries against the config consumer.

app: vjepa
nodes: 16
tasks_per_node: 8

# Input data settings.
data:
  dataset_type: VideoDataset
  # Placeholder index-file paths; replace with real locations before use.
  datasets:
    - /your_path_to_kinetics710_csv_file_index.csv
    - /your_path_to_ssv2_csv_file_index.csv
    - /your_path_to_howto100m_csv_file_index.csv
  decode_one_clip: true
  batch_size: 24
  num_clips: 1
  num_frames: 16
  tubelet_size: 2
  sampling_rate: 4
  crop_size: 224
  patch_size: 16
  pin_mem: true
  num_workers: 12
  filter_short_videos: false
  clip_duration: null

# Augmentation settings.
data_aug:
  auto_augment: false
  motion_shift: false
  # Two-element lists; presumably [min, max] ranges (TODO confirm).
  random_resize_aspect_ratio:
    - 0.75
    - 1.35
  random_resize_scale:
    - 0.3
    - 1.0
  reprob: 0.0

logging:
  # Placeholder output directory; replace with a real absolute path.
  folder: /your_absolute_file_path_for_saving_logs_and_checkpoints/
  write_tag: jepa

loss:
  loss_exp: 1.0
  reg_coeff: 0.0

# Two mask specifications; they differ only in num_blocks and spatial_scale.
mask:
  - aspect_ratio:
      - 0.75
      - 1.5
    num_blocks: 8
    spatial_scale:
      - 0.15
      - 0.15
    temporal_scale:
      - 1.0
      - 1.0
    max_temporal_keep: 1.0
    max_keep: null
  - aspect_ratio:
      - 0.75
      - 1.5
    num_blocks: 2
    spatial_scale:
      - 0.7
      - 0.7
    temporal_scale:
      - 1.0
      - 1.0
    max_temporal_keep: 1.0
    max_keep: null

meta:
  load_checkpoint: false
  read_checkpoint: null
  seed: 234
  eval_freq: 100
  use_sdpa: true
  dtype: bfloat16

model:
  model_name: vit_large
  pred_depth: 12
  pred_embed_dim: 384
  uniform_power: true
  use_mask_tokens: true
  zero_init_mask_tokens: true

optimization:
  ipe: 300
  ipe_scale: 1.25
  clip_grad: 10.0
  weight_decay: 0.04
  final_weight_decay: 0.4
  epochs: 300
  warmup: 40
  start_lr: 0.0002
  lr: 0.000625
  # Written with a decimal point and signed exponent so YAML 1.1 loaders
  # (e.g. PyYAML) resolve it as a float rather than a string.
  final_lr: 1.0e-06
  ema:
    - 0.998
    - 1.0