# Evaluation config tagged `k400-16x8x3` (eval_name: video_classification_frozen).
# Values that start with `/your_...` are placeholders and must be replaced with
# real paths before this file is used.

# --- Job layout / run identity ---
nodes: 8
tasks_per_node: 8
# NOTE(review): the tag looks like it encodes
# frames_per_clip x num_segments x num_views_per_segment (16x8x3) from the
# `data` section below -- confirm and keep the two in sync.
tag: k400-16x8x3
eval_name: video_classification_frozen
resume_checkpoint: false

# --- Dataset and clip sampling ---
data:
  # Placeholder paths: point these at the train / val CSV index files.
  dataset_train: /your_path_to_kinetics400_train_csv_file_index.csv
  dataset_val: /your_path_to_kinetics400_val_csv_file_index.csv
  dataset_type: VideoDataset
  num_classes: 400
  frames_per_clip: 16
  num_segments: 8
  num_views_per_segment: 3
  frame_step: 4

# --- Training hyper-parameters for the evaluation run ---
optimization:
  attend_across_segments: true
  num_epochs: 20
  resolution: 224
  batch_size: 4
  weight_decay: 0.01
  # start_lr equals lr and warmup is 0. in this config.
  lr: 0.001
  start_lr: 0.001
  final_lr: 0.0
  warmup: 0.
  use_bfloat16: true

# --- Pretrained model to load ---
pretrain:
  model_name: vit_large
  checkpoint_key: target_encoder
  clip_duration: null
  # NOTE(review): same value as data.frames_per_clip above; presumably the two
  # are expected to agree -- confirm before changing either.
  frames_per_clip: 16
  tubelet_size: 2
  uniform_power: true
  use_silu: false
  tight_silu: false
  use_sdpa: true
  patch_size: 16
  # Placeholder path: directory that contains the pretrained model file.
  folder: /your_absolute_file_path_to_directory_where_pretrained_models_are_contained/
  checkpoint: jepa-latest.pth.tar  # name of pretrained model file inside folder
  write_tag: jepa