# Base configs. The wildcard imports are kept because later values reference
# names these modules presumably export (e.g. `available_corpus`).
from configs.data import *
from configs.model import *
|
|
|
|
|
# ---- data ----
# Name of the training corpus; used as the lookup key in the template below.
train_corpus = "webvid_10m"
# Interpolation template: indexes `available_corpus` by `train_corpus`.
# NOTE(review): `available_corpus` is not defined in this file -- presumably it
# comes from `configs.data` and the template is resolved by the config loader.
train_file = "${available_corpus[${train_corpus}]}"
# Evaluation sets, keyed by split name. Each entry is a 3-item list; the items
# look like [annotation json, video root dir, media type] -- confirm against
# the dataset loader.
test_file = dict(
    test=[
        "/ibex/project/c2134/LSMDC/annotations/LSMDC16_challenge_1000_publictest.json",
        "/ibex/project/c2134/LSMDC/videos/",
        "video",
    ],
)
# Keys of `test_file` to evaluate on.
test_types = ["test"]
num_workers = 10

# NOTE(review): meaning of `stop_key` is not visible here; None presumably
# disables it -- confirm in the training loop.
stop_key = None
|
|
|
|
|
# ---- input scalars ----
# These are referenced through "${...}" templates in the `inputs` dict.
num_frames = 1  # frames per clip for training
num_frames_test = 1  # frames per clip for evaluation
batch_size = 512  # training batch size
batch_size_test = 64  # evaluation batch size
max_txt_l = 32  # text length limit (unit presumably tokens -- confirm)
|
|
|
# Input pipeline settings. Values written as "${name}" are interpolation
# templates pointing at the top-level scalars of the same name; they stay plain
# strings until the config loader resolves them (loader not visible here).
inputs = dict(
    image_res=224,
    video_input=dict(
        num_frames="${num_frames}",
        sample_type="rand",
        num_frames_test="${num_frames_test}",
        sample_type_test="middle",
        random_aug=False,
    ),
    # Per-modality settings: image and video share the same scalar here.
    max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
    batch_size_test=dict(image="${batch_size_test}", video="${batch_size_test}"),
)
|
|
|
|
|
# ---- model ----
# NOTE(review): `text_enc` is not referenced by the `model` dict below, which
# configures a CLIP text encoder instead -- confirm whether it is still needed.
text_enc = "bert_large"
model = dict(
    model_cls="ViCLIP",
    vision_encoder=dict(
        # backbone
        name="vit_b16",
        pretrained='CLIP-ViT-B/16',
        # NOTE(review): d_model=1024 and checkpoint_num=24 match CLIP ViT-L/14
        # (width 1024, 24 layers) rather than ViT-B/16 (width 768, 12 layers).
        # Confirm whether the builder reads these values or derives them from
        # `name`.
        d_model=1024,
        kernel_size=1,
        center=True,
        drop_path_rate=0.1,
        masking_prob=0.9,
        checkpoint_num=24,
    ),
    text_encoder=dict(
        pretrained='CLIP-ViT-B/16',
        name="vit_b16",
        d_model=512,
        vocab_size=49408,
    ),
    requires_raw_text=True,
    # NOTE(review): CLIP ViT-B/16 projects to 512 dims; 768 is the ViT-L/14
    # value -- confirm this matches the checkpoint being loaded.
    embed_dim=768,
    temp=1 / 100.0,
    temp_min=1 / 100.0,
    freeze_text=True,
)
|
|
|
# Loss configuration. Only the `vtc` term is configured, with weight 1.0.
# NOTE(review): `vtc` presumably means video-text contrastive -- confirm
# against the criterion implementation.
criterion = dict(
    loss_weight=dict(
        vtc=1.0,
    ),
)
|
|
|
# Optimizer settings.
optimizer = dict(
    opt="adamW",
    lr=4e-4,
    opt_betas=[0.9, 0.98],
    weight_decay=0.2,
    # NOTE(review): -1 presumably disables gradient clipping -- confirm in the
    # training loop.
    max_grad_norm=-1,
    # Optional separate learning rate for selected modules; disabled here.
    different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
|
|
|
# Learning-rate schedule: cosine over 12 epochs with half an epoch of warmup.
# NOTE(review): `min_lr_multi` presumably scales the base lr to get the floor
# -- confirm in the scheduler implementation.
scheduler = dict(
    sched="cosine",
    epochs=12,
    min_lr_multi=0.01,
    warmup_epochs=0.5,
)
|
|
|
# ---- evaluation ----
# NOTE(review): `evaluate` presumably switches the run to evaluation-only --
# confirm in the entry script.
evaluate = False
deep_fusion = False
evaluation = dict(
    eval_frame_ensemble="concat",
    eval_x_only=False,
    k_test=128,
    eval_offload=True,
)
|
|
|
# Memory and precision switches, both enabled.
fp16 = True
gradient_checkpointing = True
|
|
|
|
|
# ---- logging and runtime ----
# Weights & Biases logging target.
wandb = dict(
    enable=True,
    entity="likunchang",
    project="vindlu_videoclip",
)
# "env://" is the torch.distributed init method that reads rendezvous info from
# environment variables.
# NOTE(review): assumes this value is passed to torch.distributed -- confirm.
dist_url = "env://"
device = "cuda"
# NOTE(review): "pt" presumably means pre-training -- confirm.
mode = "pt"
|
|
|
|
|
# ---- run bookkeeping ----
# NOTE(review): None here means the output directory must be supplied elsewhere
# (e.g. a command-line override) -- confirm.
output_dir = None
resume = False
debug = False
log_freq = 10
seed = 42

save_latest = True
auto_resume = True
# Empty string: no extra pretrained checkpoint path is configured.
pretrained_path = ""
|
|
|
# DeepSpeed integration: disabled. `stage` is kept so it can be switched on.
# NOTE(review): `stage` presumably selects the ZeRO stage -- confirm.
deepspeed = dict(
    enable=False,
    stage=2,
)
|
|
|
# Wise-FT settings: disabled.
# NOTE(review): `coef` presumably is the weight-interpolation coefficient and
# `keys_to_exclude` the parameters left out of it -- confirm.
wiseft = dict(
    enable=False,
    coef=0.5,
    keys_to_exclude=["vision_encoder.temporal_positional_embedding"],
)
|
|