from configs.data import *
from configs.model import *
# ========================= data ==========================
# Key into `available_corpus` (imported from configs.data).
train_corpus = "webvid_10m"
# OmegaConf-style interpolation string — presumably resolved lazily by the
# config loader, not by Python itself (TODO confirm loader semantics).
train_file = "${available_corpus[${train_corpus}]}" # for lazy evaluation
# Each test split entry is a 3-element list; from its layout this looks like
# [annotation_json, video_root_dir, media_type] — verify against the dataset loader.
test_file = dict(
    test=[
        "/ibex/project/c2134/LSMDC/annotations/LSMDC16_challenge_1000_publictest.json",
        "/ibex/project/c2134/LSMDC/videos/",
        "video",
    ],
)
test_types = ["test"]  # which keys of `test_file` are evaluated
num_workers = 10  # dataloader worker processes
stop_key = None  # metric key for early stopping; None presumably disables it — confirm in trainer
# ========================= input ==========================
num_frames = 1  # frames sampled per clip during training
num_frames_test = 1  # frames sampled per clip at test time
batch_size = 512  # train batch size (per-GPU or global not determinable here — confirm in trainer)
batch_size_test = 64
max_txt_l = 32  # maximum text token length
inputs = dict(
    image_res=224,  # input spatial resolution (pixels)
    video_input=dict(
        num_frames="${num_frames}",  # lazy interpolation of the scalars defined above
        sample_type="rand",  # random frame sampling for training
        num_frames_test="${num_frames_test}",
        sample_type_test="middle",  # deterministic middle-frame sampling for eval
        random_aug=False,
    ),
    # Same text length / batch limits applied to both image and video modalities.
    max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
    batch_size_test=dict(image="${batch_size_test}", video="${batch_size_test}"),
)
# ========================= model ==========================
# NOTE(review): `text_enc` names a BERT variant but the text_encoder below is
# CLIP-style (vocab_size=49408) — confirm whether this key is actually consumed.
text_enc = "bert_large"
model = dict(
    model_cls="ViCLIP",
    vision_encoder=dict(
        # backbone
        name="vit_b16",
        pretrained='CLIP-ViT-B/16',
        # NOTE(review): d_model=1024 and checkpoint_num=24 look like ViT-L-scale
        # settings, while name/pretrained say ViT-B/16 (width 768, 12 blocks).
        # Possible inconsistency — confirm which backbone is intended.
        d_model=1024,
        kernel_size=1,
        center=True,
        drop_path_rate=0.1,  # stochastic-depth rate
        masking_prob=0.9,  # presumably the token-masking ratio for masked-video pretraining — confirm
        checkpoint_num=24,  # presumably number of blocks under gradient checkpointing — confirm
    ),
    text_encoder=dict(
        pretrained='CLIP-ViT-B/16', # This is for vindlu default tokenizer, this is never used
        name="vit_b16",
        d_model=512,
        vocab_size=49408,  # CLIP BPE vocabulary size
    ),
    requires_raw_text=True,  # model consumes raw strings (tokenizes internally) — assumed from the flag name; confirm
    embed_dim=768,  # presumably the joint contrastive embedding dim — confirm
    temp=1 / 100.0,  # initial contrastive temperature
    temp_min=1 / 100.0,  # temperature floor; equal to init, so temperature is effectively fixed
    freeze_text=True,  # presumably freezes the text tower during training — confirm
)
# Loss configuration: per-objective weights; a weight of 0 disables the loss.
criterion = dict(
    loss_weight=dict(
        vtc=1.0,  # the only enabled objective — presumably video-text contrastive; confirm
        # Disabled alternatives kept for reference:
        # mlm=1.0,
        # vtm=1.0,
        # mvm=0.0,
        # mac=1.0,
    ), # 0: disabled.
)
optimizer = dict(
    opt="adamW",
    lr=4e-4,  # peak learning rate
    opt_betas=[0.9, 0.98], # default
    weight_decay=0.2,
    max_grad_norm=-1, # requires a positive float, use -1 to disable
    # use a different lr for some modules, e.g., larger lr for new modules
    different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
# Cosine decay to min_lr_multi * lr over 12 epochs, with a 0.5-epoch warmup.
scheduler = dict(sched="cosine", epochs=12, min_lr_multi=0.01, warmup_epochs=0.5)
evaluate = False  # presumably True runs evaluation only (no training) — confirm in runner
deep_fusion = False
evaluation = dict(
    eval_frame_ensemble="concat", # [concat, max, mean, lse]
    eval_x_only=False,
    k_test=128,  # presumably top-k candidates kept during retrieval eval — confirm
    eval_offload=True, # offload gpu tensors to cpu to save memory.
)
fp16 = True  # mixed-precision training
gradient_checkpointing = True  # trade recompute for activation memory
# ========================= wandb ==========================
wandb = dict(
    enable=True,
    entity="likunchang", # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init
    project="vindlu_videoclip", # setup in your command line
)
dist_url = "env://"  # torch.distributed init method: read rank/world size from environment variables
device = "cuda"
mode = "pt"  # presumably "pt" = pretraining — confirm against the task dispatch code
# ========================= others ==========================
output_dir = None  # output dir (expected to be set by the launcher / command line)
resume = False  # if True, load optimizer and scheduler states as well
debug = False
log_freq = 10  # log every N steps
seed = 42  # global RNG seed
save_latest = True  # keep a rolling "latest" checkpoint
auto_resume = True  # automatically pick up the latest checkpoint if present
# Path to pretrained model weights. NOTE(review): the original comment asked
# "for resume only?" — confirm actual usage in the checkpoint-loading code.
pretrained_path = ""
deepspeed = dict(
    enable=False,
    stage=2,  # ZeRO optimization stage
)
# Presumably WiSE-FT-style weight-space interpolation between fine-tuned and
# pretrained weights — confirm against the model-loading code.
wiseft = dict(
    enable=False,
    coef=0.5,  # interpolation coefficient between the two weight sets
    # Trailing comma added for consistency with the rest of this file.
    keys_to_exclude=["vision_encoder.temporal_positional_embedding"],
)