# SMILE / configs / config.py
# Hosting-page residue (not part of the config itself):
#   fmthoker's picture · Upload 95 files · 401fa20 verified · raw · history blame · 3.4 kB
from configs.data import *
from configs.model import *
# ========================= data ==========================
# Name of the training corpus; it is substituted into `train_file` below.
train_corpus = "webvid_10m"
# Kept as a "${...}" interpolation string rather than a Python lookup so that
# it is evaluated lazily (after the config has been loaded).
train_file = "${available_corpus[${train_corpus}]}"

# Evaluation data, keyed by split name. Each entry looks like
# [annotation json, video directory, media type] -- presumably the same
# layout the corpus definitions use; verify against the dataset loader.
test_file = {
    "test": [
        "/ibex/project/c2134/LSMDC/annotations/LSMDC16_challenge_1000_publictest.json",
        "/ibex/project/c2134/LSMDC/videos/",
        "video",
    ],
}
# Currently lists the single key defined in `test_file`.
test_types = ["test"]

num_workers = 10
stop_key = None
# ========================= input ==========================
# Scalar knobs; `inputs` below refers to them through "${name}" strings so
# that changing one of these values is enough.
num_frames = 1
num_frames_test = 1
batch_size = 512
batch_size_test = 64
max_txt_l = 32

inputs = {
    "image_res": 224,
    "video_input": {
        "num_frames": "${num_frames}",
        "sample_type": "rand",
        "num_frames_test": "${num_frames_test}",
        "sample_type_test": "middle",
        "random_aug": False,
    },
    # The per-media entries below use the same value for image and video.
    "max_txt_l": {"image": "${max_txt_l}", "video": "${max_txt_l}"},
    "batch_size": {"image": "${batch_size}", "video": "${batch_size}"},
    "batch_size_test": {
        "image": "${batch_size_test}",
        "video": "${batch_size_test}",
    },
}
# ========================= model ==========================
# NOTE(review): `text_enc` is not referenced anywhere else in this file;
# confirm whether anything still consumes it.
text_enc = "bert_large"

model = {
    "model_cls": "ViCLIP",
    "vision_encoder": {
        # backbone
        # NOTE(review): d_model=1024 / checkpoint_num=24 resemble ViT-L sizes
        # while the backbone name is vit_b16 -- confirm these keys are
        # actually read for this backbone.
        "name": "vit_b16",
        "pretrained": 'CLIP-ViT-B/16',
        "d_model": 1024,
        "kernel_size": 1,
        "center": True,
        "drop_path_rate": 0.1,
        "masking_prob": 0.9,
        "checkpoint_num": 24,
    },
    "text_encoder": {
        # Only present for the vindlu default tokenizer; never actually used.
        "pretrained": 'CLIP-ViT-B/16',
        "name": "vit_b16",
        "d_model": 512,
        "vocab_size": 49408,
    },
    "requires_raw_text": True,
    "embed_dim": 768,
    # Both temperature values are 1/100.
    "temp": 1 / 100.0,
    "temp_min": 1 / 100.0,
    "freeze_text": True,
}
# Loss configuration: only `vtc` is enabled here. The other terms are left
# commented out as a reminder of the available keys.
criterion = {
    "loss_weight": {
        "vtc": 1.0,
        # "mlm": 1.0,
        # "vtm": 1.0,
        # "mvm": 0.0,
        # "mac": 1.0,
    },  # 0: disabled.
}
optimizer = {
    "opt": "adamW",
    "lr": 4e-4,
    "opt_betas": [0.9, 0.98],  # default
    "weight_decay": 0.2,
    # Expects a positive float; -1 turns gradient-norm clipping off.
    "max_grad_norm": -1,
    # Optional separate learning rate for selected modules (e.g. a larger lr
    # for newly added modules). Disabled here.
    "different_lr": {"enable": False, "module_names": [], "lr": 1e-3},
}
# Learning-rate schedule.
scheduler = {
    "sched": "cosine",
    "epochs": 12,
    "min_lr_multi": 0.01,
    "warmup_epochs": 0.5,
}

evaluate = False
deep_fusion = False

evaluation = {
    # One of: concat, max, mean, lse.
    "eval_frame_ensemble": "concat",
    "eval_x_only": False,
    "k_test": 128,
    # Offload GPU tensors to CPU to save memory.
    "eval_offload": True,
}

fp16 = True
gradient_checkpointing = True
# ========================= wandb ==========================
wandb = {
    "enable": True,
    # Username or team name that stores the runs,
    # see https://docs.wandb.ai/ref/python/init
    "entity": "likunchang",
    # Set up in your command line.
    "project": "vindlu_videoclip",
}

dist_url = "env://"
device = "cuda"
mode = "pt"
# ========================= others ==========================
output_dir = None  # output directory; no value is set here
resume = False  # when True, optimizer and scheduler states are loaded as well
debug = False
log_freq = 10
seed = 42
save_latest = True
auto_resume = True
# Path to pretrained model weights.
# NOTE(review): the original author was unsure whether this is used for
# resuming only -- confirm against the training script.
pretrained_path = ""

deepspeed = {
    "enable": False,
    "stage": 2,
}

wiseft = {
    "enable": False,
    "coef": 0.5,
    "keys_to_exclude": ["vision_encoder.temporal_positional_embedding"],
}