from configs.data import *
from configs.model import *

# ========================= data ==========================
train_corpus = "webvid_10m"
train_file = "${available_corpus[${train_corpus}]}"  # for lazy evaluation
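# "${...}" placeholders are kept as plain strings and resolved by the config
# loader at load time against this module's variables, so the line above
# effectively evaluates to available_corpus["webvid_10m"]; switching datasets
# only requires changing train_corpus. (The expansion shown is illustrative;
# the exact resolution mechanics belong to the VinDLU-style config loader.)
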
test_file = dict(
    test=[
        "/ibex/project/c2134/LSMDC/annotations/LSMDC16_challenge_1000_publictest.json",
        "/ibex/project/c2134/LSMDC/videos/",
        "video",
    ],
)
test_types = ["test"]
num_workers = 10

stop_key = None  # metric key used to select the best checkpoint; None disables it

# ========================= input ==========================
num_frames = 1
num_frames_test = 1
batch_size = 512
batch_size_test = 64
max_txt_l = 32

inputs = dict(
    image_res=224,
    video_input=dict(
        num_frames="${num_frames}",
        sample_type="rand",
        num_frames_test="${num_frames_test}",
        sample_type_test="middle",
        random_aug=False,
    ),
    max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
    batch_size_test=dict(image="${batch_size_test}", video="${batch_size_test}"),
)
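# num_frames=1 runs pretraining in the single-frame (image-like) regime:
# one randomly sampled frame per clip at train time ("rand") and the middle
# frame at test time ("middle").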

# ========================= model ==========================
text_enc = "bert_large"  # legacy VinDLU setting; the CLIP text transformer below is what is actually used
model = dict(
    model_cls="ViCLIP",
    vision_encoder=dict(
        # backbone
        name="vit_b16",
        pretrained="CLIP-ViT-B/16",
        d_model=1024,
        kernel_size=1,
        center=True,
        drop_path_rate=0.1,
        masking_prob=0.9,
        checkpoint_num=24,
    ),
    text_encoder=dict(
        pretrained="CLIP-ViT-B/16",  # only kept for the default VinDLU tokenizer; otherwise never used
        name="vit_b16",
        d_model=512,
        vocab_size=49408,
    ),
    requires_raw_text=True,
    embed_dim=768,
    temp=1 / 100.0,
    temp_min=1 / 100.0,
    freeze_text=True,
)
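# Notes on the choices above (descriptive; inferred from ViCLIP defaults):
# - masking_prob=0.9 appears to randomly drop ~90% of video tokens during
#   pretraining (FLIP/UMT-style masking) to cut attention compute.
# - temp is the contrastive temperature; temp == temp_min == 1/100 pins it at
#   CLIP's logit-scale cap of 100, i.e. it is effectively fixed. Illustrative
#   use, assuming L2-normalized features:
#     sim_logits = video_feats @ text_feats.T / temp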

criterion = dict(
    loss_weight=dict(
        vtc=1.0,
        # mlm=1.0,
        # vtm=1.0,
        # mvm=0.0,
        # mac=1.0,
    ),  # a weight of 0 disables the corresponding loss term
)
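# With only vtc enabled, the objective reduces to the video-text contrastive
# loss. A sketch of the usual weighted reduction (illustrative; the trainer
# owns the actual combination):
#   loss = sum(w * losses[name] for name, w in loss_weight.items() if w > 0)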

optimizer = dict(
    opt="adamW",
    lr=4e-4,
    opt_betas=[0.9, 0.98],  # default
    weight_decay=0.2,
    max_grad_norm=-1,  # positive float enables gradient clipping at that norm; -1 disables it
    # use a different lr for some modules, e.g., larger lr for new modules
    different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
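# A sketch of how different_lr would map to AdamW parameter groups when
# enabled (hypothetical code; the trainer's actual grouping may differ):
#   special = lambda n: any(m in n for m in different_lr["module_names"])
#   base = [p for n, p in model.named_parameters() if not special(n)]
#   new = [p for n, p in model.named_parameters() if special(n)]
#   torch.optim.AdamW([{"params": base, "lr": lr},
#                      {"params": new, "lr": different_lr["lr"]}],
#                     betas=tuple(opt_betas), weight_decay=weight_decay)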

scheduler = dict(sched="cosine", epochs=12, min_lr_multi=0.01, warmup_epochs=0.5)
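# Linear warmup over the first 0.5 epochs, then cosine decay from lr=4e-4 to
# min_lr_multi * lr = 4e-6. After warmup, at training progress t in [0, 1]
# (standard cosine schedule, shown for reference):
#   lr(t) = min_lr + 0.5 * (lr - min_lr) * (1 + cos(pi * t))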

evaluate = False
deep_fusion = False
evaluation = dict(
    eval_frame_ensemble="concat",  # [concat, max, mean, lse]
    eval_x_only=False,
    k_test=128,
    eval_offload=True,  # offload GPU tensors to CPU to save memory
)
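# eval_frame_ensemble controls how per-frame results are fused at test time
# (a no-op here since num_frames_test=1): "concat" concatenates frame
# features before scoring, while "max"/"mean"/"lse" pool per-frame
# similarities (lse = log-sum-exp). k_test follows the ALBEF/VinDLU
# convention of re-ranking the top-k retrieval candidates. (Semantics
# inferred from common VinDLU usage.)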

fp16 = True
gradient_checkpointing = True

# ========================= wandb ==========================
wandb = dict(
    enable=True,
    entity="likunchang",  # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init
    project="vindlu_videoclip",  # setup in your command line
)
dist_url = "env://"
device = "cuda"
mode = "pt"

# ========================= others ==========================
output_dir = None  # output directory; expected to be set at launch
resume = False  # if True, load optimizer and scheduler states as well
debug = False
log_freq = 10
seed = 42

save_latest = True
auto_resume = True
pretrained_path = ""  # path to pretrained model weights; appears to be used only when resuming

deepspeed = dict(
    enable=False,
    stage=2,
)
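# ZeRO stage 2 shards optimizer states and gradients across data-parallel
# ranks while keeping parameters replicated; disabled here in favor of plain
# DDP with fp16.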

wiseft = dict(
    enable=False,
    coef=0.5,
    keys_to_exclude=["vision_encoder.temporal_positional_embedding"],
)
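
# WiSE-FT (Wortsman et al., 2022) ensembles in weight space by linearly
# interpolating zero-shot and fine-tuned checkpoints; keys in keys_to_exclude
# (e.g. temporal embeddings absent from the image-only checkpoint) keep their
# fine-tuned values. A sketch with hypothetical state-dict names:
#   merged[k] = (1 - coef) * zeroshot[k] + coef * finetuned[k]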