from configs.data import *
from configs.model import *

# ========================= data ==========================
train_corpus = "webvid_10m"
train_file = "${available_corpus[${train_corpus}]}"  # for lazy evaluation
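# "${...}" placeholders are kept as plain strings and resolved by the config
# loader at load time against this module's variables, so the line above
# effectively evaluates to available_corpus["webvid_10m"]; switching datasets
# only requires changing train_corpus. (The expansion shown is illustrative;
# the exact resolution mechanics belong to the VinDLU-style config loader.)
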
test_file = dict(
    test=[
        "/ibex/project/c2134/LSMDC/annotations/LSMDC16_challenge_1000_publictest.json",
        "/ibex/project/c2134/LSMDC/videos/",
        "video",
    ],
)
test_types = ["test"]
num_workers = 10

stop_key = None  # metric key used to select the best checkpoint; None disables it

# ========================= input ==========================
num_frames = 1
num_frames_test = 1
batch_size = 512
batch_size_test = 64
max_txt_l = 32

inputs = dict(
    image_res=224,
    video_input=dict(
        num_frames="${num_frames}",
        sample_type="rand",
        num_frames_test="${num_frames_test}",
        sample_type_test="middle",
        random_aug=False,
    ),
    max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
    batch_size_test=dict(image="${batch_size_test}", video="${batch_size_test}"),
)
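# num_frames=1 runs pretraining in the single-frame (image-like) regime:
# one randomly sampled frame per clip at train time ("rand") and the middle
# frame at test time ("middle").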

# ========================= model ==========================
text_enc = "bert_large"  # legacy VinDLU setting; the CLIP text transformer below is what is actually used
model = dict(
    model_cls="ViCLIP",
    vision_encoder=dict(
        # backbone
        name="vit_b16",
        pretrained="CLIP-ViT-B/16",
        d_model=1024,
        kernel_size=1,
        center=True,
        drop_path_rate=0.1,
        masking_prob=0.9,
        checkpoint_num=24,
    ),
    text_encoder=dict(
        pretrained="CLIP-ViT-B/16",  # only kept for the default VinDLU tokenizer; otherwise never used
        name="vit_b16",
        d_model=512,
        vocab_size=49408,
    ),
    requires_raw_text=True,
    embed_dim=768,
    temp=1 / 100.0,
    temp_min=1 / 100.0,
    freeze_text=True,
)
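# Notes on the choices above (descriptive; inferred from ViCLIP defaults):
# - masking_prob=0.9 appears to randomly drop ~90% of video tokens during
#   pretraining (FLIP/UMT-style masking) to cut attention compute.
# - temp is the contrastive temperature; temp == temp_min == 1/100 pins it at
#   CLIP's logit-scale cap of 100, i.e. it is effectively fixed. Illustrative
#   use, assuming L2-normalized features:
#     sim_logits = video_feats @ text_feats.T / temp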

criterion = dict(
    loss_weight=dict(
        vtc=1.0,
        # mlm=1.0,
        # vtm=1.0,
        # mvm=0.0,
        # mac=1.0,
    ),  # a weight of 0 disables the corresponding loss term
)
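# With only vtc enabled, the objective reduces to the video-text contrastive
# loss. A sketch of the usual weighted reduction (illustrative; the trainer
# owns the actual combination):
#   loss = sum(w * losses[name] for name, w in loss_weight.items() if w > 0)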

optimizer = dict(
    opt="adamW",
    lr=4e-4,
    opt_betas=[0.9, 0.98],  # default
    weight_decay=0.2,
    max_grad_norm=-1,  # positive float enables gradient clipping at that norm; -1 disables it
    # use a different lr for some modules, e.g., larger lr for new modules
    different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
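# A sketch of how different_lr would map to AdamW parameter groups when
# enabled (hypothetical code; the trainer's actual grouping may differ):
#   special = lambda n: any(m in n for m in different_lr["module_names"])
#   base = [p for n, p in model.named_parameters() if not special(n)]
#   new = [p for n, p in model.named_parameters() if special(n)]
#   torch.optim.AdamW([{"params": base, "lr": lr},
#                      {"params": new, "lr": different_lr["lr"]}],
#                     betas=tuple(opt_betas), weight_decay=weight_decay)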

scheduler = dict(sched="cosine", epochs=12, min_lr_multi=0.01, warmup_epochs=0.5)
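# Linear warmup over the first 0.5 epochs, then cosine decay from lr=4e-4 to
# min_lr_multi * lr = 4e-6. After warmup, at training progress t in [0, 1]
# (standard cosine schedule, shown for reference):
#   lr(t) = min_lr + 0.5 * (lr - min_lr) * (1 + cos(pi * t))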

evaluate = False
deep_fusion = False
evaluation = dict(
    eval_frame_ensemble="concat",  # [concat, max, mean, lse]
    eval_x_only=False,
    k_test=128,
    eval_offload=True,  # offload GPU tensors to CPU to save memory
)
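# eval_frame_ensemble controls how per-frame results are fused at test time
# (a no-op here since num_frames_test=1): "concat" concatenates frame
# features before scoring, while "max"/"mean"/"lse" pool per-frame
# similarities (lse = log-sum-exp). k_test follows the ALBEF/VinDLU
# convention of re-ranking the top-k retrieval candidates. (Semantics
# inferred from common VinDLU usage.)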

fp16 = True
gradient_checkpointing = True

# ========================= wandb ==========================
wandb = dict(
    enable=True,
    entity="likunchang",  # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init
    project="vindlu_videoclip",  # setup in your command line
)
dist_url = "env://"
device = "cuda"
mode = "pt"

# ========================= others ==========================
output_dir = None  # output directory; expected to be set at launch
resume = False  # if True, load optimizer and scheduler states as well
debug = False
log_freq = 10
seed = 42

save_latest = True
auto_resume = True
pretrained_path = ""  # path to pretrained model weights; appears to be used only when resuming

deepspeed = dict(
    enable=False,
    stage=2,
)
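# ZeRO stage 2 shards optimizer states and gradients across data-parallel
# ranks while keeping parameters replicated; disabled here in favor of plain
# DDP with fp16.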

wiseft = dict(
    enable=False,
    coef=0.5,
    keys_to_exclude=["vision_encoder.temporal_positional_embedding"],
)
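
# WiSE-FT (Wortsman et al., 2022) ensembles in weight space by linearly
# interpolating zero-shot and fine-tuned checkpoints; keys in keys_to_exclude
# (e.g. temporal embeddings absent from the image-only checkpoint) keep their
# fine-tuned values. A sketch with hypothetical state-dict names:
#   merged[k] = (1 - coef) * zeroshot[k] + coef * finetuned[k]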