---
# Model / training configuration.
#
# NOTE(review): the code that reads these keys is not part of this file. The
# comments below state only what the values are; anything about how a value
# is used is marked as an assumption to confirm against the consumer.

# File name of a JSON config; presumably resolved relative to a directory
# chosen by the loader - TODO confirm the base path.
bert_config: 'config_bert.json'

# Image, batch-size and vision settings.
# vision_width equals clip_vision_width further down (both 1024), and
# image_res (504) is an exact multiple of clip_vision_patch_size (14).
# NOTE(review): these look like they must be kept in sync - confirm.
image_res: 504
batch_size_train: 128
vision_width: 1024
distill: true
clip_name: 'ViT-L-14'
batch_size_test: 64
k_test: 128

alpha: 0.4
warm_up: true

# Quoted on purpose: an unquoted leading '[' would start a flow sequence.
eos: '[SEP]'

# Optimizer settings.
# Exponent floats are written with an explicit decimal point (3.0e-5 rather
# than 3e-5) because YAML 1.1 loaders such as PyYAML resolve the dot-less form
# as a string instead of a float.
optimizer:
  opt: adamW
  lr1: 3.0e-5
  lr2: 5.0e-6
  weight_decay: 0.02

# Learning-rate schedule settings.
# NOTE(review): the key is spelled 'schedular' (not 'scheduler'). It is kept
# as-is because any consumer looks it up by this exact name; rename only
# together with the reading code.
schedular:
  sched: cosine
  lr: 3.0e-5
  epochs: 8
  min_lr: 1.0e-6
  decay_rate: 1
  warmup_lr: 1.0e-5
  warmup_epochs: 4
  cooldown_epochs: 0

# Text generation / text model settings.
# NOTE(review): min_length, max_length and beam_size presumably control
# decoding - confirm units (tokens vs. words) against the consumer.
min_length: 1
max_length: 10
beam_size: 5
add_ocr: false
add_object: false
text_encoder: 'bert-base-uncased'
text_decoder: 'bert-base-uncased'

# CLIP settings, grouped by the shared 'clip_' key prefix.
clip_embed_dim: 768
clip_image_resolution: 224
clip_vision_layers: 24
clip_vision_width: 1024
clip_vision_patch_size: 14
clip_context_length: 77
clip_vocab_size: 49408
clip_transformer_width: 768
clip_transformer_heads: 12
clip_transformer_layers: 12