run_id: 0903_libero_spatial_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_16_pretrained_vlm
# Run bookkeeping: checkpoint root, seed, and experiment trackers.
run_root_dir: ./playground/Checkpoints
seed: 42

# Two trackers are enabled; the wandb_* keys below give the entity and project
# names (presumably consumed by the wandb tracker — confirm against the code).
trackers: [jsonl, wandb]
wandb_entity: michaelyu-1101-fudanuniversity
wandb_project: Internvla

is_debug: false
# Model configuration, grouped by sub-module (qwenvl, dino, layer_qformer,
# action_model).
framework:
  framework_py: DinoQFormerACT

  # NOTE(review): run_id in this file contains "wo_flash_attention", yet
  # flash_attention_2 is selected here — confirm which one is intended.
  qwenvl:
    base_vlm: Qwen/Qwen2.5-VL-3B-Instruct
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048

  dino:
    dino_backbone: dinov2_vitl14

  # input_dim equals qwenvl.vl_hidden_dim (2048); ouptput_dim equals
  # action_model.action_hidden_dim (768).
  # NOTE(review): "ouptput_dim" looks like a typo of "output_dim", but it is
  # spelled the same way in both sections — presumably the loader reads this
  # exact key, so do not rename it without checking the consuming code.
  layer_qformer:
    qformer_end_layer: 37
    qformer_start_layer: 36
    num_query_tokens: 64
    input_dim: 2048
    ouptput_dim: 768
    grad_scale: 0.5

  # NOTE(review): run_id mentions "action_chunk_16" while the window sizes
  # below are 7 (future) and 0 (past) — confirm the intended chunk length.
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 768
    action_dim: 7
    input_dim: 2048
    ouptput_dim: 768
    use_ema: false
    future_action_window_size: 7
    past_action_window_size: 0
    repeated_diffusion_steps: 8

  reduce_in_full_precision: true
# Data sources. Two sections: vlm_data and vla_data, each with its own
# per_device_batch_size (4 and 16 respectively).
datasets:
  vlm_data:
    dataformat: llava_json
    # Comma-separated dataset names with no spaces; keep on one line so no
    # whitespace is folded into the value.
    dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
    eval_dataset: aokvqa_cauldron_llava_format
    data_flatten: false
    base_interval: 2
    # 50176 = 224 * 224, the same size as fix_image_size; 784 = 28 * 28.
    max_pixels: 50176
    min_pixels: 784
    fix_image_size: [224, 224]
    model_max_length: 1024
    model_type: qwen2.5vl
    per_device_batch_size: 4

  vla_data:
    dataset_py: lerobot_libero
    data_root_dir: playground/Datasets/LEROBOT_LIBERO_DATA
    data_mix: libero_spatial
    action_type: delta_qpos
    # Folded scalar: the two lines below are joined with a single space and
    # the result has no trailing newline. {instruction} is left verbatim here
    # (presumably a placeholder filled in by the data loader — confirm).
    CoT_prompt: >-
      Your task is {instruction}. To identify the key objects for your task.
      Locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    # presumably (channels, height, width) — TODO confirm against the loader
    default_image_resolution: [3, 224, 224]
    per_device_batch_size: 16
    load_all_data_for_training: true
    # NOTE(review): run_id mentions "two_view" but only one observation key is
    # listed — confirm whether a second view is expected.
    obs: [image_0]
# Training-loop settings: run length, learning-rate schedule, loss weighting,
# optimizer, resume state, and memory/precision switches.
trainer:
  epochs: 100
  max_train_steps: 100000
  # NOTE(review): warmup is specified twice in this section (num_warmup_steps
  # here, warmup_ratio further down). 0.1 of max_train_steps would be 10000,
  # not 5000 — confirm which key the scheduler actually reads.
  num_warmup_steps: 5000
  save_interval: 10000
  eval_interval: 1000

  learning_rate:
    base: 2.5e-05
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 1.0e-06

  # Empty string, not null.
  freeze_modules: ''

  # Keys mirror the two dataset sections (vla_data / vlm_data).
  loss_scale:
    vla: 1.0
    vlm: 0.1

  # NOTE(review): max_grad_norm and gradient_clipping carry the same value and
  # look like two names for one setting — confirm which is consumed.
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  # NOTE(review): differs from optimizer.weight_decay (1.0e-08) below —
  # confirm which one takes effect.
  weight_decay: 0.0
  logging_frequency: 10
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1

  optimizer:
    name: AdamW
    betas: [0.9, 0.95]
    eps: 1.0e-08
    weight_decay: 1.0e-08

  # Fresh run: no resume point is set.
  is_resume: false
  resume_epoch: null
  resume_step: null

  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
# Output directory for this run; as written it is run_root_dir followed by
# run_id. Keep the value on a single line: an indented line after a plain
# scalar is folded into the value.
output_dir: ./playground/Checkpoints/0903_libero_spatial_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_16_pretrained_vlm
