hywslxh
/

RoboVLMs-New

Model card Files and versions

xet

Community

hywslxh committed on May 10

Commit

bbd7bc4

verified ·

1 Parent(s): 416d1cf

Upload pi-zero.json

Browse files

Files changed (1) hide show

pi-zero.json +269 -0

pi-zero.json ADDED Viewed

	@@ -0,0 +1,269 @@

+{
+    "robovlm_name": "RoboPiZero",
+    "parent": null,
+    "task_name": "pizero_finetune",
+    "model": "pizero",
+    "model_url": "https://huggingface.co/google/paligemma-3b-pt-224",
+    "seq_len": 1,
+    "image_size": 224,
+    "image_mean": [
+        0.48145466,
+        0.4578275,
+        0.40821073
+    ],
+    "image_std": [
+        0.26862954,
+        0.26130258,
+        0.27577711
+    ],
+    "window_size": 1,
+    "fwd_pred_next_n": 4,
+    "arm_gripper_loss_ratio": 0.01,
+    "text_loss_ratio": 0.1,
+    "cap_loss_ratio": 0.05,
+    "fwd_loss_ratio": 0,
+    "vl_cotrain_ratio": 1,
+    "seed": 123,
+    "batch_size": 4,
+    "num_workers": 16,
+    "data_scale": 1,
+    "optimizer": "adam",
+    "learning_rate": 2e-05,
+    "min_lr_scale": 0.01,
+    "weight_decay": 0,
+    "warmup_epochs": 0.25,
+    "warmup_steps": 0,
+    "warmup_ratio": null,
+    "use_hand_rgb": true,
+    "use_time_causal_attn": false,
+    "use_mim_obs_loss": false,
+    "use_pixel_loss": true,
+    "use_obs_queries": true,
+    "use_vision_resampler": false,
+    "vision_masked_ratio": 0.9,
+    "use_tube_mask": false,
+    "cache_root": "runs/cache/pizero",
+    "model_load_path": null,
+    "model_load_source": "torch",
+    "resume": null,
+    "use_cot_stage_token": false,
+    "use_cot": false,
+    "force_model_cot": false,
+    "use_chat_stage_token": false,
+    "cot_tags": [
+        "task",
+        "plan",
+        "bboxes",
+        "subtask_reason",
+        "subtask",
+        "move_reason",
+        "move",
+        "gripper"
+    ],
+    "model_path": "/share/project/lxh/project/VLMs/paligemma-3b-pt-224",
+    "model_config": "/share/project/lxh/project/VLMs/paligemma-3b-pt-224",
+    "deepspeed_config": {
+        "fp16": {
+            "enabled": false,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "initial_scale_power": 16,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        },
+        "bf16": {
+            "enabled": true
+        },
+        "zero_allow_untested_optimizer": true,
+        "train_micro_batch_size_per_gpu": 4,
+        "zero_optimization": {
+            "stage": 2,
+            "overlap_comm": true,
+            "contiguous_gradients": true,
+            "reduce_bucket_size": 500000000.0,
+            "allgather_partitions": true,
+            "allgather_bucket_size": 500000000.0,
+            "reduce_scatter": true
+        },
+        "communication": {
+            "nccl_timeout": 3600,
+            "all_reduce_timeout": 7200
+        },
+        "gradient_accumulation_steps": 4,
+        "gradient_clipping": 0.0
+    },
+    "train_setup": {
+        "precision": "bf16",
+        "predict_action": true,
+        "predict_forward": false,
+        "predict_forward_hand": false,
+        "predict_caption": false,
+        "train_vision": true,
+        "bits": -1,
+        "freeze_mm_mlp_adapter": false,
+        "freeze_backbone": false,
+        "freeze_resampler": false,
+        "tune_mm_mlp_adapter": false,
+        "mm_use_im_start_end": false,
+        "mm_use_im_patch_token": false,
+        "gradient_checkpointing": false,
+        "lora_enable": false,
+        "mm_projector_lr": 0.0001,
+        "lora_r": 64,
+        "lora_alpha": 16,
+        "lora_dropout": 0.05,
+        "lora_bias": "none",
+        "train_text_embedding": true
+    },
+    "vision_resampler": {
+        "vis_dim": 1024,
+        "depth": 8,
+        "dim_head": 64,
+        "heads": 8,
+        "num_latents": 64
+    },
+    "act_encoder": null,
+    "act_head": {
+        "type": "FCDecoder",
+        "hidden_size": 1024,
+        "action_dim": 7,
+        "down_sample": "none",
+        "latent": 1,
+        "fwd_pred_next_n": 1,
+        "window_size": 1,
+        "action_space": "continuous",
+        "with_history": true,
+        "history_type": "post",
+        "n_bin": 256,
+        "min_action": -1,
+        "max_action": 1
+    },
+    "fwd_head": null,
+    "tokenizer": {
+        "type": "AutoProcessor",
+        "pretrained_model_name_or_path": "/share/project/lxh/project/VLMs/paligemma-3b-pt-224",
+        "tokenizer_type": "paligemma",
+        "max_text_len": 256,
+        "additional_special_tokens": null
+    },
+    "vlm": {
+        "type": "Pizero",
+        "name": "pizero",
+        "dtype": "bfloat16",
+        "pretrained_model_name_or_path": "/share/project/lxh/project/VLMs/paligemma-3b-pt-224",
+        "mixture": {
+            "vlm": {
+                "hidden_size": 2048,
+                "intermediate_size": 16384,
+                "use_final_norm": false,
+                "cache": true,
+                "rope_theta": 10000.0,
+                "use_quantize": false,
+                "use_lora": false,
+                "adaptive_mode": null
+            },
+            "proprio": {
+                "hidden_size": 1024,
+                "intermediate_size": 4096,
+                "use_final_norm": true,
+                "cache": true,
+                "rope_theta": 100.0,
+                "use_quantize": false,
+                "use_lora": false,
+                "adaptive_mode": null
+            },
+            "action": {
+                "hidden_size": 1024,
+                "intermediate_size": 4096,
+                "use_final_norm": true,
+                "cache": false,
+                "rope_theta": 100.0,
+                "use_quantize": false,
+                "use_lora": false,
+                "adaptive_mode": null
+            }
+        },
+        "num_hidden_layers": 18,
+        "num_attention_heads": 8,
+        "num_key_value_heads": 1,
+        "num_inference_steps": 10,
+        "head_dim": 256,
+        "rms_norm_eps": 1e-06,
+        "attention_bias": false,
+        "attention_dropout": 0.0,
+        "max_image_text_tokens": 276,
+        "cond_steps": 1,
+        "horizon_steps": 4,
+        "pad_token_id": 0,
+        "time_hidden_size": 256,
+        "action_expert_adaptive_mode": null,
+        "final_action_clip_value": 1.0,
+        "action_dim": 7,
+        "proprio_dim": 7,
+        "time_max_period": 100.0,
+        "flow_sampling": "beta",
+        "flow_alpha": 1.5,
+        "flow_beta": 1.0,
+        "flow_sig_min": 0.001
+    },
+    "trainer": {
+        "accelerator": "gpu",
+        "strategy": "deepspeed_stage_2",
+        "precision": "bf16",
+        "logger": [
+            "tensorboard"
+        ],
+        "gradient_clip_val": 1.0,
+        "use_distributed_sampler": false,
+        "log_every_n_steps": 10,
+        "max_epochs": 10,
+        "val_check_interval": null,
+        "check_val_every_n_epoch": 1,
+        "max_steps": -1,
+        "accumulate_grad_batches": 4
+    },
+    "use_cot_data": true,
+    "train_dataset": {
+        "type": "OpenVLADatasetByRank",
+        "data_root_dir": "/share/project/lxh/datasets/tf_datasets",
+        "model_name": "paligemma",
+        "image_aug": true,
+        "mode": "train",
+        "data_mix": "bridge",
+        "window_sample": "sliding",
+        "organize_type": "interleave",
+        "shuffle_buffer_size": 51200,
+        "train": true
+    },
+    "val_dataset": {
+        "type": "OpenVLADatasetByRank",
+        "data_root_dir": "/share/project/lxh/datasets/tf_datasets",
+        "model_name": "paligemma",
+        "mode": "train",
+        "data_mix": "bridge",
+        "window_sample": "sliding",
+        "organize_type": "interleave",
+        "shuffle_buffer_size": 10000,
+        "train": false
+    },
+    "raw_config_path": "/share/project/lxh/project/RoboVLMs-dev/configs/baai_oxe/finetune_pizero_bridge_no-cot.json",
+    "config": "/share/project/lxh/project/RoboVLMs-dev/configs/baai_oxe/finetune_pizero_bridge_no-cot.json",
+    "gpus": 8,
+    "num_nodes": 4,
+    "log_dir": "runs/logs/pizero/pizero_finetune/2025-04-29/05-54",
+    "output_dir": "runs/checkpoints/pizero/pizero_finetune/2025-04-29/05-54",
+    "data_dir": null,
+    "annotation_file": null,
+    "data_subfolder": null,
+    "task_num": null,
+    "exp_name": "05-54",
+    "use_multi_modal_emb": false,
+    "no_video_pretrained_model": false,
+    "finetune": false,
+    "llm": {
+        "type": null,
+        "n_embd": null,
+        "n_layer": null,
+        "n_head": null
+    }
+}