{ "action_space": null, "backbone": "vit_base_patch32_clip_224.openai", "decoder_kwargs": { "mem_len": 128, "num_layers": 4, "timesteps": 128 }, "freeze_backbone": true, "hiddim": 1024, "image_encoder_kwargs": { "dropout": 0.1, "num_heads": 8, "num_layers": 2 }, "video_encoder_kwargs": { "dropout": 0.1, "num_heads": 8, "num_spatial_layers": 2, "num_temporal_layers": 4 } }