|
{ |
|
"type": "smolvla", |
|
"n_obs_steps": 1, |
|
"normalization_mapping": { |
|
"VISUAL": "IDENTITY", |
|
"STATE": "MEAN_STD", |
|
"ACTION": "MEAN_STD" |
|
}, |
|
"input_features": { |
|
"observation.state": { |
|
"type": "STATE", |
|
"shape": [ |
|
6 |
|
] |
|
}, |
|
"observation.images.top": { |
|
"type": "VISUAL", |
|
"shape": [ |
|
3, |
|
480, |
|
640 |
|
] |
|
}, |
|
"observation.images.wrist_left": { |
|
"type": "VISUAL", |
|
"shape": [ |
|
3, |
|
480, |
|
640 |
|
] |
|
} |
|
}, |
|
"output_features": { |
|
"action": { |
|
"type": "ACTION", |
|
"shape": [ |
|
6 |
|
] |
|
} |
|
}, |
|
"device": "cuda", |
|
"use_amp": false, |
|
"chunk_size": 50, |
|
"n_action_steps": 1, |
|
"max_state_dim": 32, |
|
"max_action_dim": 32, |
|
"resize_imgs_with_padding": [ |
|
512, |
|
512 |
|
], |
|
"empty_cameras": 0, |
|
"adapt_to_pi_aloha": false, |
|
"use_delta_joint_actions_aloha": false, |
|
"tokenizer_max_length": 48, |
|
"num_steps": 10, |
|
"use_cache": true, |
|
"freeze_vision_encoder": true, |
|
"train_expert_only": true, |
|
"train_state_proj": true, |
|
"optimizer_lr": 0.0001, |
|
"optimizer_betas": [ |
|
0.9, |
|
0.95 |
|
], |
|
"optimizer_eps": 1e-08, |
|
"optimizer_weight_decay": 1e-10, |
|
"optimizer_grad_clip_norm": 10.0, |
|
"scheduler_warmup_steps": 1000, |
|
"scheduler_decay_steps": 30000, |
|
"scheduler_decay_lr": 2.5e-06, |
|
"vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", |
|
"load_vlm_weights": true, |
|
"add_image_special_tokens": false, |
|
"attention_mode": "cross_attn", |
|
"prefix_length": 0, |
|
"pad_language_to": "max_length", |
|
"num_expert_layers": 0, |
|
"num_vlm_layers": 16, |
|
"self_attn_every_n_layers": 2, |
|
"expert_width_multiplier": 0.75, |
|
"min_period": 0.004, |
|
"max_period": 4.0 |
|
} |