Robotics
Transformers
Safetensors
English
VLA
Libero-Goal-2 / configuration_hume.py
Hume-vla's picture
Upload folder using huggingface_hub
72bf50e verified
from dataclasses import dataclass, field
from lerobot.common.optim.optimizers import AdamWConfig
from lerobot.common.optim.schedulers import (
CosineDecayWithWarmupSchedulerConfig,
)
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
@PreTrainedConfig.register_subclass("hume")
@dataclass
class HumeConfig(PreTrainedConfig):
# Input / output structure.
type: str = "hume"
n_obs_steps: int = 1
s1_chunk_size: int = 10
s2_chunk_size: int = 50
n_action_steps: int = 50
normalization_mapping: dict[str, NormalizationMode] = field(
default_factory=lambda: {
"VISUAL": NormalizationMode.IDENTITY,
"STATE": NormalizationMode.MEAN_STD,
"ACTION": NormalizationMode.MEAN_STD,
}
)
# Shorter state and action vectors will be padded
max_state_dim: int = 32
max_action_dim: int = 32
# Image preprocessing
resize_imgs_with_padding: tuple[int, int] = (224, 224)
# Add empty images. Used by pi0_aloha_sim which adds the empty
# left and right wrist cameras in addition to the top camera.
empty_cameras: int = 0
# Converts the joint and gripper values from the standard Aloha space to
# the space used by the pi internal runtime which was used to train the base model.
adapt_to_pi_aloha: bool = False
# Converts joint dimensions to deltas with respect to the current state before passing to the model.
# Gripper dimensions will remain in absolute values.
use_delta_joint_actions_aloha: bool = False
# Tokenizer
tokenizer_max_length: int = 48
# Projector
proj_width: int = 1024
# Decoding
num_steps: int = 10
# Attention utils
use_cache: bool = True
attention_implementation: str = "eager" # or fa2, flex
# Finetuning settings
freeze_vision_encoder: bool = True
train_expert_only: bool = False
train_state_proj: bool = True
# Training presets
optimizer_lr: float = 2.5e-5
optimizer_betas: tuple[float, float] = (0.9, 0.95)
optimizer_eps: float = 1e-8
optimizer_weight_decay: float = 1e-10
scheduler_warmup_steps: int = 1_000
scheduler_decay_steps: int = 30_000
scheduler_decay_lr: float = 2.5e-6
# + Aadditional attributes for s1 / s2
# freeze system
freeze_s2: bool = False
s1_his_state_size: int = 1
cache_s2_actions: bool = False
# denoise ratio
theta2: float = 1.0
theta1: float = 1.0
noise_slides_eps: float = 0.0
noise_slides_alp: float = 0.0
# projector
s1_proj_width: int = 512 # NOTE: consitent with the s1_gemma_expert_config
freeze_s1_vision_encoder: bool = False
# decoding
s1_num_steps: int = 10
# vqh
num_pos: int = 3
discount: float = 0.98
actor_lr: float = 1e-4 # actor learning rate
critic_lr: float = 3e-4
temp_lr: float = 3e-4
qf_lr: float = 3e-4 # Critics learning rate
next_obs_offset: int = 10 # should be equal to vqh_chunk_size
vqh_chunk_size: int = 10
paligemma_config: dict = field(
default_factory=lambda: {
"bos_token_id": 2,
"eos_token_id": 1,
"hidden_size": 2048,
"ignore_index": -100,
"image_token_index": 257152,
"model_type": "paligemma",
"pad_token_id": 0,
"projection_dim": 2048,
"text_config": {
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 2048,
"intermediate_size": 16384,
"model_type": "gemma",
"num_attention_heads": 8,
"num_hidden_layers": 18,
"num_image_tokens": 256,
"num_key_value_heads": 1,
"torch_dtype": "float32",
"vocab_size": 257152,
},
"torch_dtype": "float32",
"transformers_version": "4.48.1",
"vision_config": {
"hidden_size": 1152,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"num_image_tokens": 256,
"patch_size": 14,
"projection_dim": 2048,
"projector_hidden_act": "gelu_fast",
"vision_use_head": False,
},
"vocab_size": 257152,
}
)
gemma_expert_config: dict = field(
default_factory=lambda: {
"attention_bias": False,
"attention_dropout": 0.0,
"bos_token_id": 2,
"eos_token_id": 1,
"head_dim": 256,
"hidden_act": "gelu_pytorch_tanh",
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 8192,
"model_type": "gemma",
"num_attention_heads": 8,
"num_hidden_layers": 18,
"num_key_value_heads": 1,
"pad_token_id": 0,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"torch_dtype": "float32",
"transformers_version": "4.48.1",
"use_cache": True,
"vocab_size": 257152,
}
)
# TODO: Add EMA
# system2 configurations
s1_dino_config: dict = field(
default_factory=lambda: {
"model_type": "dinov2",
"attention_probs_dropout_prob": 0.0,
"drop_path_rate": 0.0,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"hidden_size": 384,
"image_size": 518,
"initializer_range": 0.02,
"layer_norm_eps": 1e-06,
"layerscale_value": 1.0,
"mlp_ratio": 4,
"num_attention_heads": 6,
"num_channels": 3,
"num_hidden_layers": 12,
"patch_size": 14,
"qkv_bias": True,
"torch_dtype": "float32",
"use_swiglu_ffn": False,
}
)
s1_gemma_expert_config: dict = field(
default_factory=lambda: {
"attention_bias": False,
"attention_dropout": 0.0,
"bos_token_id": 2,
"eos_token_id": 1,
"head_dim": 128,
"hidden_act": "gelu_pytorch_tanh",
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 2048,
"max_position_embeddings": 8192,
"model_type": "gemma",
"num_attention_heads": 8,
"num_hidden_layers": 13,
"num_key_value_heads": 1,
"pad_token_id": 0,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"torch_dtype": "float32",
"transformers_version": "4.48.1",
"use_cache": True,
"vocab_size": 257152,
}
)
def __post_init__(self):
super().__post_init__()
"""Input validation (not exhaustive)."""
if self.n_action_steps > self.s2_chunk_size:
raise ValueError(
f"The chunk size is the upper bound for the number of action steps per model invocation. Got "
f"{self.n_action_steps} for `n_action_steps` and {self.s2_chunk_size} for `chunk_size`."
)
if self.n_obs_steps != 1:
raise ValueError(
f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`"
)
if self.use_delta_joint_actions_aloha:
raise NotImplementedError(
"`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not ported yet in LeRobot."
)
def validate_features(self) -> None:
# TODO: implement value error
# if not self.image_features and not self.env_state_feature:
# raise ValueError("You must provide at least one image or the environment state among the inputs.")
for i in range(self.empty_cameras):
key = f"observation.images.empty_camera_{i}"
empty_camera = PolicyFeature(
type=FeatureType.VISUAL,
shape=(3, 480, 640),
)
self.input_features[key] = empty_camera
def get_optimizer_preset(self) -> dict[AdamWConfig]:
qf_optimizer = AdamWConfig(
lr=self.qf_lr,
weight_decay=0,
grad_clip_norm=10,
)
actor_optimizer = AdamWConfig(
lr=self.actor_lr,
weight_decay=0,
grad_clip_norm=10,
)
trunk_optimizer = AdamWConfig(
lr=self.optimizer_lr,
betas=self.optimizer_betas,
eps=self.optimizer_eps,
weight_decay=self.optimizer_weight_decay,
)
optimizer_dict = dict(
qf_optimizer=qf_optimizer,
actor_optimizer=actor_optimizer,
trunk_optimizer=trunk_optimizer,
)
return optimizer_dict
def get_scheduler_preset(self):
return CosineDecayWithWarmupSchedulerConfig(
peak_lr=self.optimizer_lr,
decay_lr=self.scheduler_decay_lr,
num_warmup_steps=self.scheduler_warmup_steps,
num_decay_steps=self.scheduler_decay_steps,
)
@property
def observation_delta_indices(self) -> None:
return None
@property
def action_delta_indices(self) -> list:
return list(range(self.s2_chunk_size))
@property
def reward_delta_indices(self) -> None:
return None
@property
def slide(self) -> None:
return self.s2_chunk_size // self.s1_chunk_size
@property
def s1_action_steps(self) -> None:
return self.s1_chunk_size
@property
def s2_action_steps(self) -> None:
return self.s2_chunk_size
from dataclasses import dataclass, field
from lerobot.common.optim.optimizers import AdamWConfig
from lerobot.common.optim.schedulers import (
CosineDecayWithWarmupSchedulerConfig,
)
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
@PreTrainedConfig.register_subclass("system2")
@dataclass
class System2Config(PreTrainedConfig):
# Input / output structure.
num_pos: int = 3
discount: float = 0.98
n_obs_steps: int = 1
chunk_size: int = 50
n_action_steps: int = 50
next_obs_offset: int = 1
s1_his_state_size: int = 1
normalization_mapping: dict[str, NormalizationMode] = field(
default_factory=lambda: {
"VISUAL": NormalizationMode.IDENTITY,
"STATE": NormalizationMode.MEAN_STD,
"ACTION": NormalizationMode.MEAN_STD,
}
)
# Shorter state and action vectors will be padded
max_state_dim: int = 32
max_action_dim: int = 32
# Image preprocessing
resize_imgs_with_padding: tuple[int, int] = (224, 224)
# Add empty images. Used by pi0_aloha_sim which adds the empty
# left and right wrist cameras in addition to the top camera.
empty_cameras: int = 0
# Converts the joint and gripper values from the standard Aloha space to
# the space used by the pi internal runtime which was used to train the base model.
adapt_to_pi_aloha: bool = False
# Converts joint dimensions to deltas with respect to the current state before passing to the model.
# Gripper dimensions will remain in absolute values.
use_delta_joint_actions_aloha: bool = False
# Tokenizer
tokenizer_max_length: int = 48
# Projector
proj_width: int = 1024
# Decoding
num_steps: int = 10
# Attention utils
use_cache: bool = True
attention_implementation: str = "eager" # or fa2, flex
# Finetuning settings
freeze_vision_encoder: bool = True
train_expert_only: bool = False
train_state_proj: bool = True
# Training presets
optimizer_lr: float = 2.5e-5
optimizer_betas: tuple[float, float] = (0.9, 0.95)
optimizer_eps: float = 1e-8
optimizer_weight_decay: float = 1e-10
scheduler_warmup_steps: int = 1_000
scheduler_decay_steps: int = 30_000
scheduler_decay_lr: float = 2.5e-6
paligemma_config: dict = field(
default_factory=lambda: {
"bos_token_id": 2,
"eos_token_id": 1,
"hidden_size": 2048,
"ignore_index": -100,
"image_token_index": 257152,
"model_type": "paligemma",
"pad_token_id": 0,
"projection_dim": 2048,
"text_config": {
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 2048,
"intermediate_size": 16384,
"model_type": "gemma",
"num_attention_heads": 8,
"num_hidden_layers": 18,
"num_image_tokens": 256,
"num_key_value_heads": 1,
"torch_dtype": "float32",
"vocab_size": 257152,
},
"torch_dtype": "float32",
"transformers_version": "4.48.1",
"vision_config": {
"hidden_size": 1152,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"num_image_tokens": 256,
"patch_size": 14,
"projection_dim": 2048,
"projector_hidden_act": "gelu_fast",
"vision_use_head": False,
},
"vocab_size": 257152,
}
)
gemma_expert_config: dict = field(
default_factory=lambda: {
"attention_bias": False,
"attention_dropout": 0.0,
"bos_token_id": 2,
"eos_token_id": 1,
"head_dim": 256,
"hidden_act": "gelu_pytorch_tanh",
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 8192,
"model_type": "gemma",
"num_attention_heads": 8,
"num_hidden_layers": 18,
"num_key_value_heads": 1,
"pad_token_id": 0,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"torch_dtype": "float32",
"transformers_version": "4.48.1",
"use_cache": True,
"vocab_size": 257152,
}
)
# TODO: Add EMA
def __post_init__(self):
super().__post_init__()
"""Input validation (not exhaustive)."""
if self.n_action_steps > self.chunk_size:
raise ValueError(
f"The chunk size is the upper bound for the number of action steps per model invocation. Got "
f"{self.n_action_steps} for `n_action_steps` and {self.chunk_size} for `chunk_size`."
)
if self.n_obs_steps != 1:
raise ValueError(
f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`"
)
if self.use_delta_joint_actions_aloha:
raise NotImplementedError(
"`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not ported yet in LeRobot."
)
def validate_features(self) -> None:
# TODO: implement value error
# if not self.image_features and not self.env_state_feature:
# raise ValueError("You must provide at least one image or the environment state among the inputs.")
for i in range(self.empty_cameras):
key = f"observation.images.empty_camera_{i}"
empty_camera = PolicyFeature(
type=FeatureType.VISUAL,
shape=(3, 480, 640),
)
self.input_features[key] = empty_camera
def get_optimizer_preset(self) -> AdamWConfig:
return AdamWConfig(
lr=self.optimizer_lr,
betas=self.optimizer_betas,
eps=self.optimizer_eps,
weight_decay=self.optimizer_weight_decay,
)
def get_scheduler_preset(self):
return CosineDecayWithWarmupSchedulerConfig(
peak_lr=self.optimizer_lr,
decay_lr=self.scheduler_decay_lr,
num_warmup_steps=self.scheduler_warmup_steps,
num_decay_steps=self.scheduler_decay_steps,
)
@property
def observation_delta_indices(self) -> None:
return None
@property
def action_delta_indices(self) -> list:
return list(range(self.chunk_size))
@property
def reward_delta_indices(self) -> None:
return None
@property
def slide(self) -> None:
return 1
@property
def s1_action_steps(self) -> None:
return 1