Robotics
Transformers
Safetensors
English
VLA
Hume-vla committed
Commit 72bf50e · verified · Parent(s): 60b421a

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
array_typing.py ADDED
@@ -0,0 +1,80 @@
+from typing import Annotated, TypeAlias, TypedDict
+
+import numpy as np
+from jaxtyping import Float
+from numpy.typing import NDArray
+from torch import Tensor
+
+
+class InferConfig(TypedDict):
+    """Configuration for inference."""
+
+    replan_steps: int
+    s2_replan_steps: int
+    s2_candidates_num: int
+    noise_temp_lower_bound: float
+    noise_temp_upper_bound: float
+    time_temp_lower_bound: float
+    time_temp_upper_bound: float
+    post_process_action: bool
+    device: str
+
+
+ImageArray: TypeAlias = Annotated[NDArray[np.uint8], "Shape[B, H, W, C]"]
+StateArray: TypeAlias = Annotated[
+    NDArray[np.float32], "Shape[B, state_horizon, state_dim]"
+]
+ActionArray: TypeAlias = Annotated[NDArray[np.float32], "Shape[B, action_dim]"]
+
+InferBatchObs = TypedDict(
+    "InferBatchObs",
+    {
+        "observation.images.image": ImageArray,
+        "observation.images.wrist_image": ImageArray,
+        "observation.state": StateArray,
+        "task": list[str],
+    },
+)
+
+
+class InferOutput(TypedDict):
+    noise_action: Float[Tensor, "batch s2_chunksize padded_action_dim"]
+    s1_action: Float[Tensor, "batch s1_chunksize unpadded_action_dim"]
+    s2_action: Float[Tensor, "batch s2_chunksize unpadded_action_dim"]
+
+
+class CalQlBatch(TypedDict):
+    encoded_observations: Float[Tensor, "batch encoded_dim"]
+    encoded_next_observations: Float[Tensor, "batch encoded_dim"]
+    actions: Float[Tensor, "batch action_dim"]
+    rewards: Float[Tensor, " batch"]
+    mc_returns: Float[Tensor, " batch"]
+    masks: Float[Tensor, " batch"]
+
+
+class EnvArgs(TypedDict):
+    """Environment arguments."""
+
+    # necessary args
+    num_trials_per_task: int
+    num_steps_wait: int
+    task_suite_name: str
+    seed: int
+    ckpt_path: str | None
+    eval_name: str | None
+
+
+class Request(TypedDict):
+    """Message received by the environment."""
+
+    frame_type: str  # "init" | "action"
+    env_args: EnvArgs | None
+    action: ActionArray | None
+
+
+class Response(TypedDict):
+    """Message sent by the environment."""
+
+    status: str  # "new_episode" | "eval_finished" | "in_episode"
+    success_rate: float | None
+    observation: InferBatchObs | None
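These TypedDicts only document keys and shapes; nothing is enforced at runtime. Below is a minimal sketch of how a client might populate them before calling HumePolicy.init_infer / infer (defined in modeling_hume.py further down). It assumes array_typing.py and its dependencies (numpy, torch, jaxtyping) are importable; every numeric value is an illustrative placeholder, with the chunk-related settings mirroring the config.json in this commit.

import numpy as np

from array_typing import InferBatchObs, InferConfig

# Inference settings consumed by HumePolicy.init_infer (illustrative values;
# replan_steps / s2_replan_steps mirror the s1/s2 chunk sizes in config.json).
infer_cfg: InferConfig = {
    "replan_steps": 8,
    "s2_replan_steps": 16,
    "s2_candidates_num": 5,
    "noise_temp_lower_bound": 1.0,
    "noise_temp_upper_bound": 1.0,
    "time_temp_lower_bound": 1.0,
    "time_temp_upper_bound": 1.0,
    "post_process_action": True,
    "device": "cpu",
}

# One observation batch with the documented layouts: images are (B, H, W, C)
# uint8, the state is (B, state_horizon, state_dim) float32.
obs: InferBatchObs = {
    "observation.images.image": np.zeros((1, 256, 256, 3), dtype=np.uint8),
    "observation.images.wrist_image": np.zeros((1, 256, 256, 3), dtype=np.uint8),
    "observation.state": np.zeros((1, 1, 8), dtype=np.float32),
    "task": ["pick up the cube"],
}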
config.json ADDED
@@ -0,0 +1,283 @@
+{
+    "n_obs_steps": 1,
+    "normalization_mapping": {
+        "VISUAL": "IDENTITY",
+        "STATE": "MEAN_STD",
+        "ACTION": "MEAN_STD"
+    },
+    "input_features": {
+        "observation.images.image": {
+            "type": "VISUAL",
+            "shape": [
+                3,
+                256,
+                256
+            ]
+        },
+        "observation.images.wrist_image": {
+            "type": "VISUAL",
+            "shape": [
+                3,
+                256,
+                256
+            ]
+        },
+        "observation.state": {
+            "type": "STATE",
+            "shape": [
+                8
+            ]
+        }
+    },
+    "output_features": {
+        "action": {
+            "type": "ACTION",
+            "shape": [
+                7
+            ]
+        }
+    },
+    "device": "cpu",
+    "use_amp": false,
+    "type": "hume",
+    "s1_chunk_size": 8,
+    "s2_chunk_size": 16,
+    "n_action_steps": 16,
+    "max_state_dim": 32,
+    "max_action_dim": 32,
+    "resize_imgs_with_padding": [
+        224,
+        224
+    ],
+    "empty_cameras": 0,
+    "adapt_to_pi_aloha": false,
+    "use_delta_joint_actions_aloha": false,
+    "tokenizer_max_length": 48,
+    "proj_width": 1024,
+    "num_steps": 10,
+    "use_cache": true,
+    "attention_implementation": "eager",
+    "freeze_vision_encoder": true,
+    "train_expert_only": false,
+    "train_state_proj": true,
+    "optimizer_lr": 5e-05,
+    "optimizer_betas": [
+        0.9,
+        0.95
+    ],
+    "optimizer_eps": 1e-08,
+    "optimizer_weight_decay": 1e-10,
+    "scheduler_warmup_steps": 1000,
+    "scheduler_decay_steps": 1600000,
+    "scheduler_decay_lr": 2.5e-06,
+    "freeze_s2": true,
+    "s1_his_state_size": 4,
+    "cache_s2_actions": false,
+    "theta2": 1.0,
+    "theta1": 1.0,
+    "noise_slides_eps": 0.0,
+    "noise_slides_alp": 0.0,
+    "s1_proj_width": 512,
+    "freeze_s1_vision_encoder": false,
+    "s1_num_steps": 10,
+    "num_pos": 3,
+    "discount": 0.98,
+    "actor_lr": 1e-05,
+    "critic_lr": 1e-05,
+    "temp_lr": 2e-05,
+    "qf_lr": 0.0003,
+    "next_obs_offset": 1,
+    "vqh_chunk_size": 1,
+    "paligemma_config": {
+        "bos_token_id": 2,
+        "eos_token_id": 1,
+        "hidden_size": 2048,
+        "ignore_index": -100,
+        "image_token_index": 257152,
+        "model_type": "paligemma",
+        "pad_token_id": 0,
+        "projection_dim": 2048,
+        "text_config": {
+            "hidden_activation": "gelu_pytorch_tanh",
+            "hidden_size": 2048,
+            "intermediate_size": 16384,
+            "model_type": "gemma",
+            "num_attention_heads": 8,
+            "num_hidden_layers": 18,
+            "num_image_tokens": 256,
+            "num_key_value_heads": 1,
+            "torch_dtype": "float32",
+            "vocab_size": 257152
+        },
+        "torch_dtype": "float32",
+        "transformers_version": "4.48.1",
+        "vision_config": {
+            "hidden_size": 1152,
+            "intermediate_size": 4304,
+            "model_type": "siglip_vision_model",
+            "num_attention_heads": 16,
+            "num_hidden_layers": 27,
+            "num_image_tokens": 256,
+            "patch_size": 14,
+            "projection_dim": 2048,
+            "projector_hidden_act": "gelu_fast",
+            "vision_use_head": false
+        },
+        "vocab_size": 257152
+    },
+    "gemma_expert_config": {
+        "attention_bias": false,
+        "attention_dropout": 0.0,
+        "bos_token_id": 2,
+        "eos_token_id": 1,
+        "head_dim": 256,
+        "hidden_act": "gelu_pytorch_tanh",
+        "hidden_activation": "gelu_pytorch_tanh",
+        "hidden_size": 1024,
+        "initializer_range": 0.02,
+        "intermediate_size": 4096,
+        "max_position_embeddings": 8192,
+        "model_type": "gemma",
+        "num_attention_heads": 8,
+        "num_hidden_layers": 18,
+        "num_key_value_heads": 1,
+        "pad_token_id": 0,
+        "rms_norm_eps": 1e-06,
+        "rope_theta": 10000.0,
+        "torch_dtype": "float32",
+        "transformers_version": "4.48.1",
+        "use_cache": true,
+        "vocab_size": 257152
+    },
+    "s1_dino_config": {
+        "return_dict": true,
+        "output_hidden_states": false,
+        "output_attentions": false,
+        "torchscript": false,
+        "torch_dtype": "float32",
+        "use_bfloat16": false,
+        "tf_legacy_loss": false,
+        "pruned_heads": {},
+        "tie_word_embeddings": true,
+        "chunk_size_feed_forward": 0,
+        "is_encoder_decoder": false,
+        "is_decoder": false,
+        "cross_attention_hidden_size": null,
+        "add_cross_attention": false,
+        "tie_encoder_decoder": false,
+        "max_length": 20,
+        "min_length": 0,
+        "do_sample": false,
+        "early_stopping": false,
+        "num_beams": 1,
+        "num_beam_groups": 1,
+        "diversity_penalty": 0.0,
+        "temperature": 1.0,
+        "top_k": 50,
+        "top_p": 1.0,
+        "typical_p": 1.0,
+        "repetition_penalty": 1.0,
+        "length_penalty": 1.0,
+        "no_repeat_ngram_size": 0,
+        "encoder_no_repeat_ngram_size": 0,
+        "bad_words_ids": null,
+        "num_return_sequences": 1,
+        "output_scores": false,
+        "return_dict_in_generate": false,
+        "forced_bos_token_id": null,
+        "forced_eos_token_id": null,
+        "remove_invalid_values": false,
+        "exponential_decay_length_penalty": null,
+        "suppress_tokens": null,
+        "begin_suppress_tokens": null,
+        "architectures": [
+            "Dinov2Model"
+        ],
+        "finetuning_task": null,
+        "id2label": {
+            "0": "LABEL_0",
+            "1": "LABEL_1"
+        },
+        "label2id": {
+            "LABEL_0": 0,
+            "LABEL_1": 1
+        },
+        "tokenizer_class": null,
+        "prefix": null,
+        "bos_token_id": null,
+        "pad_token_id": null,
+        "eos_token_id": null,
+        "sep_token_id": null,
+        "decoder_start_token_id": null,
+        "task_specific_params": null,
+        "problem_type": null,
+        "_name_or_path": "../pretrained/dinov2-small",
+        "_attn_implementation_autoset": false,
+        "transformers_version": "4.52.0.dev0",
+        "model_type": "dinov2",
+        "hidden_size": 384,
+        "num_hidden_layers": 12,
+        "num_attention_heads": 6,
+        "mlp_ratio": 4,
+        "hidden_act": "gelu",
+        "hidden_dropout_prob": 0.0,
+        "attention_probs_dropout_prob": 0.0,
+        "initializer_range": 0.02,
+        "layer_norm_eps": 1e-06,
+        "image_size": 518,
+        "patch_size": 14,
+        "num_channels": 3,
+        "qkv_bias": true,
+        "layerscale_value": 1.0,
+        "drop_path_rate": 0.0,
+        "use_swiglu_ffn": false,
+        "stage_names": [
+            "stem",
+            "stage1",
+            "stage2",
+            "stage3",
+            "stage4",
+            "stage5",
+            "stage6",
+            "stage7",
+            "stage8",
+            "stage9",
+            "stage10",
+            "stage11",
+            "stage12"
+        ],
+        "apply_layernorm": true,
+        "reshape_hidden_states": true,
+        "use_mask_token": true,
+        "out_features": [
+            "stage12"
+        ],
+        "out_indices": [
+            12
+        ]
+    },
+    "s1_gemma_expert_config": {
+        "attention_bias": false,
+        "attention_dropout": 0.0,
+        "bos_token_id": 2,
+        "eos_token_id": 1,
+        "head_dim": 128,
+        "hidden_act": "gelu_pytorch_tanh",
+        "hidden_activation": "gelu_pytorch_tanh",
+        "hidden_size": 512,
+        "initializer_range": 0.02,
+        "intermediate_size": 2048,
+        "max_position_embeddings": 8192,
+        "model_type": "gemma",
+        "num_attention_heads": 8,
+        "num_hidden_layers": 13,
+        "num_key_value_heads": 1,
+        "pad_token_id": 0,
+        "rms_norm_eps": 1e-06,
+        "rope_theta": 10000.0,
+        "torch_dtype": "float32",
+        "transformers_version": "4.48.1",
+        "use_cache": true,
+        "vocab_size": 257152
+    }
+}
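For orientation, the fields most relevant at inference time can be read straight from this file with the standard library. A minimal sketch, assuming config.json has been downloaded to the working directory:

import json

with open("config.json") as f:
    cfg = json.load(f)

# Two 3x256x256 camera views plus an 8-dim state go in; a 7-dim action comes out.
for name, feat in cfg["input_features"].items():
    print(f"{name}: type={feat['type']} shape={feat['shape']}")
print("action shape:", cfg["output_features"]["action"]["shape"])

# System 1 refines slices of the longer System 2 chunk (8 vs. 16 steps here).
print(cfg["s1_chunk_size"], cfg["s2_chunk_size"], cfg["n_action_steps"])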
configuration_hume.py ADDED
@@ -0,0 +1,528 @@
+from dataclasses import dataclass, field
+
+from lerobot.common.optim.optimizers import AdamWConfig
+from lerobot.common.optim.schedulers import (
+    CosineDecayWithWarmupSchedulerConfig,
+)
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
+
+
+@PreTrainedConfig.register_subclass("hume")
+@dataclass
+class HumeConfig(PreTrainedConfig):
+    # Input / output structure.
+    type: str = "hume"
+    n_obs_steps: int = 1
+    s1_chunk_size: int = 10
+    s2_chunk_size: int = 50
+    n_action_steps: int = 50
+
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "STATE": NormalizationMode.MEAN_STD,
+            "ACTION": NormalizationMode.MEAN_STD,
+        }
+    )
+
+    # Shorter state and action vectors will be padded
+    max_state_dim: int = 32
+    max_action_dim: int = 32
+
+    # Image preprocessing
+    resize_imgs_with_padding: tuple[int, int] = (224, 224)
+
+    # Add empty images. Used by pi0_aloha_sim which adds the empty
+    # left and right wrist cameras in addition to the top camera.
+    empty_cameras: int = 0
+
+    # Converts the joint and gripper values from the standard Aloha space to
+    # the space used by the pi internal runtime which was used to train the base model.
+    adapt_to_pi_aloha: bool = False
+
+    # Converts joint dimensions to deltas with respect to the current state before passing to the model.
+    # Gripper dimensions will remain in absolute values.
+    use_delta_joint_actions_aloha: bool = False
+
+    # Tokenizer
+    tokenizer_max_length: int = 48
+
+    # Projector
+    proj_width: int = 1024
+
+    # Decoding
+    num_steps: int = 10
+
+    # Attention utils
+    use_cache: bool = True
+    attention_implementation: str = "eager"  # or fa2, flex
+
+    # Finetuning settings
+    freeze_vision_encoder: bool = True
+    train_expert_only: bool = False
+    train_state_proj: bool = True
+
+    # Training presets
+    optimizer_lr: float = 2.5e-5
+    optimizer_betas: tuple[float, float] = (0.9, 0.95)
+    optimizer_eps: float = 1e-8
+    optimizer_weight_decay: float = 1e-10
+
+    scheduler_warmup_steps: int = 1_000
+    scheduler_decay_steps: int = 30_000
+    scheduler_decay_lr: float = 2.5e-6
+
+    # + Additional attributes for s1 / s2
+    # freeze system
+    freeze_s2: bool = False
+    s1_his_state_size: int = 1
+    cache_s2_actions: bool = False
+
+    # denoise ratio
+    theta2: float = 1.0
+    theta1: float = 1.0
+    noise_slides_eps: float = 0.0
+    noise_slides_alp: float = 0.0
+
+    # projector
+    s1_proj_width: int = 512  # NOTE: consistent with the s1_gemma_expert_config
+    freeze_s1_vision_encoder: bool = False
+
+    # decoding
+    s1_num_steps: int = 10
+
+    # vqh
+    num_pos: int = 3
+    discount: float = 0.98
+    actor_lr: float = 1e-4  # actor learning rate
+    critic_lr: float = 3e-4
+    temp_lr: float = 3e-4
+    qf_lr: float = 3e-4  # Critics learning rate
+    next_obs_offset: int = 10  # should be equal to vqh_chunk_size
+    vqh_chunk_size: int = 10
+
+    paligemma_config: dict = field(
+        default_factory=lambda: {
+            "bos_token_id": 2,
+            "eos_token_id": 1,
+            "hidden_size": 2048,
+            "ignore_index": -100,
+            "image_token_index": 257152,
+            "model_type": "paligemma",
+            "pad_token_id": 0,
+            "projection_dim": 2048,
+            "text_config": {
+                "hidden_activation": "gelu_pytorch_tanh",
+                "hidden_size": 2048,
+                "intermediate_size": 16384,
+                "model_type": "gemma",
+                "num_attention_heads": 8,
+                "num_hidden_layers": 18,
+                "num_image_tokens": 256,
+                "num_key_value_heads": 1,
+                "torch_dtype": "float32",
+                "vocab_size": 257152,
+            },
+            "torch_dtype": "float32",
+            "transformers_version": "4.48.1",
+            "vision_config": {
+                "hidden_size": 1152,
+                "intermediate_size": 4304,
+                "model_type": "siglip_vision_model",
+                "num_attention_heads": 16,
+                "num_hidden_layers": 27,
+                "num_image_tokens": 256,
+                "patch_size": 14,
+                "projection_dim": 2048,
+                "projector_hidden_act": "gelu_fast",
+                "vision_use_head": False,
+            },
+            "vocab_size": 257152,
+        }
+    )
+
+    gemma_expert_config: dict = field(
+        default_factory=lambda: {
+            "attention_bias": False,
+            "attention_dropout": 0.0,
+            "bos_token_id": 2,
+            "eos_token_id": 1,
+            "head_dim": 256,
+            "hidden_act": "gelu_pytorch_tanh",
+            "hidden_activation": "gelu_pytorch_tanh",
+            "hidden_size": 1024,
+            "initializer_range": 0.02,
+            "intermediate_size": 4096,
+            "max_position_embeddings": 8192,
+            "model_type": "gemma",
+            "num_attention_heads": 8,
+            "num_hidden_layers": 18,
+            "num_key_value_heads": 1,
+            "pad_token_id": 0,
+            "rms_norm_eps": 1e-06,
+            "rope_theta": 10000.0,
+            "torch_dtype": "float32",
+            "transformers_version": "4.48.1",
+            "use_cache": True,
+            "vocab_size": 257152,
+        }
+    )
+
+    # TODO: Add EMA
+
+    # system1 configurations
+    s1_dino_config: dict = field(
+        default_factory=lambda: {
+            "model_type": "dinov2",
+            "attention_probs_dropout_prob": 0.0,
+            "drop_path_rate": 0.0,
+            "hidden_act": "gelu",
+            "hidden_dropout_prob": 0.0,
+            "hidden_size": 384,
+            "image_size": 518,
+            "initializer_range": 0.02,
+            "layer_norm_eps": 1e-06,
+            "layerscale_value": 1.0,
+            "mlp_ratio": 4,
+            "num_attention_heads": 6,
+            "num_channels": 3,
+            "num_hidden_layers": 12,
+            "patch_size": 14,
+            "qkv_bias": True,
+            "torch_dtype": "float32",
+            "use_swiglu_ffn": False,
+        }
+    )
+
+    s1_gemma_expert_config: dict = field(
+        default_factory=lambda: {
+            "attention_bias": False,
+            "attention_dropout": 0.0,
+            "bos_token_id": 2,
+            "eos_token_id": 1,
+            "head_dim": 128,
+            "hidden_act": "gelu_pytorch_tanh",
+            "hidden_activation": "gelu_pytorch_tanh",
+            "hidden_size": 512,
+            "initializer_range": 0.02,
+            "intermediate_size": 2048,
+            "max_position_embeddings": 8192,
+            "model_type": "gemma",
+            "num_attention_heads": 8,
+            "num_hidden_layers": 13,
+            "num_key_value_heads": 1,
+            "pad_token_id": 0,
+            "rms_norm_eps": 1e-06,
+            "rope_theta": 10000.0,
+            "torch_dtype": "float32",
+            "transformers_version": "4.48.1",
+            "use_cache": True,
+            "vocab_size": 257152,
+        }
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        """Input validation (not exhaustive)."""
+        if self.n_action_steps > self.s2_chunk_size:
+            raise ValueError(
+                f"The chunk size is the upper bound for the number of action steps per model invocation. Got "
+                f"{self.n_action_steps} for `n_action_steps` and {self.s2_chunk_size} for `s2_chunk_size`."
+            )
+        if self.n_obs_steps != 1:
+            raise ValueError(
+                f"Multiple observation steps not handled yet. Got `n_obs_steps={self.n_obs_steps}`"
+            )
+
+        if self.use_delta_joint_actions_aloha:
+            raise NotImplementedError(
+                "`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not yet ported to LeRobot."
+            )
+
+    def validate_features(self) -> None:
+        # TODO: implement value error
+        # if not self.image_features and not self.env_state_feature:
+        #     raise ValueError("You must provide at least one image or the environment state among the inputs.")
+
+        for i in range(self.empty_cameras):
+            key = f"observation.images.empty_camera_{i}"
+            empty_camera = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(3, 480, 640),
+            )
+            self.input_features[key] = empty_camera
+
+    def get_optimizer_preset(self) -> dict[str, AdamWConfig]:
+        qf_optimizer = AdamWConfig(
+            lr=self.qf_lr,
+            weight_decay=0,
+            grad_clip_norm=10,
+        )
+        actor_optimizer = AdamWConfig(
+            lr=self.actor_lr,
+            weight_decay=0,
+            grad_clip_norm=10,
+        )
+
+        trunk_optimizer = AdamWConfig(
+            lr=self.optimizer_lr,
+            betas=self.optimizer_betas,
+            eps=self.optimizer_eps,
+            weight_decay=self.optimizer_weight_decay,
+        )
+
+        optimizer_dict = dict(
+            qf_optimizer=qf_optimizer,
+            actor_optimizer=actor_optimizer,
+            trunk_optimizer=trunk_optimizer,
+        )
+
+        return optimizer_dict
+
+    def get_scheduler_preset(self):
+        return CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.optimizer_lr,
+            decay_lr=self.scheduler_decay_lr,
+            num_warmup_steps=self.scheduler_warmup_steps,
+            num_decay_steps=self.scheduler_decay_steps,
+        )
+
+    @property
+    def observation_delta_indices(self) -> None:
+        return None
+
+    @property
+    def action_delta_indices(self) -> list:
+        return list(range(self.s2_chunk_size))
+
+    @property
+    def reward_delta_indices(self) -> None:
+        return None
+
+    @property
+    def slide(self) -> int:
+        return self.s2_chunk_size // self.s1_chunk_size
+
+    @property
+    def s1_action_steps(self) -> int:
+        return self.s1_chunk_size
+
+    @property
+    def s2_action_steps(self) -> int:
+        return self.s2_chunk_size
+
+
+from dataclasses import dataclass, field
+
+from lerobot.common.optim.optimizers import AdamWConfig
+from lerobot.common.optim.schedulers import (
+    CosineDecayWithWarmupSchedulerConfig,
+)
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
+
+
+@PreTrainedConfig.register_subclass("system2")
+@dataclass
+class System2Config(PreTrainedConfig):
+    # Input / output structure.
+    num_pos: int = 3
+    discount: float = 0.98
+    n_obs_steps: int = 1
+    chunk_size: int = 50
+    n_action_steps: int = 50
+    next_obs_offset: int = 1
+    s1_his_state_size: int = 1
+
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "STATE": NormalizationMode.MEAN_STD,
+            "ACTION": NormalizationMode.MEAN_STD,
+        }
+    )
+
+    # Shorter state and action vectors will be padded
+    max_state_dim: int = 32
+    max_action_dim: int = 32
+
+    # Image preprocessing
+    resize_imgs_with_padding: tuple[int, int] = (224, 224)
+
+    # Add empty images. Used by pi0_aloha_sim which adds the empty
+    # left and right wrist cameras in addition to the top camera.
+    empty_cameras: int = 0
+
+    # Converts the joint and gripper values from the standard Aloha space to
+    # the space used by the pi internal runtime which was used to train the base model.
+    adapt_to_pi_aloha: bool = False
+
+    # Converts joint dimensions to deltas with respect to the current state before passing to the model.
+    # Gripper dimensions will remain in absolute values.
+    use_delta_joint_actions_aloha: bool = False
+
+    # Tokenizer
+    tokenizer_max_length: int = 48
+
+    # Projector
+    proj_width: int = 1024
+
+    # Decoding
+    num_steps: int = 10
+
+    # Attention utils
+    use_cache: bool = True
+    attention_implementation: str = "eager"  # or fa2, flex
+
+    # Finetuning settings
+    freeze_vision_encoder: bool = True
+    train_expert_only: bool = False
+    train_state_proj: bool = True
+
+    # Training presets
+    optimizer_lr: float = 2.5e-5
+    optimizer_betas: tuple[float, float] = (0.9, 0.95)
+    optimizer_eps: float = 1e-8
+    optimizer_weight_decay: float = 1e-10
+
+    scheduler_warmup_steps: int = 1_000
+    scheduler_decay_steps: int = 30_000
+    scheduler_decay_lr: float = 2.5e-6
+
+    paligemma_config: dict = field(
+        default_factory=lambda: {
+            "bos_token_id": 2,
+            "eos_token_id": 1,
+            "hidden_size": 2048,
+            "ignore_index": -100,
+            "image_token_index": 257152,
+            "model_type": "paligemma",
+            "pad_token_id": 0,
+            "projection_dim": 2048,
+            "text_config": {
+                "hidden_activation": "gelu_pytorch_tanh",
+                "hidden_size": 2048,
+                "intermediate_size": 16384,
+                "model_type": "gemma",
+                "num_attention_heads": 8,
+                "num_hidden_layers": 18,
+                "num_image_tokens": 256,
+                "num_key_value_heads": 1,
+                "torch_dtype": "float32",
+                "vocab_size": 257152,
+            },
+            "torch_dtype": "float32",
+            "transformers_version": "4.48.1",
+            "vision_config": {
+                "hidden_size": 1152,
+                "intermediate_size": 4304,
+                "model_type": "siglip_vision_model",
+                "num_attention_heads": 16,
+                "num_hidden_layers": 27,
+                "num_image_tokens": 256,
+                "patch_size": 14,
+                "projection_dim": 2048,
+                "projector_hidden_act": "gelu_fast",
+                "vision_use_head": False,
+            },
+            "vocab_size": 257152,
+        }
+    )
+
+    gemma_expert_config: dict = field(
+        default_factory=lambda: {
+            "attention_bias": False,
+            "attention_dropout": 0.0,
+            "bos_token_id": 2,
+            "eos_token_id": 1,
+            "head_dim": 256,
+            "hidden_act": "gelu_pytorch_tanh",
+            "hidden_activation": "gelu_pytorch_tanh",
+            "hidden_size": 1024,
+            "initializer_range": 0.02,
+            "intermediate_size": 4096,
+            "max_position_embeddings": 8192,
+            "model_type": "gemma",
+            "num_attention_heads": 8,
+            "num_hidden_layers": 18,
+            "num_key_value_heads": 1,
+            "pad_token_id": 0,
+            "rms_norm_eps": 1e-06,
+            "rope_theta": 10000.0,
+            "torch_dtype": "float32",
+            "transformers_version": "4.48.1",
+            "use_cache": True,
+            "vocab_size": 257152,
+        }
+    )
+
+    # TODO: Add EMA
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        """Input validation (not exhaustive)."""
+        if self.n_action_steps > self.chunk_size:
+            raise ValueError(
+                f"The chunk size is the upper bound for the number of action steps per model invocation. Got "
+                f"{self.n_action_steps} for `n_action_steps` and {self.chunk_size} for `chunk_size`."
+            )
+        if self.n_obs_steps != 1:
+            raise ValueError(
+                f"Multiple observation steps not handled yet. Got `n_obs_steps={self.n_obs_steps}`"
+            )
+
+        if self.use_delta_joint_actions_aloha:
+            raise NotImplementedError(
+                "`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not yet ported to LeRobot."
+            )
+
+    def validate_features(self) -> None:
+        # TODO: implement value error
+        # if not self.image_features and not self.env_state_feature:
+        #     raise ValueError("You must provide at least one image or the environment state among the inputs.")
+
+        for i in range(self.empty_cameras):
+            key = f"observation.images.empty_camera_{i}"
+            empty_camera = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(3, 480, 640),
+            )
+            self.input_features[key] = empty_camera
+
+    def get_optimizer_preset(self) -> AdamWConfig:
+        return AdamWConfig(
+            lr=self.optimizer_lr,
+            betas=self.optimizer_betas,
+            eps=self.optimizer_eps,
+            weight_decay=self.optimizer_weight_decay,
+        )
+
+    def get_scheduler_preset(self):
+        return CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.optimizer_lr,
+            decay_lr=self.scheduler_decay_lr,
+            num_warmup_steps=self.scheduler_warmup_steps,
+            num_decay_steps=self.scheduler_decay_steps,
+        )
+
+    @property
+    def observation_delta_indices(self) -> None:
+        return None
+
+    @property
+    def action_delta_indices(self) -> list:
+        return list(range(self.chunk_size))
+
+    @property
+    def reward_delta_indices(self) -> None:
+        return None
+
+    @property
+    def slide(self) -> int:
+        return 1
+
+    @property
+    def s1_action_steps(self) -> int:
+        return 1
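The chunking properties above reduce to simple integer arithmetic. A standalone sketch with the values shipped in config.json, kept free of the lerobot imports so it only illustrates the relationships rather than instantiating the real classes:

s1_chunk_size = 8    # steps System 1 predicts per call (config.json)
s2_chunk_size = 16   # steps System 2 predicts per call (config.json)
n_action_steps = 16  # must not exceed s2_chunk_size (checked in __post_init__)

assert n_action_steps <= s2_chunk_size

slide = s2_chunk_size // s1_chunk_size             # HumeConfig.slide -> 2
action_delta_indices = list(range(s2_chunk_size))  # indices spanning one System 2 chunk

print(slide, len(action_delta_indices))  # 2 16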
fast_visuo_expert.py ADDED
@@ -0,0 +1,321 @@
+from typing import Optional
+
+import torch
+from torch import nn
+from transformers import (
+    AutoConfig,
+    Dinov2Model,
+    GemmaForCausalLM,
+    PretrainedConfig,
+    PreTrainedModel,
+)
+from transformers.models.auto import CONFIG_MAPPING
+
+
+def apply_rope(x, positions, max_wavelength=10_000):
+    """
+    Applies RoPE positions [B, L] to x [B, L, H, D].
+    """
+    d_half = x.shape[-1] // 2
+    device = x.device
+    dtype = x.dtype
+    x = x.to(torch.float32)
+
+    freq_exponents = (2.0 / x.shape[-1]) * torch.arange(
+        d_half, dtype=torch.float32, device=device
+    )
+    timescale = max_wavelength**freq_exponents
+    radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(
+        torch.float32
+    )
+
+    radians = radians[..., None, :]
+
+    sin = torch.sin(radians)  # .to(dtype=dtype)
+    cos = torch.cos(radians)  # .to(dtype=dtype)
+
+    x1, x2 = x.split(d_half, dim=-1)
+    res = torch.empty_like(x)
+    res[..., :d_half] = x1 * cos - x2 * sin
+    res[..., d_half:] = x2 * cos + x1 * sin
+
+    return res.to(dtype)
+
+
+class FastVisuoExpertConfig(PretrainedConfig):
+    model_type = "FastVisuoExpertModel"
+    sub_configs = {"dino_config": AutoConfig, "gemma_expert_config": AutoConfig}
+
+    def __init__(
+        self,
+        dino_config: dict | None = None,
+        gemma_expert_config: dict | None = None,
+        freeze_vision_encoder: bool = True,
+        attention_implementation: str = "eager",
+        **kwargs,
+    ):
+        self.freeze_vision_encoder = freeze_vision_encoder
+        self.attention_implementation = attention_implementation
+
+        if dino_config is None:
+            self.dino_config = CONFIG_MAPPING["dinov2"](
+                transformers_version="4.48.1",
+                model_type="dinov2",
+                attention_probs_dropout_prob=0.0,
+                drop_path_rate=0.0,
+                hidden_act="gelu",
+                hidden_dropout_prob=0.0,
+                hidden_size=384,
+                image_size=518,
+                initializer_range=0.02,
+                layer_norm_eps=1e-06,
+                layerscale_value=1.0,
+                mlp_ratio=4,
+                num_attention_heads=6,
+                num_channels=3,
+                num_hidden_layers=12,
+                patch_size=14,
+                qkv_bias=True,
+                torch_dtype="float32",
+                use_swiglu_ffn=False,
+            )
+        elif isinstance(dino_config, dict):
+            if "model_type" not in dino_config:
+                dino_config["model_type"] = "dinov2"
+            cfg_cls = CONFIG_MAPPING[dino_config["model_type"]]
+            self.dino_config = cfg_cls(**dino_config)
+
+        if gemma_expert_config is None:
+            self.gemma_expert_config = CONFIG_MAPPING["gemma"](
+                attention_bias=False,
+                attention_dropout=0.0,
+                bos_token_id=2,
+                eos_token_id=1,
+                head_dim=256,
+                hidden_act="gelu_pytorch_tanh",
+                hidden_activation="gelu_pytorch_tanh",
+                hidden_size=1024,
+                initializer_range=0.02,
+                intermediate_size=4096,
+                max_position_embeddings=8192,
+                model_type="gemma",
+                num_attention_heads=8,
+                num_hidden_layers=8,
+                num_key_value_heads=1,
+                pad_token_id=0,
+                rms_norm_eps=1e-06,
+                rope_theta=10000.0,
+                torch_dtype="float32",
+                transformers_version="4.48.1",
+                use_cache=True,
+                vocab_size=257152,
+            )
+        elif isinstance(gemma_expert_config, dict):
+            if "model_type" not in gemma_expert_config:
+                gemma_expert_config["model_type"] = "gemma"
+            cfg_cls = CONFIG_MAPPING[gemma_expert_config["model_type"]]
+            self.gemma_expert_config = cfg_cls(**gemma_expert_config)
+
+        super().__init__(**kwargs)
+
+    def __post_init__(self):
+        super().__post_init__()
+        if self.attention_implementation not in ["eager", "fa2", "flex"]:
+            raise ValueError(
+                f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'."
+            )
+
+
+class FastVisuoExpertModel(PreTrainedModel):
+    config_class = FastVisuoExpertConfig
+
+    def __init__(self, config: FastVisuoExpertConfig):
+        super().__init__(config=config)
+        self.config = config
+        self.vision_tower = Dinov2Model(config=config.dino_config)
+        self.gemma_expert = GemmaForCausalLM(
+            config=config.gemma_expert_config
+        )  # GemmaModel
+        self.multi_modal_projector = nn.Linear(
+            config.dino_config.hidden_size, config.gemma_expert_config.hidden_size
+        )
+        self.gemma_expert.model.embed_tokens = None
+        self.gemma_expert.lm_head = None
+
+        self.to_bfloat16_like_physical_intelligence()
+        self.set_requires_grad()
+
+    def set_requires_grad(self):
+        if self.config.freeze_vision_encoder:
+            self.vision_tower.eval()
+            for params in self.vision_tower.parameters():
+                params.requires_grad = False
+
+    def train(self, mode: bool = True):
+        super().train(mode)
+
+        if self.config.freeze_vision_encoder:
+            self.vision_tower.eval()
+
+    def to_bfloat16_like_physical_intelligence(self):
+        self.vision_tower = self.vision_tower.to(dtype=torch.bfloat16)
+        params_to_change_dtype = [
+            "language_model.model.layers",
+            "gemma_expert.model.layers",
+            "vision_tower",
+            "multi_modal",
+        ]
+        for name, param in self.named_parameters():
+            if any(selector in name for selector in params_to_change_dtype):
+                param.data = param.data.to(dtype=torch.bfloat16)
+
+    def embed_image(self, image: torch.Tensor):
+        selected_image_feature = self.vision_tower(image).last_hidden_state
+        image_features = self.multi_modal_projector(selected_image_feature)
+        image_features = image_features / (
+            self.config.gemma_expert_config.hidden_size**0.5
+        )
+        return image_features
+
+    # TODO: break down this huge forward into modules or functions
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ):
+        # RMSNorm
+        head_dim = self.gemma_expert.config.head_dim
+
+        hidden_states = inputs_embeds
+        batch_size = hidden_states.shape[0]
+        for layer in self.gemma_expert.model.layers[
+            : self.gemma_expert.config.num_hidden_layers
+        ]:
+            # normalizer = torch.tensor(model.config.hidden_size**0.5, dtype=hidden_states.dtype)
+            # hidden_states = hidden_states * normalizer
+            hidden_states = layer.input_layernorm(hidden_states)
+            input_shape = hidden_states.shape[:-1]
+            hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
+
+            # self attention
+            hidden_states = hidden_states.to(dtype=torch.bfloat16)
+            query_states = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
+            key_states = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
+            value_states = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
+
+            query_states = apply_rope(query_states, position_ids)
+            key_states = apply_rope(key_states, position_ids)
+
+            attention_interface = self.get_attention_interface()
+            att_output = attention_interface(
+                attention_mask,
+                batch_size,
+                head_dim,
+                query_states,
+                key_states,
+                value_states,
+            )
+
+            if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
+                att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
+
+            out_emb = layer.self_attn.o_proj(att_output)
+
+            # first residual
+            out_emb += hidden_states
+            after_first_residual = out_emb.clone()
+            out_emb = layer.post_attention_layernorm(out_emb)
+            out_emb = layer.mlp(out_emb)
+            # second residual
+            out_emb += after_first_residual
+            hidden_states = out_emb
+
+        # final norm
+        hidden_states = self.gemma_expert.model.norm(hidden_states)
+
+        return hidden_states
+
+    def get_attention_interface(self):
+        if self.config.attention_implementation == "fa2":
+            attention_interface = self.flash_attention_forward
+        else:
+            attention_interface = self.eager_attention_forward
+        return attention_interface
+
+    def eager_attention_forward(
+        self,
+        attention_mask,
+        batch_size,
+        head_dim,
+        query_states,
+        key_states,
+        value_states,
+    ):
+        num_att_heads = self.config.gemma_expert_config.num_attention_heads
+        num_key_value_heads = self.config.gemma_expert_config.num_key_value_heads
+        num_key_value_groups = num_att_heads // num_key_value_heads
+
+        # query_states: batch_size, sequence_length, num_att_head, head_dim
+        # key_states: batch_size, sequence_length, num_key_value_head, head_dim
+        # value_states: batch_size, sequence_length, num_key_value_head, head_dim
+        sequence_length = key_states.shape[1]
+
+        key_states = key_states[:, :, :, None, :].expand(
+            batch_size,
+            sequence_length,
+            num_key_value_heads,
+            num_key_value_groups,
+            head_dim,
+        )
+        key_states = key_states.reshape(
+            batch_size,
+            sequence_length,
+            num_key_value_heads * num_key_value_groups,
+            head_dim,
+        )
+
+        value_states = value_states[:, :, :, None, :].expand(
+            batch_size,
+            sequence_length,
+            num_key_value_heads,
+            num_key_value_groups,
+            head_dim,
+        )
+        value_states = value_states.reshape(
+            batch_size,
+            sequence_length,
+            num_key_value_heads * num_key_value_groups,
+            head_dim,
+        )
+
+        # Attention here is upcasted to float32 to match the original eager implementation.
+        query_states = query_states.to(dtype=torch.float32)
+        key_states = key_states.to(dtype=torch.float32)
+
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+
+        att_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+        att_weights *= head_dim**-0.5
+        big_neg = -2.3819763e38  # See gemma/modules.py
+
+        masked_att_weights = torch.where(
+            attention_mask[:, None, :, :], att_weights, big_neg
+        )
+
+        probs = nn.functional.softmax(masked_att_weights, dim=-1)
+        probs = probs.to(dtype=value_states.dtype)
+
+        # probs: batch_size, num_key_value_head, num_att_head, sequence_length, sequence_length
+        # value_states: batch_size, sequence_length, num_att_heads, head_dim
+
+        att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3))
+
+        att_output = att_output.permute(0, 2, 1, 3)
+        # we use -1 because sequence length can change
+        att_output = att_output.reshape(
+            batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim
+        )
+
+        return att_output
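A quick shape and sanity check for `apply_rope`, assuming fast_visuo_expert.py and its dependencies (torch, transformers) are importable from the working directory; the sizes below are arbitrary:

import torch

from fast_visuo_expert import apply_rope

B, L, H, D = 2, 6, 8, 256                 # batch, seq length, heads, head_dim (even)
x = torch.randn(B, L, H, D)
positions = torch.arange(L).expand(B, L)  # RoPE positions [B, L]

out = apply_rope(x, positions)
print(out.shape, out.dtype)               # torch.Size([2, 6, 8, 256]) torch.float32

# Position 0 applies a zero rotation, so the first token should come back unchanged.
assert torch.allclose(out[:, 0], x[:, 0], atol=1e-5)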
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58518ffa9166223aafee65453b3ed9cd4abde834949a61bbf78c3a5e99c1fe42
+size 9038608596
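The entry above is only a Git LFS pointer; the ~9 GB checkpoint itself has to be fetched (git lfs pull or huggingface_hub) before it can be read. A minimal sketch that lists tensor names and shapes without loading the full weight set, assuming the real file is present locally and the safetensors package is installed:

from safetensors import safe_open

with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    names = list(f.keys())
    print(len(names), "tensors in the checkpoint")
    for name in names[:5]:   # peek at a few entries lazily
        t = f.get_tensor(name)
        print(name, tuple(t.shape), t.dtype)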
modeling_hume.py ADDED
@@ -0,0 +1,1909 @@
1
+ import collections
2
+ import math
3
+ from argparse import Namespace
4
+ from collections import deque
5
+
6
+ import array_typing as at
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F # noqa: N812
10
+ import torchvision.transforms.functional as TF
11
+ from beartype import beartype as typechecker
12
+ from configuration_hume import HumeConfig, System2Config
13
+ from fast_visuo_expert import FastVisuoExpertConfig, FastVisuoExpertModel
14
+ from jaxtyping import Bool, Float, Int64, jaxtyped
15
+ from lerobot.common.constants import ACTION, OBS_ROBOT
16
+ from lerobot.common.policies.normalize import Normalize, Unnormalize
17
+ from lerobot.common.policies.pretrained import PreTrainedPolicy
18
+ from lerobot.common.utils.utils import get_safe_dtype
19
+ from paligemma_with_expert import (
20
+ PaliGemmaWithExpertConfig,
21
+ PaliGemmaWithExpertModel,
22
+ )
23
+ from torch import Tensor, nn
24
+ from transformers import AutoTokenizer
25
+ from value_query import (
26
+ CalQL,
27
+ CalQlConfig,
28
+ VQHBackbone,
29
+ VQHBackboneConfig,
30
+ )
31
+
32
+
33
+ def create_sinusoidal_pos_embedding(
34
+ time: torch.tensor,
35
+ dimension: int,
36
+ min_period: float,
37
+ max_period: float,
38
+ device="cpu",
39
+ ) -> Tensor:
40
+ """Computes sine-cosine positional embedding vectors for scalar positions."""
41
+ if dimension % 2 != 0:
42
+ raise ValueError(f"dimension ({dimension}) must be divisible by 2")
43
+
44
+ if time.ndim != 1:
45
+ raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
46
+
47
+ dtype = get_safe_dtype(torch.float64, device.type)
48
+ fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
49
+ period = min_period * (max_period / min_period) ** fraction
50
+
51
+ # Compute the outer product
52
+ scaling_factor = 1.0 / period * 2 * math.pi
53
+ sin_input = scaling_factor[None, :] * time[:, None]
54
+ pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
55
+ return pos_emb
56
+
57
+
58
+ def sample_beta(alpha, beta, bsize, device):
59
+ gamma1 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / alpha)
60
+ gamma2 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / beta)
61
+ return gamma1 / (gamma1 + gamma2)
62
+
63
+
64
+ def make_att_2d_masks(pad_masks, att_masks):
65
+ """Copied from big_vision.
66
+
67
+ Tokens can attend to valid inputs tokens which have a cumulative mask_ar
68
+ smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to
69
+ setup several types of attention, for example:
70
+
71
+ [[1 1 1 1 1 1]]: pure causal attention.
72
+
73
+ [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
74
+ themselves and the last 3 tokens have a causal attention. The first
75
+ entry could also be a 1 without changing behaviour.
76
+
77
+ [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
78
+ block can attend all previous blocks and all tokens on the same block.
79
+
80
+ Args:
81
+ input_mask: bool[B, N] true if its part of the input, false if padding.
82
+ mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on
83
+ it and 0 where it shares the same attention mask as the previous token.
84
+ """
85
+ if att_masks.ndim != 2:
86
+ raise ValueError(att_masks.ndim)
87
+ if pad_masks.ndim != 2:
88
+ raise ValueError(pad_masks.ndim)
89
+
90
+ cumsum = torch.cumsum(att_masks, dim=1)
91
+ att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None]
92
+ pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None]
93
+ att_2d_masks = att_2d_masks & pad_2d_masks
94
+ return att_2d_masks
95
+
96
+
97
+ def resize_with_pad(img, width, height, pad_value=-1):
98
+ # assume no-op when width height fits already
99
+ if img.ndim != 4:
100
+ raise ValueError(f"(b,c,h,w) expected, but {img.shape}")
101
+
102
+ cur_height, cur_width = img.shape[2:]
103
+
104
+ ratio = max(cur_width / width, cur_height / height)
105
+ resized_height = int(cur_height / ratio)
106
+ resized_width = int(cur_width / ratio)
107
+ resized_img = F.interpolate(
108
+ img, size=(resized_height, resized_width), mode="bilinear", align_corners=False
109
+ )
110
+
111
+ pad_height = max(0, int(height - resized_height))
112
+ pad_width = max(0, int(width - resized_width))
113
+
114
+ # pad on left and top of image
115
+ padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
116
+ return padded_img
117
+
118
+
119
+ def pad_vector(vector, new_dim):
120
+ """Can be (batch_size x sequence_length x features_dimension)
121
+ or (batch_size x features_dimension)
122
+ """
123
+ if vector.shape[-1] == new_dim:
124
+ return vector
125
+ shape = list(vector.shape)
126
+ current_dim = shape[-1]
127
+ shape[-1] = new_dim
128
+ new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device)
129
+ new_vector[..., :current_dim] = vector
130
+ return new_vector
131
+
132
+
133
+ def normalize(x, min_val, max_val):
134
+ return (x - min_val) / (max_val - min_val)
135
+
136
+
137
+ def unnormalize(x, min_val, max_val):
138
+ return x * (max_val - min_val) + min_val
139
+
140
+
141
+ def safe_arcsin(value):
142
+ # This ensures that the input stays within
143
+ # [−1,1] to avoid invalid values for arcsin
144
+ return torch.arcsin(torch.clamp(value, -1.0, 1.0))
145
+
146
+
147
+ def aloha_gripper_to_angular(value):
148
+ # Aloha transforms the gripper positions into a linear space. The following code
149
+ # reverses this transformation to be consistent with pi0 which is pretrained in
150
+ # angular space.
151
+ #
152
+ # These values are coming from the Aloha code:
153
+ # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED
154
+ value = unnormalize(value, min_val=0.01844, max_val=0.05800)
155
+
156
+ # This is the inverse of the angular to linear transformation inside the Interbotix code.
157
+ def linear_to_radian(linear_position, arm_length, horn_radius):
158
+ value = (horn_radius**2 + linear_position**2 - arm_length**2) / (
159
+ 2 * horn_radius * linear_position
160
+ )
161
+ return safe_arcsin(value)
162
+
163
+ # The constants are taken from the Interbotix code.
164
+ value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022)
165
+
166
+ # Normalize to [0, 1].
167
+ # The values 0.4 and 1.5 were measured on an actual Trossen robot.
168
+ return normalize(value, min_val=0.4, max_val=1.5)
169
+
170
+
171
+ def aloha_gripper_from_angular(value):
172
+ # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha.
173
+ # Note that the units are still angular but the range is different.
174
+
175
+ # The values 0.4 and 1.5 were measured on an actual Trossen robot.
176
+ value = unnormalize(value, min_val=0.4, max_val=1.5)
177
+
178
+ # These values are coming from the Aloha code:
179
+ # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE
180
+ return normalize(value, min_val=-0.6213, max_val=1.4910)
181
+
182
+
183
+ def aloha_gripper_from_angular_inv(value):
184
+ # Directly inverts the gripper_from_angular function.
185
+ value = unnormalize(value, min_val=-0.6213, max_val=1.4910)
186
+ return normalize(value, min_val=0.4, max_val=1.5)
187
+
188
+
189
+ class HumePolicy(PreTrainedPolicy):
190
+ """Wrapper class around System2 model to train and run inference within LeRobot."""
191
+
192
+ config_class = HumeConfig
193
+ name = "hume"
194
+
195
+ def __init__(
196
+ self,
197
+ config: HumeConfig,
198
+ dataset_stats: dict[str, dict[str, Tensor]] | None = None,
199
+ ):
200
+ super().__init__(config)
201
+ config.validate_features()
202
+ self.config = config
203
+
204
+ # TODO: input / output features / normalizer for mutiple datasets
205
+ self.normalize_inputs = Normalize(
206
+ config.input_features, config.normalization_mapping, dataset_stats
207
+ )
208
+ self.normalize_targets = Normalize(
209
+ config.output_features, config.normalization_mapping, dataset_stats
210
+ )
211
+ self.unnormalize_outputs = Unnormalize(
212
+ config.output_features, config.normalization_mapping, dataset_stats
213
+ )
214
+
215
+ self.language_tokenizer = None
216
+ self.s2_model = System2(config)
217
+ self.s1_model = FastVisuoMatching(config)
218
+ self.value_query_head = ValueQueryHead(
219
+ paligemma_with_expert=self.s2_model.paligemma_with_expert, config=config
220
+ )
221
+ self.reset()
222
+
223
+ self.set_requires_grad()
224
+
225
+ def set_requires_grad(self):
226
+ if self.config.freeze_s2:
227
+ self.s2_model.eval()
228
+ for params in self.s2_model.parameters():
229
+ params.requires_grad = False
230
+
231
+ def train(self, mode: bool = True):
232
+ super().train(mode)
233
+ if self.config.freeze_s2:
234
+ self.s2_model.eval()
235
+
236
+ def reset(self):
237
+ """This should be called whenever the environment is reset."""
238
+ self._action_queue = deque([], maxlen=self.config.n_action_steps)
239
+ self.s2_action_cache = {}
240
+
241
+ def get_trunk_params(self) -> dict:
242
+ exclude_params = set()
243
+ exclude_modules = [
244
+ self.value_query_head.calql.policy,
245
+ self.value_query_head.calql.critics,
246
+ self.value_query_head.calql.temperature,
247
+ ]
248
+
249
+ for module in exclude_modules:
250
+ for param in module.parameters():
251
+ exclude_params.add(id(param))
252
+
253
+ return [param for param in self.parameters() if id(param) not in exclude_params]
254
+
255
+ def get_optim_params(self) -> dict:
256
+ return self.parameters()
257
+
258
+ def get_actor_optim_params(self) -> dict:
259
+ return self.value_query_head.calql.policy.parameters()
260
+
261
+ def get_critics_optim_params(self) -> dict:
262
+ return self.value_query_head.calql.critics.parameters()
263
+
264
+ def get_temperature_optim_params(self) -> dict:
265
+ return self.value_query_head.calql.temperature.parameters()
266
+
267
+ def init_infer(self, infer_cfg: at.InferConfig):
268
+ self.infer_cfg = Namespace(**infer_cfg)
269
+ self.action_plan = collections.deque()
270
+ self.history_state = collections.deque(maxlen=self.config.s1_his_state_size)
271
+ self.infer_step = 0
272
+ self.outputs = {}
273
+ self.q_value_cache = []
274
+ self.action_cache = []
275
+
276
+ self.reset()
277
+ print("Initializing inference with config:", infer_cfg)
278
+
279
+ return True
280
+
281
+ def infer(self, observation: at.InferBatchObs) -> at.ActionArray:
282
+ # prcoess observation
283
+ # from np.array -> torch.tensor -> add batch, change shape
284
+ if not self.history_state:
285
+ self.history_state.extend(
286
+ np.expand_dims(observation["observation.state"], 1)
287
+ .repeat(self.config.s1_his_state_size, axis=1)
288
+ .transpose(1, 0, 2)
289
+ )
290
+ else:
291
+ self.history_state.append(observation["observation.state"])
292
+
293
+ observation["observation.state"] = np.asarray(self.history_state).transpose(
294
+ 1, 0, 2
295
+ )
296
+
297
+ observation: dict[str, torch.tensor | list[str]] = {
298
+ **{
299
+ k: torch.tensor(v / 255) # b, h, w ,c
300
+ .permute(0, 3, 1, 2) # b, c, h, w
301
+ .to(self.infer_cfg.device)
302
+ .float()
303
+ for k, v in observation.items()
304
+ if k
305
+ in {
306
+ "observation.images.image",
307
+ "observation.images.wrist_image",
308
+ "observation.images.image_0",
309
+ }
310
+ },
311
+ **{k: v for k, v in observation.items() if k in {"task"}}, # len = batch
312
+ **{
313
+ k: torch.tensor(v)
314
+ .to(self.infer_cfg.device)
315
+ .float() # b, state_horizon, state_dim
316
+ for k, v in observation.items()
317
+ if k in {"observation.state"}
318
+ },
319
+ }
320
+ batch_size = len(observation["task"])
321
+
322
+ if not self.action_plan:
323
+ # Finished executing previous action chunk -- compute new chunk
324
+ # Prepare observations dict
325
+ # infer the action
326
+ if self.infer_step % self.infer_cfg.s2_replan_steps == 0:
327
+ self.outputs = {} # infer with s1 or s2
328
+ stamp = (
329
+ torch.tensor(
330
+ [
331
+ self.infer_step
332
+ % self.infer_cfg.s2_replan_steps
333
+ / self.config.s2_chunk_size
334
+ ]
335
+ )
336
+ .expand(batch_size)
337
+ .to(self.infer_cfg.device)
338
+ .float()
339
+ )
340
+ self.outputs = self.select_action(
341
+ observation,
342
+ self.outputs,
343
+ stamp,
344
+ s2_candidates_num=self.infer_cfg.s2_candidates_num,
345
+ noise_temp_bounds=(
346
+ self.infer_cfg.noise_temp_lower_bound,
347
+ self.infer_cfg.noise_temp_upper_bound,
348
+ ),
349
+ time_temp_bounds=(
350
+ self.infer_cfg.time_temp_lower_bound,
351
+ self.infer_cfg.time_temp_upper_bound,
352
+ ),
353
+ )
354
+ action_chunk = self.outputs["s1_action"].cpu().numpy()
355
+
356
+ if self.infer_cfg.post_process_action:
357
+ action_chunk[..., -1] = 2 * (1 - action_chunk[..., -1]) - 1
358
+
359
+ # convert action chunk shape to (replan_steps, batch, action_dim)
360
+ action_chunk = action_chunk.transpose(1, 0, 2)
361
+ assert (
362
+ len(action_chunk) >= self.infer_cfg.replan_steps
363
+ ), f"We want to replan every {self.infer_cfg.replan_steps} steps, but policy only predicts {len(action_chunk)} steps."
364
+ self.action_plan.extend(action_chunk[: self.infer_cfg.replan_steps])
365
+
366
+ self.infer_step += 1
367
+ action = self.action_plan.popleft()
368
+ return np.asarray(action)
369
+
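A hedged sketch of how `init_infer` / `infer` might be driven from an evaluation loop. The config values are illustrative rather than tuned defaults, `policy` stands for an already-loaded instance of the policy class defined in this file, and `make_env` plus the observation field names it returns are hypothetical placeholders.

import numpy as np

infer_cfg = {
    "replan_steps": 8,
    "s2_replan_steps": 40,
    "s2_candidates_num": 5,
    "noise_temp_lower_bound": 1.0,
    "noise_temp_upper_bound": 1.0,
    "time_temp_lower_bound": 1.0,
    "time_temp_upper_bound": 1.0,
    "post_process_action": True,
    "device": "cuda",
}
policy.init_infer(infer_cfg)

env = make_env()      # hypothetical environment wrapper
obs = env.reset()
for _ in range(200):
    batch_obs = {
        "observation.images.image": obs["image"][None],              # (1, H, W, C) uint8
        "observation.images.wrist_image": obs["wrist_image"][None],  # (1, H, W, C) uint8
        "observation.state": obs["state"][None].astype(np.float32),  # (1, state_dim)
        "task": ["pick up the cube"],
    }
    action = policy.infer(batch_obs)  # (1, action_dim) action for the current step
    obs = env.step(action[0])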
370
+ @torch.no_grad
371
+ @jaxtyped(typechecker=typechecker)
372
+ def select_action(
373
+ self,
374
+ batch: at.InferBatchObs,
375
+ outputs: at.InferOutput = {},
376
+ stamp: Float[Tensor, " batch"] | None = None,
377
+ s2_candidates_num: int = 5,
378
+ noise_temp_bounds: tuple = (1.0, 1.0),
379
+ time_temp_bounds: tuple = (1.0, 1.0),
380
+ ) -> at.InferOutput:
381
+ """Select a single action given environment observations.
382
+
383
+ Candidate noise-action chunks are sampled from the System-2 model (unless a cached
384
+ `noise_action` is provided in `outputs`), ranked by the value query head, and the selected
385
+ chunk is sliced at `stamp` and refined by the System-1 model into `s1_action`.
386
+ """
387
+ self.eval()
388
+
389
+ if self.config.adapt_to_pi_aloha:
390
+ batch[OBS_ROBOT] = self._pi_aloha_decode_state(batch[OBS_ROBOT])
391
+
392
+ batch = self.normalize_inputs(batch)
393
+
394
+ # Prepare the model inputs for querying the policy.
395
+ images, img_masks = self.prepare_images(batch)
396
+ state = self.prepare_state(batch)
397
+ lang_tokens, lang_masks = self.prepare_language(batch)
398
+
399
+ original_action_dim = self.config.action_feature.shape[0]
400
+
401
+ if "noise_action" not in outputs:
402
+ noise_actions = [] # [(Batch, Chunksize, Action dim),]
403
+ for i in range(s2_candidates_num):
404
+ noise_actions.append(
405
+ self.s2_model.sample_actions(
406
+ images,
407
+ img_masks,
408
+ lang_tokens,
409
+ lang_masks,
410
+ state[:, -1, :], # s2 does not support history states yet
411
+ time_temp=(i / s2_candidates_num)
412
+ * (time_temp_bounds[1] - time_temp_bounds[0])
413
+ + time_temp_bounds[0],
414
+ noise_temp=(i / s2_candidates_num)
415
+ * (noise_temp_bounds[1] - noise_temp_bounds[0])
416
+ + noise_temp_bounds[0],
417
+ )
418
+ )
419
+ noise_actions = torch.stack(noise_actions, dim=1)
420
+ # (Batch, s2_candidates_num, Chunksize, Actiondim)
421
+ batch_size = noise_actions.shape[0]
422
+ batch_idx = torch.arange(batch_size, device=noise_actions.device)
423
+
424
+ noise_actions_wo_pad = noise_actions[
425
+ :, :, : self.config.vqh_chunk_size, :original_action_dim
426
+ ]
427
+ action_index, q_values = self.value_query_head.select_q_actions(
428
+ images, img_masks, lang_tokens, lang_masks, noise_actions_wo_pad
429
+ )
430
+ self.q_value_cache.append(q_values.squeeze())
431
+ unnormalized_noise_actions = self.unnormalize_outputs(
432
+ {"action": noise_actions_wo_pad}
433
+ )["action"]
434
+ self.action_cache.append(unnormalized_noise_actions.squeeze())
435
+ selected_noise_action = noise_actions[batch_idx, action_index]
436
+
437
+ outputs = {"noise_action": selected_noise_action}
438
+
439
+ noise_action: Float[Tensor, "batch s2_chunksize action_dim"] = outputs[
440
+ "noise_action"
441
+ ]
442
+ idcs = (stamp * self.config.s2_chunk_size).long().unsqueeze(1) + torch.arange(
443
+ self.config.s1_chunk_size, device=noise_action.device
444
+ )
445
+ batch_idcs = torch.arange(
446
+ noise_action.shape[0], device=noise_action.device
447
+ ).unsqueeze(1)
448
+ noise_action_slides = noise_action[batch_idcs, idcs]
449
+ s1_actions = self.s1_model.sample_actions(
450
+ images, img_masks, state, noise_action_slides, stamp=stamp
451
+ )
452
+
453
+ # Unpad actions
454
+ actions = s1_actions[:, :, :original_action_dim]
455
+ actions = self.unnormalize_outputs({"action": actions})["action"]
456
+
457
+ if self.config.adapt_to_pi_aloha:
458
+ actions = self._pi_aloha_encode_actions_inv(actions)
459
+
460
+ outputs["s1_action"] = actions
461
+
462
+ return outputs
463
+
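The candidate sweep in `select_action` above spaces the noise and time temperatures linearly between the configured bounds. A minimal sketch with illustrative (non-default) numbers:

n, lo, hi = 5, 1.0, 1.4  # s2_candidates_num and example temperature bounds
temps = [(i / n) * (hi - lo) + lo for i in range(n)]
# approximately [1.0, 1.08, 1.16, 1.24, 1.32]; the upper bound itself is never reached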
464
+ def post_normalize(self, batch):
465
+ """additional keys {obervation.x}.s1 are merged in to the batch,
466
+ so these keys also need to be normalized.
467
+ """
468
+ merge_keys = filter(lambda k: k.endswith(".s1"), batch.keys())
469
+ for k in merge_keys:
470
+ _k = k.replace(".s1", "")
471
+ batch[k] = self.normalize_inputs({_k: batch[k]})[_k]
472
+ return batch
473
+
474
+ def get_noise_action_slides(self, action: Tensor, stamp: Tensor) -> Tensor:
475
+ """Augment the action with the previous actions in the queue."""
476
+ # idcs = (torch.rand_like(stamp) * (self.config.s2_chunk_size - self.config.s1_chunk_size)).long()
477
+ idcs = (
478
+ (
479
+ self.config.noise_slides_alp * torch.rand_like(stamp)
480
+ - self.config.noise_slides_alp / 2
481
+ + stamp
482
+ )
483
+ * self.config.s2_chunk_size
484
+ ).long()
485
+ idcs = torch.clamp(idcs, 0, action.shape[1] - self.config.s1_chunk_size)
486
+ idcs = idcs + torch.arange(self.config.s1_chunk_size, device=action.device)
487
+ batch_idcs = torch.arange(action.shape[0], device=action.device).unsqueeze(1)
488
+ noise_action_slides = action[batch_idcs, idcs]
489
+
490
+ noise_action_slides += (
491
+ torch.randn_like(noise_action_slides) * self.config.noise_slides_eps
492
+ )
493
+ return noise_action_slides
494
+
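A hedged toy sketch of the window indexing used in `get_noise_action_slides`: a per-sample start index around `stamp * s2_chunk_size` is jittered, clamped, and expanded into a contiguous window of `s1_chunk_size` steps. The sizes below are illustrative.

import torch

s2_chunk, s1_chunk = 16, 4
action = torch.randn(2, s2_chunk, 3)                # (B, s2_chunk_size, action_dim)
start = torch.tensor([[2], [13]])                   # (B, 1) jittered start indices
start = torch.clamp(start, 0, s2_chunk - s1_chunk)  # -> [[2], [12]]
idcs = start + torch.arange(s1_chunk)               # (B, s1_chunk) window indices
batch_idcs = torch.arange(2).unsqueeze(1)           # (B, 1)
window = action[batch_idcs, idcs]                   # (B, s1_chunk, action_dim)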
495
+ def forward(
496
+ self, batch: dict[str, Tensor], noise=None, time=None
497
+ ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, dict[str, Tensor]]:
498
+ """Do a full training forward pass to compute the loss"""
499
+ if self.config.adapt_to_pi_aloha:
500
+ batch[OBS_ROBOT] = self._pi_aloha_decode_state(batch[OBS_ROBOT])
501
+ batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
502
+
503
+ batch = self.normalize_inputs(batch)
504
+ batch = self.post_normalize(batch)
505
+ batch = self.normalize_targets(batch)
506
+
507
+ # prepare images
508
+ images, img_masks = self.prepare_images(batch)
509
+ state = self.prepare_state(batch)
510
+ lang_tokens, lang_masks = self.prepare_language(batch)
511
+
512
+ s1_images, s1_img_masks = self.prepare_images(
513
+ batch, map(lambda x: f"{x}.s1", self.config.image_features)
514
+ ) # 0
515
+ s1_state = self.prepare_state(batch, f"{OBS_ROBOT}.s1")
516
+
517
+ # prepare actions
518
+ actions = self.prepare_action(batch)
519
+ actions_is_pad = batch.get("action_is_pad")
520
+
521
+ b, s, _ = actions.shape
522
+ device = actions.device
523
+ batch_idcs = torch.arange(b, device=device).unsqueeze(1)
524
+ stamp = batch["stamp"]
525
+ idcs = (stamp * self.config.s2_chunk_size).long() + torch.arange(
526
+ self.config.s1_chunk_size, device=device
527
+ )
528
+ s1_actions = actions[batch_idcs, idcs]
529
+ s1_actions_is_pad = (
530
+ None if actions_is_pad is None else actions_is_pad[batch_idcs, idcs]
531
+ )
532
+
533
+ # s2 forward pass
534
+ with torch.no_grad():
535
+ if self.config.cache_s2_actions:
536
+ is_noised = []
537
+ noise_actions = torch.zeros_like(actions)
538
+ for idx, s2_idx in enumerate(batch["s2_idx"]):
539
+ if s2_idx in self.s2_action_cache:
540
+ noise_actions[idx] = self.s2_action_cache[s2_idx]
541
+ is_noised.append(False)
542
+ else:
543
+ is_noised.append(True)
544
+ # noise batch
545
+ is_noised = torch.tensor(is_noised, device=batch["s2_idx"].device)
546
+ s2_actions_infered = self.s2_model.sample_actions(
547
+ [img[is_noised] for img in images],
548
+ [mask[is_noised] for mask in img_masks],
549
+ lang_tokens[is_noised],
550
+ lang_masks[is_noised],
551
+ state[is_noised],
552
+ )
553
+ noise_actions[is_noised] = s2_actions_infered
554
+ else:
555
+ noise_actions = self.s2_model.sample_actions(
556
+ images,
557
+ img_masks,
558
+ lang_tokens,
559
+ lang_masks,
560
+ state,
561
+ )
562
+
563
+ # vgps: embs[q] -> layers -> [q] -> mlp
564
+ # value query head feature keys end with .vqh: xx.vqh
565
+ vqh_images, vqh_img_masks = self.prepare_images(
566
+ batch, map(lambda x: f"{x}.vqh", self.config.image_features)
567
+ ) # 1
568
+
569
+ temperature_loss, policy_loss, critic_loss, log_dict = (
570
+ self.value_query_head.forward(
571
+ images,
572
+ img_masks,
573
+ lang_tokens,
574
+ lang_masks,
575
+ vqh_images,
576
+ vqh_img_masks,
577
+ batch["action"][:, : self.config.vqh_chunk_size, :],
578
+ batch["reward.vqh"],
579
+ batch["mc.vqh"],
580
+ batch["reward.vqh"].to(dtype=torch.float),
581
+ )
582
+ )
583
+
584
+ noise_action_slides = self.get_noise_action_slides(noise_actions, stamp)
585
+ s1_losses = self.s1_model.forward(
586
+ s1_images,
587
+ s1_img_masks,
588
+ s1_state,
589
+ s1_actions,
590
+ noise_action_slides,
591
+ time,
592
+ stamp=stamp.squeeze(),
593
+ )
594
+
595
+ total_loss, loss_dict = 0.0, {}
596
+
597
+ if s1_actions_is_pad is not None:
598
+ in_episode_bound = ~s1_actions_is_pad
599
+ s1_losses = s1_losses * in_episode_bound.unsqueeze(-1)
600
+
601
+ s1_losses = s1_losses[..., : self.config.max_action_dim]
602
+ s1_losses = s1_losses.mean()
603
+
604
+ loss_dict["s1_loss"] = s1_losses.item()
605
+ total_loss += s1_losses
606
+
607
+ # add ValueQueryHead log dict to loss_dict
608
+ # loss_dict = {**loss_dict, **log_dict}
609
+ loss_dict["entropy"] = log_dict["entropy"].item()
610
+ loss_dict["actions_mse"] = log_dict["actions_mse"].item()
611
+ loss_dict["td_err"] = log_dict["td_err"].item()
612
+ loss_dict["temperature"] = log_dict["temperature"].item()
613
+ loss_dict["cql_loss"] = log_dict["cql_loss"].item()
614
+ loss_dict["cql_alpha"] = log_dict["cql_alpha"]
615
+ loss_dict["cql_diff"] = log_dict["cql_diff"].item()
616
+ loss_dict["critic_loss"] = log_dict["critic_loss"].item()
617
+ loss_dict["cql_ood_values"] = log_dict["cql_ood_values"].item()
618
+ loss_dict["calql_bound_rate"] = log_dict["calql_bound_rate"].item()
619
+ loss_dict["online_q"] = log_dict["online_q"].item()
620
+ loss_dict["target_q"] = log_dict["target_q"].item()
621
+ loss_dict["positive_qs"] = log_dict["positive_qs"].item()
622
+ loss_dict["actor_loss"] = log_dict["actor_loss"].item()
623
+
624
+ return total_loss, temperature_loss, policy_loss, critic_loss, loss_dict
625
+
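A hedged sketch of how the loss terms returned by `forward` might be stepped with separate optimizers, mirroring `get_critics_optim_params` / `get_temperature_optim_params` above. The actual training script is not part of this file, so the learning rates, the actor parameter group, and the `policy` / `batch` objects are assumptions.

import torch

critic_optim = torch.optim.AdamW(policy.get_critics_optim_params(), lr=3e-4)
temp_optim = torch.optim.AdamW(policy.get_temperature_optim_params(), lr=3e-4)
# In practice the actor group would exclude the critic / temperature parameters.
actor_optim = torch.optim.AdamW(
    [p for p in policy.parameters() if p.requires_grad], lr=2.5e-5
)

total_loss, temperature_loss, policy_loss, critic_loss, loss_dict = policy.forward(batch)

# Accumulate all gradients before stepping so the shared graph is reused safely.
for opt in (actor_optim, critic_optim, temp_optim):
    opt.zero_grad()
total_loss.backward(retain_graph=True)    # s1 flow-matching loss
policy_loss.backward(retain_graph=True)   # Cal-QL actor loss
critic_loss.backward(retain_graph=True)
temperature_loss.backward()
for opt in (actor_optim, critic_optim, temp_optim):
    opt.step()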
626
+ def prepare_images(self, batch, image_features=None):
627
+ """Apply preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and
628
+ converting the pixel range from [0.0, 1.0] to [-1.0, 1.0] as expected by SigLIP.
629
+ """
630
+ images = []
631
+ img_masks = []
632
+
633
+ image_features = image_features or self.config.image_features
634
+ present_img_keys = [key for key in image_features if key in batch]
635
+ missing_img_keys = [key for key in image_features if key not in batch]
636
+
637
+ if len(present_img_keys) == 0:
638
+ raise ValueError(
639
+ f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})"
640
+ )
641
+
642
+ # Preprocess image features present in the batch
643
+ for key in present_img_keys:
644
+ img = batch[key]
645
+
646
+ if self.config.resize_imgs_with_padding is not None:
647
+ img = resize_with_pad(
648
+ img, *self.config.resize_imgs_with_padding, pad_value=0
649
+ )
650
+
651
+ # Normalize from range [0,1] to [-1,1] as expected by SigLIP
652
+ img = img * 2.0 - 1.0
653
+
654
+ bsize = img.shape[0]
655
+ device = img.device
656
+ mask = torch.ones(bsize, dtype=torch.bool, device=device)
657
+ images.append(img)
658
+ img_masks.append(mask)
659
+
660
+ # Create image features not present in the batch
661
+ # as black images filled with -1 (matching the normalized range).
662
+ for num_empty_cameras in range(len(missing_img_keys)):
663
+ if num_empty_cameras >= self.config.empty_cameras:
664
+ break
665
+ img = torch.ones_like(img) * -1
666
+ mask = torch.zeros_like(mask)
667
+ images.append(img)
668
+ img_masks.append(mask)
669
+
670
+ return images, img_masks
671
+
672
+ def prepare_language(self, batch) -> tuple[Tensor, Tensor]:
673
+ """Tokenize the text input"""
674
+ device = batch[OBS_ROBOT].device
675
+ tasks = batch["task"]
676
+
677
+ # PaliGemma prompt has to end with a new line
678
+ tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
679
+
680
+ tokenized_prompt = self.language_tokenizer.__call__(
681
+ tasks,
682
+ padding="max_length",
683
+ padding_side="right",
684
+ max_length=self.config.tokenizer_max_length,
685
+ return_tensors="pt",
686
+ truncation=True,
687
+ )
688
+ lang_tokens = tokenized_prompt["input_ids"].to(device=device)
689
+ lang_masks = tokenized_prompt["attention_mask"].to(
690
+ device=device, dtype=torch.bool
691
+ )
692
+
693
+ return lang_tokens, lang_masks
694
+
695
+ def _pi_aloha_decode_state(self, state):
696
+ # Flip the joints.
697
+ for motor_idx in [1, 2, 8, 9]:
698
+ state[:, motor_idx] *= -1
699
+ # Reverse the gripper transformation that is being applied by the Aloha runtime.
700
+ for motor_idx in [6, 13]:
701
+ state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx])
702
+ return state
703
+
704
+ def _pi_aloha_encode_actions(self, actions):
705
+ # Flip the joints.
706
+ for motor_idx in [1, 2, 8, 9]:
707
+ actions[:, :, motor_idx] *= -1
708
+ # Reverse the gripper transformation that is being applied by the Aloha runtime.
709
+ for motor_idx in [6, 13]:
710
+ actions[:, :, motor_idx] = aloha_gripper_from_angular(
711
+ actions[:, :, motor_idx]
712
+ )
713
+ return actions
714
+
715
+ def _pi_aloha_encode_actions_inv(self, actions):
716
+ # Flip the joints again.
717
+ for motor_idx in [1, 2, 8, 9]:
718
+ actions[:, :, motor_idx] *= -1
719
+ # Reverse the gripper transformation that is being applied by the Aloha runtime.
720
+ for motor_idx in [6, 13]:
721
+ actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(
722
+ actions[:, :, motor_idx]
723
+ )
724
+ return actions
725
+
726
+ def prepare_state(self, batch, feature=None):
727
+ """Pad state"""
728
+ feature = feature or OBS_ROBOT
729
+ state = pad_vector(batch[feature], self.config.max_state_dim)
730
+ return state
731
+
732
+ def prepare_action(self, batch):
733
+ """Pad action"""
734
+ actions = pad_vector(batch[ACTION], self.config.max_action_dim)
735
+ return actions
736
+
737
+ def _save_pretrained(self, save_directory) -> None:
738
+ super()._save_pretrained(save_directory)
739
+ print(f"Saving the language tokenizer to {save_directory} ...")
740
+ self.language_tokenizer.save_pretrained(save_directory)
741
+
742
+ import shutil
743
+
744
+ files = [
745
+ "src/hume/models/array_typing.py",
746
+ "src/hume/models/configuration_hume.py",
747
+ "src/hume/models/fast_visuo_expert.py",
748
+ "src/hume/models/modeling_hume.py",
749
+ "src/hume/models/paligemma_with_expert.py",
750
+ "src/hume/models/value_query.py",
751
+ ]
752
+ try:
753
+ for file in files:
754
+ shutil.copy(file, save_directory)
755
+ except Exception:
756
+ print("Failed to copy files to save_directory")
757
+
758
+ @classmethod
759
+ def from_pretrained(
760
+ cls,
761
+ pretrained_name_or_path,
762
+ **kwargs,
763
+ ):
764
+ policy = super().from_pretrained(pretrained_name_or_path, **kwargs)
765
+ print(f"Loading the language tokenizer from {pretrained_name_or_path} ...")
766
+ policy.language_tokenizer = AutoTokenizer.from_pretrained(
767
+ pretrained_name_or_path
768
+ )
769
+ return policy
770
+
771
+
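A hedged sketch of restoring a checkpoint with the `from_pretrained` override above, which re-attaches the language tokenizer from the same path. The path is a placeholder, and `HumePolicy` is assumed to be the name of the policy class defined in this file.

policy = HumePolicy.from_pretrained("path/to/checkpoint")  # local dir or Hub repo id
policy = policy.to("cuda").eval()
assert policy.language_tokenizer is not None  # tokenizer restored alongside the weights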
772
+ class System2Policy(PreTrainedPolicy):
773
+ """Wrapper class around System2FlowMatching model to train and run inference within LeRobot."""
774
+
775
+ config_class = System2Config
776
+ name = "system2"
777
+
778
+ def __init__(
779
+ self,
780
+ config: System2Config,
781
+ dataset_stats: dict[str, dict[str, Tensor]] | None = None,
782
+ ):
783
+ """
784
+ Args:
785
+ config: Policy configuration class instance or None, in which case the default instantiation of
786
+ the configuration class is used.
787
+ dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
788
+ that they will be passed with a call to `load_state_dict` before the policy is used.
789
+ """
790
+
791
+ super().__init__(config)
792
+ config.validate_features()
793
+ self.config = config
794
+
795
+ # TODO: input / output features / normalizer for multiple datasets
796
+ self.normalize_inputs = Normalize(
797
+ config.input_features, config.normalization_mapping, dataset_stats
798
+ )
799
+ self.normalize_targets = Normalize(
800
+ config.output_features, config.normalization_mapping, dataset_stats
801
+ )
802
+ self.unnormalize_outputs = Unnormalize(
803
+ config.output_features, config.normalization_mapping, dataset_stats
804
+ )
805
+
806
+ self.language_tokenizer = None
807
+ self.model = System2(config)
808
+
809
+ self.reset()
810
+
811
+ def reset(self):
812
+ """This should be called whenever the environment is reset."""
813
+ self._action_queue = deque([], maxlen=self.config.n_action_steps)
814
+
815
+ def get_optim_params(self) -> dict:
816
+ return self.parameters()
817
+
818
+ @torch.no_grad
819
+ def select_action(
820
+ self, batch: dict[str, Tensor], noise: Tensor | None = None
821
+ ) -> Tensor:
822
+ """Select a single action given environment observations.
823
+
824
+ Observations are normalized and encoded, the flow-matching model samples a full action
825
+ chunk, and the actions are unpadded and unnormalized before being returned; the whole
826
+ chunk is returned at once rather than queued one step at a time.
827
+ """
828
+ self.eval()
829
+
830
+ if self.config.adapt_to_pi_aloha:
831
+ batch[OBS_ROBOT] = self._pi_aloha_decode_state(batch[OBS_ROBOT])
832
+
833
+ batch = self.normalize_inputs(batch)
834
+
835
+ # Prepare the model inputs and sample a full action chunk by
836
+ # querying the policy.
837
+ images, img_masks = self.prepare_images(batch)
838
+ state = self.prepare_state(batch)
839
+ lang_tokens, lang_masks = self.prepare_language(batch)
840
+
841
+ actions = self.model.sample_actions(
842
+ images, img_masks, lang_tokens, lang_masks, state, noise=noise
843
+ )
844
+
845
+ # Unpad actions
846
+ original_action_dim = self.config.action_feature.shape[0]
847
+ actions = actions[:, :, :original_action_dim]
848
+
849
+ actions = self.unnormalize_outputs({"action": actions})["action"]
850
+
851
+ if self.config.adapt_to_pi_aloha:
852
+ actions = self._pi_aloha_encode_actions(actions)
853
+ return actions
854
+
855
+ def forward(
856
+ self, batch: dict[str, Tensor], noise=None, time=None
857
+ ) -> tuple[Tensor, dict[str, Tensor]]:
858
+ """Do a full training forward pass to compute the loss"""
859
+ if self.config.adapt_to_pi_aloha:
860
+ batch[OBS_ROBOT] = self._pi_aloha_decode_state(batch[OBS_ROBOT])
861
+ batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
862
+
863
+ batch = self.normalize_inputs(batch)
864
+ batch = self.normalize_targets(batch)
865
+
866
+ images, img_masks = self.prepare_images(batch)
867
+ state = self.prepare_state(batch)
868
+ lang_tokens, lang_masks = self.prepare_language(batch)
869
+ actions = self.prepare_action(batch)
870
+ actions_is_pad = batch.get("action_is_pad")
871
+
872
+ loss_dict = {}
873
+ losses, _ = self.model.forward(
874
+ images, img_masks, lang_tokens, lang_masks, state, actions, noise, time
875
+ )
876
+ # loss_dict["losses_after_forward"] = losses.detach().mean().item()
877
+
878
+ if actions_is_pad is not None:
879
+ in_episode_bound = ~actions_is_pad
880
+ losses = losses * in_episode_bound.unsqueeze(-1)
881
+ # loss_dict["losses_after_in_ep_bound"] = losses.detach().mean().item()
882
+
883
+ # Remove padding
884
+ losses = losses[:, :, : self.config.max_action_dim]
885
+ # loss_dict["losses_after_rm_padding"] = losses.detach().mean().item()
886
+
887
+ # For backward pass
888
+ loss = losses.mean()
889
+ # For logging
890
+ loss_dict["l2_loss"] = loss.item()
891
+
892
+ return loss, loss_dict
893
+
894
+ def prepare_images(self, batch):
895
+ """Apply preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and
896
+ converting the pixel range from [0.0, 1.0] to [-1.0, 1.0] as expected by SigLIP.
897
+ """
898
+ images = []
899
+ img_masks = []
900
+
901
+ present_img_keys = [key for key in self.config.image_features if key in batch]
902
+ missing_img_keys = [
903
+ key for key in self.config.image_features if key not in batch
904
+ ]
905
+
906
+ if len(present_img_keys) == 0:
907
+ raise ValueError(
908
+ f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})"
909
+ )
910
+
911
+ # Preprocess image features present in the batch
912
+ for key in present_img_keys:
913
+ img = batch[key]
914
+
915
+ if self.config.resize_imgs_with_padding is not None:
916
+ img = resize_with_pad(
917
+ img, *self.config.resize_imgs_with_padding, pad_value=0
918
+ )
919
+
920
+ # Normalize from range [0,1] to [-1,1] as expected by SigLIP
921
+ img = img * 2.0 - 1.0
922
+
923
+ bsize = img.shape[0]
924
+ device = img.device
925
+ mask = torch.ones(bsize, dtype=torch.bool, device=device)
926
+ images.append(img)
927
+ img_masks.append(mask)
928
+
929
+ # Create image features not present in the batch
930
+ # as black images filled with -1 (matching the normalized range).
931
+ for num_empty_cameras in range(len(missing_img_keys)):
932
+ if num_empty_cameras >= self.config.empty_cameras:
933
+ break
934
+ img = torch.ones_like(img) * -1
935
+ mask = torch.zeros_like(mask)
936
+ images.append(img)
937
+ img_masks.append(mask)
938
+
939
+ return images, img_masks
940
+
941
+ def prepare_language(self, batch) -> tuple[Tensor, Tensor]:
942
+ """Tokenize the text input"""
943
+ device = batch[OBS_ROBOT].device
944
+ tasks = batch["task"]
945
+
946
+ # PaliGemma prompt has to end with a new line
947
+ tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
948
+
949
+ tokenized_prompt = self.language_tokenizer.__call__(
950
+ tasks,
951
+ padding="max_length",
952
+ padding_side="right",
953
+ max_length=self.config.tokenizer_max_length,
954
+ return_tensors="pt",
955
+ truncation=True,
956
+ )
957
+ lang_tokens = tokenized_prompt["input_ids"].to(device=device)
958
+ lang_masks = tokenized_prompt["attention_mask"].to(
959
+ device=device, dtype=torch.bool
960
+ )
961
+
962
+ return lang_tokens, lang_masks
963
+
964
+ def _pi_aloha_decode_state(self, state):
965
+ # Flip the joints.
966
+ for motor_idx in [1, 2, 8, 9]:
967
+ state[:, motor_idx] *= -1
968
+ # Reverse the gripper transformation that is being applied by the Aloha runtime.
969
+ for motor_idx in [6, 13]:
970
+ state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx])
971
+ return state
972
+
973
+ def _pi_aloha_encode_actions(self, actions):
974
+ # Flip the joints.
975
+ for motor_idx in [1, 2, 8, 9]:
976
+ actions[:, :, motor_idx] *= -1
977
+ # Reverse the gripper transformation that is being applied by the Aloha runtime.
978
+ for motor_idx in [6, 13]:
979
+ actions[:, :, motor_idx] = aloha_gripper_from_angular(
980
+ actions[:, :, motor_idx]
981
+ )
982
+ return actions
983
+
984
+ def _pi_aloha_encode_actions_inv(self, actions):
985
+ # Flip the joints again.
986
+ for motor_idx in [1, 2, 8, 9]:
987
+ actions[:, :, motor_idx] *= -1
988
+ # Reverse the gripper transformation that is being applied by the Aloha runtime.
989
+ for motor_idx in [6, 13]:
990
+ actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(
991
+ actions[:, :, motor_idx]
992
+ )
993
+ return actions
994
+
995
+ def prepare_state(self, batch):
996
+ """Pad state"""
997
+ state = pad_vector(batch[OBS_ROBOT], self.config.max_state_dim)
998
+ return state
999
+
1000
+ def prepare_action(self, batch):
1001
+ """Pad action"""
1002
+ actions = pad_vector(batch[ACTION], self.config.max_action_dim)
1003
+ return actions
1004
+
1005
+ def _save_pretrained(self, save_directory) -> None:
1006
+ super()._save_pretrained(save_directory)
1007
+ print(f"Saving the language tokenizer to {save_directory} ...")
1008
+ self.language_tokenizer.save_pretrained(save_directory)
1009
+
1010
+ import shutil
1011
+
1012
+ files = [
1013
+ "src/hume/models/array_typing.py",
1014
+ "src/hume/models/configuration_hume.py",
1015
+ "src/hume/models/fast_visuo_expert.py",
1016
+ "src/hume/models/modeling_hume.py",
1017
+ "src/hume/models/paligemma_with_expert.py",
1018
+ "src/hume/models/value_query.py",
1019
+ ]
1020
+ try:
1021
+ for file in files:
1022
+ shutil.copy(file, save_directory)
1023
+ except Exception:
1024
+ print("Failed to copy files to save_directory")
1025
+
1026
+ @classmethod
1027
+ def from_pretrained(
1028
+ cls,
1029
+ pretrained_name_or_path,
1030
+ **kwargs,
1031
+ ):
1032
+ policy = super().from_pretrained(pretrained_name_or_path, **kwargs)
1033
+ print(f"Loading the language tokenizer from {pretrained_name_or_path} ...")
1034
+ policy.language_tokenizer = AutoTokenizer.from_pretrained(
1035
+ pretrained_name_or_path
1036
+ )
1037
+ return policy
1038
+
1039
+
1040
+ class System2(nn.Module):
1041
+ def __init__(self, config):
1042
+ super().__init__()
1043
+ self.config = config
1044
+
1045
+ paligemma_with_expert_config = PaliGemmaWithExpertConfig(
1046
+ freeze_vision_encoder=self.config.freeze_vision_encoder,
1047
+ train_expert_only=self.config.train_expert_only,
1048
+ attention_implementation=self.config.attention_implementation,
1049
+ paligemma_config=self.config.paligemma_config,
1050
+ gemma_expert_config=self.config.gemma_expert_config,
1051
+ )
1052
+ self.paligemma_with_expert = PaliGemmaWithExpertModel(
1053
+ paligemma_with_expert_config
1054
+ )
1055
+
1056
+ # Projections are float32
1057
+ self.state_proj = nn.Linear(self.config.max_state_dim, self.config.proj_width)
1058
+ self.action_in_proj = nn.Linear(
1059
+ self.config.max_action_dim, self.config.proj_width
1060
+ )
1061
+ self.action_out_proj = nn.Linear(
1062
+ self.config.proj_width, self.config.max_action_dim
1063
+ )
1064
+
1065
+ self.action_time_mlp_in = nn.Linear(
1066
+ self.config.proj_width * 2, self.config.proj_width
1067
+ )
1068
+ self.action_time_mlp_out = nn.Linear(
1069
+ self.config.proj_width, self.config.proj_width
1070
+ )
1071
+
1072
+ self.set_requires_grad()
1073
+
1074
+ def set_requires_grad(self):
1075
+ for params in self.state_proj.parameters():
1076
+ params.requires_grad = self.config.train_state_proj
1077
+
1078
+ def sample_noise(self, shape, device):
1079
+ noise = torch.normal(
1080
+ mean=0.0,
1081
+ std=1.0,
1082
+ size=shape,
1083
+ dtype=torch.float32,
1084
+ device=device,
1085
+ )
1086
+ return noise
1087
+
1088
+ def sample_time(self, bsize, device):
1089
+ time_beta = sample_beta(1.5, 1.0, bsize, device)
1090
+ time = time_beta * 0.999 + 0.001
1091
+ return time.to(dtype=torch.float32, device=device)
1092
+
1093
+ def embed_prefix(
1094
+ self, images, img_masks, lang_tokens, lang_masks
1095
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1096
+ """Embed images with SigLIP and language tokens with embedding layer to prepare
1097
+ for PaliGemma transformer processing.
1098
+ """
1099
+ # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty
1100
+ embs = []
1101
+ pad_masks = []
1102
+ att_masks = []
1103
+
1104
+ # TODO: remove for loop
1105
+ for (
1106
+ img,
1107
+ img_mask,
1108
+ ) in zip(images, img_masks, strict=False):
1109
+ img_emb = self.paligemma_with_expert.embed_image(img)
1110
+ img_emb = img_emb.to(dtype=torch.bfloat16)
1111
+
1112
+ # Normalize image embeddings
1113
+ img_emb_dim = img_emb.shape[-1]
1114
+ img_emb = img_emb * torch.tensor(
1115
+ img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device
1116
+ )
1117
+
1118
+ bsize, num_img_embs = img_emb.shape[:2]
1119
+ img_mask = img_mask[:, None].expand(bsize, num_img_embs)
1120
+
1121
+ embs.append(img_emb)
1122
+ pad_masks.append(img_mask)
1123
+
1124
+ # Create attention masks so that image tokens attend to each other
1125
+ att_masks += [0] * num_img_embs
1126
+
1127
+ lang_emb = self.paligemma_with_expert.embed_language_tokens(lang_tokens)
1128
+
1129
+ # Normalize language embeddings
1130
+ lang_emb_dim = lang_emb.shape[-1]
1131
+ lang_emb = lang_emb * math.sqrt(lang_emb_dim)
1132
+
1133
+ embs.append(lang_emb)
1134
+ pad_masks.append(lang_masks)
1135
+
1136
+ # full attention between image and language inputs
1137
+ num_lang_embs = lang_emb.shape[1]
1138
+ att_masks += [0] * num_lang_embs
1139
+
1140
+ embs = torch.cat(embs, dim=1)
1141
+ pad_masks = torch.cat(pad_masks, dim=1)
1142
+ att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
1143
+ att_masks = att_masks[None, :].expand(bsize, len(att_masks))
1144
+
1145
+ return embs, pad_masks, att_masks
1146
+
1147
+ def embed_suffix(self, state, noisy_actions, timestep):
1148
+ """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing."""
1149
+ embs = []
1150
+ pad_masks = []
1151
+ att_masks = []
1152
+
1153
+ # Embed state
1154
+ state_emb = self.state_proj(state)
1155
+ state_emb = state_emb.to(dtype=torch.bfloat16)
1156
+ embs.append(state_emb[:, None, :])
1157
+ bsize = state_emb.shape[0]
1158
+ dtype = state_emb.dtype
1159
+ device = state_emb.device
1160
+
1161
+ state_mask = torch.ones(bsize, 1, dtype=torch.bool, device=device)
1162
+ pad_masks.append(state_mask)
1163
+
1164
+ # Set attention masks so that image and language inputs do not attend to state or actions
1165
+ att_masks += [1]
1166
+
1167
+ # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
1168
+ time_emb = create_sinusoidal_pos_embedding(
1169
+ timestep,
1170
+ self.config.proj_width,
1171
+ min_period=4e-3,
1172
+ max_period=4.0,
1173
+ device=device,
1174
+ )
1175
+ time_emb = time_emb.type(dtype=dtype)
1176
+
1177
+ # Fuse timestep + action information using an MLP
1178
+ action_emb = self.action_in_proj(noisy_actions)
1179
+
1180
+ time_emb = time_emb[:, None, :].expand_as(action_emb)
1181
+ action_time_emb = torch.cat([action_emb, time_emb], dim=2)
1182
+
1183
+ action_time_emb = self.action_time_mlp_in(action_time_emb)
1184
+ action_time_emb = F.silu(action_time_emb) # swish == silu
1185
+ action_time_emb = self.action_time_mlp_out(action_time_emb)
1186
+
1187
+ # Add to input tokens
1188
+ embs.append(action_time_emb)
1189
+
1190
+ bsize, action_time_dim = action_time_emb.shape[:2]
1191
+ action_time_mask = torch.ones(
1192
+ bsize, action_time_dim, dtype=torch.bool, device=device
1193
+ )
1194
+ pad_masks.append(action_time_mask)
1195
+
1196
+ # Set attention masks so that image, language and state inputs do not attend to action tokens
1197
+ att_masks += [1] + ([0] * (self.config.n_action_steps - 1))
1198
+
1199
+ embs = torch.cat(embs, dim=1)
1200
+ pad_masks = torch.cat(pad_masks, dim=1)
1201
+ att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
1202
+ att_masks = att_masks[None, :].expand(bsize, len(att_masks))
1203
+
1204
+ return embs, pad_masks, att_masks
1205
+
1206
+ def forward(
1207
+ self,
1208
+ images,
1209
+ img_masks,
1210
+ lang_tokens,
1211
+ lang_masks,
1212
+ state,
1213
+ actions,
1214
+ noise=None,
1215
+ time=None,
1216
+ ) -> Tensor:
1217
+ """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
1218
+ if noise is None:
1219
+ noise = self.sample_noise(actions.shape, actions.device)
1220
+
1221
+ if time is None:
1222
+ time = self.sample_time(actions.shape[0], actions.device)
1223
+ time_expanded = time[:, None, None]
1224
+ x_t = time_expanded * noise + (1 - time_expanded) * actions
1225
+ u_t = noise - actions
1226
+
1227
+ prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
1228
+ images, img_masks, lang_tokens, lang_masks
1229
+ )
1230
+ suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(
1231
+ state, x_t, time
1232
+ )
1233
+
1234
+ pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
1235
+ att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
1236
+
1237
+ att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
1238
+ position_ids = torch.cumsum(pad_masks, dim=1) - 1
1239
+
1240
+ (_, suffix_out), past_key_values = self.paligemma_with_expert.forward(
1241
+ attention_mask=att_2d_masks,
1242
+ position_ids=position_ids,
1243
+ past_key_values=None,
1244
+ inputs_embeds=[prefix_embs, suffix_embs],
1245
+ use_cache=True,
1246
+ fill_kv_cache=True,
1247
+ )
1248
+ suffix_out = suffix_out[:, -self.config.n_action_steps :]
1249
+ # Original openpi code, upcast attention output
1250
+ suffix_out = suffix_out.to(dtype=torch.float32)
1251
+ v_t = self.action_out_proj(suffix_out)
1252
+
1253
+ losses = F.mse_loss(u_t, v_t, reduction="none")
1254
+
1255
+ return losses, past_key_values
1256
+
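A hedged toy sketch of the flow-matching target built in `forward` above: actions are blended with Gaussian noise at a sampled time t, and the network regresses the constant velocity (noise - actions) along that straight-line path.

import torch

actions = torch.randn(2, 50, 32)       # (batch, n_action_steps, max_action_dim)
noise = torch.randn_like(actions)
time = torch.rand(2)                   # stand-in for the beta-distributed time sample
t = time[:, None, None]
x_t = t * noise + (1 - t) * actions    # interpolant fed to the expert
u_t = noise - actions                  # regression target for the predicted v_t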
1257
+ def sample_actions(
1258
+ self,
1259
+ images,
1260
+ img_masks,
1261
+ lang_tokens,
1262
+ lang_masks,
1263
+ state,
1264
+ noise=None,
1265
+ past_key_values=None,
1266
+ time_temp=1.0,
1267
+ noise_temp=1.0,
1268
+ ) -> Tensor:
1269
+ """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)"""
1270
+ bsize = state.shape[0]
1271
+ device = state.device
1272
+
1273
+ if noise is None:
1274
+ actions_shape = (
1275
+ bsize,
1276
+ self.config.n_action_steps,
1277
+ self.config.max_action_dim,
1278
+ )
1279
+ noise = self.sample_noise(actions_shape, device)
1280
+
1281
+ prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
1282
+ images, img_masks, lang_tokens, lang_masks
1283
+ )
1284
+ prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
1285
+ prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
1286
+
1287
+ # Compute image and language key value cache
1288
+ if past_key_values is None:
1289
+ _, past_key_values = self.paligemma_with_expert.forward(
1290
+ attention_mask=prefix_att_2d_masks,
1291
+ position_ids=prefix_position_ids,
1292
+ past_key_values=None,
1293
+ inputs_embeds=[prefix_embs, None],
1294
+ use_cache=self.config.use_cache,
1295
+ fill_kv_cache=True,
1296
+ )
1297
+
1298
+ dt = -1.0 / self.config.num_steps
1299
+ dt = torch.tensor(dt, dtype=torch.float32, device=device)
1300
+
1301
+ x_t = noise
1302
+ time = torch.tensor(
1303
+ time_temp, dtype=torch.float32, device=device
1304
+ ) # TODO: Add temp
1305
+ while time >= -dt / 2 + (1 - self.config.theta2):
1306
+ expanded_time = time.expand(bsize)
1307
+ v_t = self.denoise_step(
1308
+ state,
1309
+ prefix_pad_masks,
1310
+ past_key_values,
1311
+ x_t,
1312
+ expanded_time,
1313
+ )
1314
+
1315
+ # Euler step
1316
+ x_t += dt * v_t * noise_temp # TODO: Add noise temp
1317
+ time += dt
1318
+ return x_t
1319
+
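A hedged toy sketch of the Euler integration pattern used in `sample_actions`: integration starts from noise at the initial time and steps with dt = -1 / num_steps until the stopping time (1 - theta2). The velocity field below is a placeholder for `denoise_step`.

import torch

num_steps, theta2 = 10, 1.0            # theta2 = 1.0 integrates all the way to t = 0
dt = torch.tensor(-1.0 / num_steps)
x_t = torch.randn(2, 50, 32)
time = torch.tensor(1.0)
while time >= -dt / 2 + (1 - theta2):
    v_t = -x_t                          # placeholder velocity field
    x_t = x_t + dt * v_t
    time = time + dt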
1320
+ def denoise_step(
1321
+ self,
1322
+ state,
1323
+ prefix_pad_masks,
1324
+ past_key_values,
1325
+ x_t,
1326
+ timestep,
1327
+ ):
1328
+ """Apply one denoising step of the noise `x_t` at a given timestep."""
1329
+ suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(
1330
+ state, x_t, timestep
1331
+ )
1332
+
1333
+ suffix_len = suffix_pad_masks.shape[1]
1334
+ batch_size = prefix_pad_masks.shape[0]
1335
+ prefix_len = prefix_pad_masks.shape[1]
1336
+ prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(
1337
+ batch_size, suffix_len, prefix_len
1338
+ )
1339
+
1340
+ suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
1341
+
1342
+ full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
1343
+
1344
+ prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
1345
+ position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
1346
+
1347
+ outputs_embeds, _ = self.paligemma_with_expert.forward(
1348
+ attention_mask=full_att_2d_masks,
1349
+ position_ids=position_ids,
1350
+ past_key_values=past_key_values,
1351
+ inputs_embeds=[None, suffix_embs],
1352
+ use_cache=self.config.use_cache,
1353
+ fill_kv_cache=False,
1354
+ )
1355
+ suffix_out = outputs_embeds[1]
1356
+ suffix_out = suffix_out[:, -self.config.n_action_steps :]
1357
+ suffix_out = suffix_out.to(dtype=torch.float32)
1358
+ v_t = self.action_out_proj(suffix_out)
1359
+ return v_t
1360
+
1361
+
1362
+ class FastVisuoMatching(nn.Module):
1363
+ def __init__(self, config):
1364
+ super().__init__()
1365
+ self.config = config
1366
+
1367
+ # FastVisuoExpertConfig, FastVisuoExpertModel
1368
+ fast_visuo_expert_config = FastVisuoExpertConfig(
1369
+ freeze_vision_encoder=self.config.freeze_s1_vision_encoder,
1370
+ attention_implementation=self.config.attention_implementation,
1371
+ dino_config=self.config.s1_dino_config,
1372
+ gemma_expert_config=self.config.s1_gemma_expert_config,
1373
+ )
1374
+ self.fast_visuo_expert = FastVisuoExpertModel(fast_visuo_expert_config)
1375
+
1376
+ # Projections are float32
1377
+ self.state_proj = nn.Linear(
1378
+ self.config.max_state_dim, self.config.s1_proj_width
1379
+ )
1380
+ self.action_in_proj = nn.Linear(
1381
+ self.config.max_action_dim, self.config.s1_proj_width
1382
+ )
1383
+ self.action_out_proj = nn.Linear(
1384
+ self.config.s1_proj_width, self.config.max_action_dim
1385
+ )
1386
+ self.action_time_mlp_in = nn.Linear(
1387
+ self.config.s1_proj_width * 2, self.config.s1_proj_width
1388
+ )
1389
+ self.action_time_mlp_out = nn.Linear(
1390
+ self.config.s1_proj_width, self.config.s1_proj_width
1391
+ )
1392
+
1393
+ self.set_requires_grad()
1394
+
1395
+ def set_requires_grad(self):
1396
+ for params in self.state_proj.parameters():
1397
+ params.requires_grad = self.config.train_state_proj
1398
+
1399
+ def sample_noise(self, shape, device):
1400
+ noise = torch.normal(
1401
+ mean=0.0,
1402
+ std=1.0,
1403
+ size=shape,
1404
+ dtype=torch.float32,
1405
+ device=device,
1406
+ )
1407
+ return noise
1408
+
1409
+ def sample_time(self, bsize, device):
1410
+ time_beta = sample_beta(1.5, 1.0, bsize, device)
1411
+ time = time_beta * 0.999 + 0.001
1412
+ return time.to(dtype=torch.float32, device=device)
1413
+
1414
+ def embed_prefix(
1415
+ self, images, img_masks
1416
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1417
+ """Embed images with SigLIP and language tokens with embedding layer to prepare
1418
+ for the fast visuo expert (Gemma) transformer processing.
1419
+ """
1420
+ # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty
1421
+ embs = []
1422
+ pad_masks = []
1423
+ att_masks = []
1424
+
1425
+ # TODO: remove for loop
1426
+ for img, img_mask in zip(images, img_masks, strict=False):
1427
+ DINO_MEAN, DINO_STD = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
1428
+ img = TF.normalize(img * 0.5 + 0.5, mean=DINO_MEAN, std=DINO_STD)
1429
+ img_emb = self.fast_visuo_expert.embed_image(img)
1430
+ img_emb = img_emb.to(dtype=torch.bfloat16)
1431
+
1432
+ # Normalize image embeddings
1433
+ img_emb_dim = img_emb.shape[-1]
1434
+ img_emb = img_emb * torch.tensor(
1435
+ img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device
1436
+ )
1437
+
1438
+ bsize, num_img_embs = img_emb.shape[:2]
1439
+ img_mask = img_mask[:, None].expand(bsize, num_img_embs)
1440
+
1441
+ embs.append(img_emb)
1442
+ pad_masks.append(img_mask)
1443
+
1444
+ # Create attention masks so that image tokens attend to each other
1445
+ att_masks += [0] * num_img_embs
1446
+
1447
+ embs = torch.cat(embs, dim=1)
1448
+ pad_masks = torch.cat(pad_masks, dim=1)
1449
+ att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
1450
+ att_masks = att_masks[None, :].expand(bsize, len(att_masks))
1451
+
1452
+ return embs, pad_masks, att_masks
1453
+
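A hedged sketch of the image re-normalization in `embed_prefix` above: inputs arrive in SigLIP's [-1, 1] range, are mapped back to [0, 1], and are then standardized with the ImageNet statistics expected by the DINO encoder (`TF` is assumed to be `torchvision.transforms.functional`).

import torch
import torchvision.transforms.functional as TF

DINO_MEAN, DINO_STD = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
img = torch.rand(1, 3, 224, 224) * 2 - 1  # [-1, 1], as produced by prepare_images
img_01 = img * 0.5 + 0.5                  # back to [0, 1]
img_dino = TF.normalize(img_01, mean=DINO_MEAN, std=DINO_STD)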
1454
+ def embed_suffix(self, state, noisy_actions, timestep, stamp):
1455
+ """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing."""
1456
+ embs = []
1457
+ pad_masks = []
1458
+ att_masks = []
1459
+
1460
+ # Embed state
1461
+ state_emb = self.state_proj(state)
1462
+ state_emb = state_emb.to(dtype=torch.bfloat16)
1463
+ embs.append(state_emb)
1464
+ bsize = state_emb.shape[0]
1465
+ state_horizon = state_emb.shape[1]
1466
+ dtype = state_emb.dtype
1467
+ device = state_emb.device
1468
+
1469
+ state_mask = torch.ones(bsize, state_horizon, dtype=torch.bool, device=device)
1470
+ pad_masks.append(state_mask)
1471
+
1472
+ # Set attention masks so that image and language inputs do not attend to state or actions
1473
+ att_masks += [1] * state_horizon
1474
+
1475
+ # Embed stamp
1476
+ stamp_emb = create_sinusoidal_pos_embedding(
1477
+ stamp,
1478
+ self.config.s1_proj_width,
1479
+ min_period=4e-3,
1480
+ max_period=4.0,
1481
+ device=device,
1482
+ )
1483
+ stamp_emb = stamp_emb.type(dtype=dtype)[:, None, :]
1484
+ embs.append(stamp_emb)
1485
+ stamp_mask = torch.ones(bsize, 1, dtype=torch.bool, device=device)
1486
+ pad_masks.append(stamp_mask)
1487
+ att_masks += [1]
1488
+
1489
+ # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
1490
+ time_emb = create_sinusoidal_pos_embedding(
1491
+ timestep,
1492
+ self.config.s1_proj_width,
1493
+ min_period=4e-3,
1494
+ max_period=4.0,
1495
+ device=device,
1496
+ )
1497
+ time_emb = time_emb.type(dtype=dtype)
1498
+
1499
+ # Fuse timestep + action information using an MLP
1500
+ action_emb = self.action_in_proj(noisy_actions)
1501
+
1502
+ time_emb = time_emb[:, None, :].expand_as(action_emb)
1503
+ action_time_emb = torch.cat([action_emb, time_emb], dim=2)
1504
+
1505
+ action_time_emb = self.action_time_mlp_in(action_time_emb)
1506
+ action_time_emb = F.silu(action_time_emb) # swish == silu
1507
+ action_time_emb = self.action_time_mlp_out(action_time_emb)
1508
+
1509
+ # Add to input tokens
1510
+ embs.append(action_time_emb)
1511
+
1512
+ bsize, action_time_dim = action_time_emb.shape[:2]
1513
+ action_time_mask = torch.ones(
1514
+ bsize, action_time_dim, dtype=torch.bool, device=device
1515
+ )
1516
+ pad_masks.append(action_time_mask)
1517
+
1518
+ # Set attention masks so that image, language and state inputs do not attend to action tokens
1519
+ att_masks += [1] + ([0] * (self.config.s1_action_steps - 1))
1520
+
1521
+ embs = torch.cat(embs, dim=1)
1522
+ pad_masks = torch.cat(pad_masks, dim=1)
1523
+ att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
1524
+ att_masks = att_masks[None, :].expand(bsize, len(att_masks))
1525
+
1526
+ return embs, pad_masks, att_masks
1527
+
1528
+ def forward(
1529
+ self, images, img_masks, state, actions, noise=None, time=None, stamp=None
1530
+ ) -> Float[
1531
+ Tensor, "batch {self.config.s1_action_steps} {self.config.max_action_dim}"
1532
+ ]:
1533
+ """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
1534
+ if noise is None:
1535
+ noise = self.sample_noise(actions.shape, actions.device)
1536
+ if time is None:
1537
+ time = (
1538
+ self.sample_time(actions.shape[0], actions.device) * self.config.theta1
1539
+ ) # s2: [1, 0.1] -> s1: [0.1, 0]
1540
+ time_expanded = time[:, None, None]
1541
+ x_t = time_expanded * noise + (1 - time_expanded) * actions
1542
+ u_t = noise - actions
1543
+
1544
+ prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
1545
+ images, img_masks
1546
+ )
1547
+ suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(
1548
+ state, x_t, time, stamp
1549
+ )
1550
+
1551
+ pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
1552
+ att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
1553
+
1554
+ att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
1555
+ position_ids = torch.cumsum(pad_masks, dim=1) - 1
1556
+
1557
+ inputs_embeds = torch.cat(
1558
+ [prefix_embs, suffix_embs], dim=1
1559
+ ) # torch.Size([16, 565]), torch.Size([16, 565])
1560
+
1561
+ suffix_out = self.fast_visuo_expert.forward(
1562
+ attention_mask=att_2d_masks,
1563
+ position_ids=position_ids,
1564
+ inputs_embeds=inputs_embeds,
1565
+ )
1566
+ suffix_out = suffix_out[:, -self.config.s1_action_steps :]
1567
+ # Original openpi code, upcast attention output
1568
+ suffix_out = suffix_out.to(dtype=torch.float32)
1569
+ v_t = self.action_out_proj(suffix_out)
1570
+
1571
+ losses = F.mse_loss(u_t, v_t, reduction="none")
1572
+ return losses
1573
+
1574
+ def sample_actions(
1575
+ self, images, img_masks, state, noise=None, stamp=None
1576
+ ) -> Tensor:
1577
+ """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)"""
1578
+ bsize = state.shape[0]
1579
+ device = state.device
1580
+
1581
+ if noise is None:
1582
+ actions_shape = (
1583
+ bsize,
1584
+ self.config.s1_action_steps,
1585
+ self.config.max_action_dim,
1586
+ )
1587
+ noise = self.sample_noise(actions_shape, device)
1588
+
1589
+ if stamp is None:
1590
+ stamp = torch.rand(bsize, device=device)
1591
+
1592
+ prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
1593
+ images, img_masks
1594
+ )
1595
+
1596
+ dt = -self.config.theta1 / self.config.s1_num_steps
1597
+ dt = torch.tensor(dt, dtype=torch.float32, device=device)
1598
+
1599
+ x_t = noise
1600
+ time = torch.tensor(self.config.theta1, dtype=torch.float32, device=device)
1601
+ while time >= -dt / 2:
1602
+ expanded_time = time.expand(bsize)
1603
+ v_t = self.denoise_step(
1604
+ state,
1605
+ prefix_embs,
1606
+ prefix_pad_masks,
1607
+ prefix_att_masks,
1608
+ x_t,
1609
+ expanded_time,
1610
+ stamp,
1611
+ )
1612
+ # Euler step
1613
+ x_t += dt * v_t
1614
+ time += dt
1615
+ return x_t
1616
+
1617
+ def denoise_step(
1618
+ self,
1619
+ state,
1620
+ prefix_embs,
1621
+ prefix_pad_masks,
1622
+ prefix_att_masks,
1623
+ x_t,
1624
+ timestep,
1625
+ stamp,
1626
+ ):
1627
+ """Apply one denoising step of the noise `x_t` at a given timestep."""
1628
+ suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(
1629
+ state, x_t, timestep, stamp
1630
+ )
1631
+ pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
1632
+ att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
1633
+
1634
+ att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
1635
+ position_ids = torch.cumsum(pad_masks, dim=1) - 1
1636
+
1637
+ inputs_embeds = torch.cat(
1638
+ [prefix_embs, suffix_embs], dim=1
1639
+ ) # torch.Size([16, 565]), torch.Size([16, 565])
1640
+ suffix_out = self.fast_visuo_expert.forward(
1641
+ attention_mask=att_2d_masks,
1642
+ position_ids=position_ids,
1643
+ inputs_embeds=inputs_embeds,
1644
+ )
1645
+ suffix_out = suffix_out[:, -self.config.s1_action_steps :]
1646
+ suffix_out = suffix_out.to(dtype=torch.float32)
1647
+ v_t = self.action_out_proj(suffix_out)
1648
+ return v_t
1649
+
1650
+
1651
+ class ValueQueryHead(nn.Module):
1652
+ def __init__(self, paligemma_with_expert, config):
1653
+ super().__init__()
1654
+ # gemma_expert for processing image and language tokens
1655
+ # paligemma with expert for processing image features
1656
+ self.config = config
1657
+ self.paligemma_with_expert = paligemma_with_expert
1658
+
1659
+ vqh_backbone_config = VQHBackboneConfig()
1660
+ self.vqh_backbone = VQHBackbone(config=vqh_backbone_config)
1661
+
1662
+ cal_ql_config = CalQlConfig(
1663
+ obs_encoded_dim=self.paligemma_with_expert.config.paligemma_config.hidden_size,
1664
+ action_dim=config.vqh_chunk_size * config.action_feature.shape[0],
1665
+ actor_lr=config.actor_lr,
1666
+ critic_lr=config.critic_lr,
1667
+ temp_lr=config.temp_lr,
1668
+ )
1669
+ self.calql = CalQL(config=cal_ql_config)
1670
+
1671
+ self.query_embedding = nn.Parameter(
1672
+ torch.zeros(
1673
+ self.paligemma_with_expert.config.paligemma_config.hidden_size,
1674
+ dtype=torch.bfloat16,
1675
+ )
1676
+ )
1677
+
1678
+ def embed_prefix(
1679
+ self, images, img_masks, lang_tokens, lang_masks
1680
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1681
+ """Embed images with SigLIP and language tokens with embedding layer to prepare
1682
+ for PaliGemma transformer processing.
1683
+ """
1684
+ # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty
1685
+ embs = []
1686
+ pad_masks = []
1687
+ att_masks = []
1688
+
1689
+ # TODO: remove for loop
1690
+ for (
1691
+ img,
1692
+ img_mask,
1693
+ ) in zip(images, img_masks, strict=False):
1694
+ img_emb = self.paligemma_with_expert.embed_image(img)
1695
+ img_emb = img_emb.to(dtype=torch.bfloat16)
1696
+
1697
+ # Normalize image embeddings
1698
+ img_emb_dim = img_emb.shape[-1]
1699
+ img_emb = img_emb * torch.tensor(
1700
+ img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device
1701
+ )
1702
+
1703
+ bsize, num_img_embs = img_emb.shape[:2]
1704
+ img_mask = img_mask[:, None].expand(bsize, num_img_embs)
1705
+
1706
+ embs.append(img_emb)
1707
+ pad_masks.append(img_mask)
1708
+
1709
+ # Create attention masks so that image tokens attend to each other
1710
+ att_masks += [0] * num_img_embs
1711
+
1712
+ lang_emb = self.paligemma_with_expert.embed_language_tokens(
1713
+ lang_tokens
1714
+ ).detach()
1715
+
1716
+ # Normalize language embeddings
1717
+ lang_emb_dim = lang_emb.shape[-1]
1718
+ lang_emb = lang_emb * math.sqrt(lang_emb_dim)
1719
+
1720
+ embs.append(lang_emb)
1721
+ pad_masks.append(lang_masks)
1722
+
1723
+ # full attention between image and language inputs
1724
+ num_lang_embs = lang_emb.shape[1]
1725
+ att_masks += [0] * num_lang_embs
1726
+
1727
+ embs = torch.cat(embs, dim=1)
1728
+ pad_masks = torch.cat(pad_masks, dim=1)
1729
+ att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
1730
+ att_masks = att_masks[None, :].expand(bsize, len(att_masks))
1731
+
1732
+ # NOTE: add query embedding for each sequence
1733
+ seq_lengths = pad_masks.sum(dim=1).long() # sequence lengths without padding
1734
+ seq_len = embs.shape[1]
1735
+
1736
+ new_seq_len = seq_len + 1
1737
+ new_embs = torch.zeros(
1738
+ (bsize, new_seq_len, embs.shape[-1]), dtype=embs.dtype, device=embs.device
1739
+ )
1740
+ new_pad_masks = torch.zeros(
1741
+ (bsize, new_seq_len), dtype=pad_masks.dtype, device=pad_masks.device
1742
+ )
1743
+ new_att_masks = torch.zeros(
1744
+ (bsize, new_seq_len), dtype=att_masks.dtype, device=att_masks.device
1745
+ )
1746
+
1747
+ batch_idx = torch.arange(bsize, device=embs.device).view(-1, 1)
1748
+ seq_idx = (
1749
+ torch.arange(seq_len, device=embs.device).view(1, -1).expand(bsize, -1)
1750
+ )
1751
+
1752
+ mask = seq_idx >= seq_lengths.unsqueeze(1)
1753
+ new_seq_idx = seq_idx + mask.long()
1754
+
1755
+ new_embs[batch_idx, new_seq_idx] = embs
1756
+ new_pad_masks[batch_idx, new_seq_idx] = pad_masks
1757
+ new_att_masks[batch_idx, new_seq_idx] = att_masks
1758
+ new_embs[torch.arange(bsize), seq_lengths] = self.query_embedding.unsqueeze(
1759
+ 0
1760
+ ).expand(bsize, -1)
1761
+ new_pad_masks[torch.arange(bsize), seq_lengths] = True
1762
+ new_att_masks[torch.arange(bsize), seq_lengths] = False
1763
+
1764
+ return new_embs, new_pad_masks, new_att_masks
1765
+
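A hedged toy sketch of the query-token insertion performed in `embed_prefix`: every sequence keeps its non-padded tokens in place, padded tokens are shifted right by one slot, and the learned query embedding is written at the first padding position so it always sits directly after the real tokens.

import torch

bsize, seq_len, dim = 2, 4, 3
embs = torch.randn(bsize, seq_len, dim)
seq_lengths = torch.tensor([4, 2])              # un-padded length of each sequence
query = torch.full((dim,), -1.0)                # stand-in for self.query_embedding

new_embs = torch.zeros(bsize, seq_len + 1, dim)
seq_idx = torch.arange(seq_len).expand(bsize, -1)
shift = (seq_idx >= seq_lengths.unsqueeze(1)).long()   # padded positions move right by 1
new_embs[torch.arange(bsize).unsqueeze(1), seq_idx + shift] = embs
new_embs[torch.arange(bsize), seq_lengths] = query
# seq 0 -> [t0, t1, t2, t3, query]; seq 1 -> [t0, t1, query, t2(pad), t3(pad)]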
1766
+ def process_next_obs(
1767
+ self,
1768
+ images: list[torch.Tensor],
1769
+ img_masks: list[torch.Tensor],
1770
+ vqh_images: list[torch.Tensor],
1771
+ vqh_img_masks: list[torch.Tensor],
1772
+ lang_tokens: torch.Tensor,
1773
+ lang_masks: torch.Tensor,
1774
+ ) -> tuple[list[torch.Tensor], list[torch.Tensor], torch.Tensor, torch.Tensor]:
1775
+ """Process next observation for ValueQueryHead model.
1776
+ Args:
1777
+ images (list): List of image tensors.
1778
+ img_masks (list): List of image mask tensors.
1779
+ vqh_images (list): List of ValueQueryHead image tensors.
1780
+ vqh_img_masks (list): List of ValueQueryHead image mask tensors.
1781
+ lang_tokens (torch.Tensor): Language token tensor.
1782
+ lang_masks (torch.Tensor): Language mask tensor.
1783
+
1784
+ Returns:
1785
+ tuple: Tuple containing processed images, masks, and language tokens.
1786
+ """
1787
+ new_images = []
1788
+ new_img_masks = []
1789
+
1790
+ for img, next_img, img_mask, next_img_mask in zip(
1791
+ images, vqh_images, img_masks, vqh_img_masks
1792
+ ):
1793
+ new_images.append(torch.cat([img, next_img], dim=0))
1794
+ new_img_masks.append(torch.cat([img_mask, next_img_mask], dim=0))
1795
+
1796
+ new_lang_tokens = torch.cat([lang_tokens, lang_tokens], dim=0)
1797
+ new_lang_masks = torch.cat([lang_masks, lang_masks], dim=0)
1798
+
1799
+ return (
1800
+ new_images,
1801
+ new_img_masks,
1802
+ new_lang_tokens,
1803
+ new_lang_masks,
1804
+ )
1805
+
1806
+ @jaxtyped(typechecker=typechecker)
1807
+ def forward(
1808
+ self,
1809
+ images: list[Float[Tensor, "batch 3 224 224"]],
1810
+ img_masks: list[Bool[Tensor, " batch"]],
1811
+ lang_tokens: Int64[Tensor, "batch seq_len"],
1812
+ lang_masks: Bool[Tensor, "batch seq_len"],
1813
+ vqh_images: list[Float[Tensor, "batch 3 224 224"]],
1814
+ vqh_img_masks: list[Bool[Tensor, " batch"]],
1815
+ actions: Float[
1816
+ Tensor,
1817
+ "batch {self.config.vqh_chunk_size} {self.config.action_feature.shape[0]}",
1818
+ ],
1819
+ rewards: Float[Tensor, " batch"],
1820
+ mc_returns: Float[Tensor, " batch"],
1821
+ masks: Float[Tensor, " batch"],
1822
+ ) -> tuple[Tensor, Tensor, Tensor, dict]:
1823
+ """Forward pass for ValueQueryHead model.
1824
+ Args:
1825
+ images (torch.Tensor): Image input tensor.
1826
+ img_masks (torch.Tensor): Image mask tensor.
1827
+ lang_tokens (torch.Tensor): Language token tensor.
1828
+ lang_masks (torch.Tensor): Language mask tensor.
1829
+
1830
+ Returns:
1831
+ tuple: (temperature_loss, policy_loss, critic_loss, log_dict) from the Cal-QL update.
1832
+ """
1833
+ images, img_masks, lang_tokens, lang_masks = self.process_next_obs(
1834
+ images, img_masks, vqh_images, vqh_img_masks, lang_tokens, lang_masks
1835
+ )
1836
+
1837
+ embs, pad_masks, att_masks = self.embed_prefix(
1838
+ images, img_masks, lang_tokens, lang_masks
1839
+ )
1840
+ att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
1841
+ position_ids = torch.cumsum(pad_masks, dim=1) - 1
1842
+
1843
+ suffix_out = self.vqh_backbone.forward(
1844
+ attention_mask=att_2d_masks,
1845
+ position_ids=position_ids,
1846
+ inputs_embeds=embs,
1847
+ ) # (2B, S, E)
1848
+
1849
+ batch_indices = torch.arange(suffix_out.shape[0], device=suffix_out.device)
1850
+ query_embedding_idx = pad_masks.sum(-1).long() - 1
1851
+ query_embedding = suffix_out[batch_indices, query_embedding_idx]
1852
+
1853
+ cal_ql_batch: at.CalQlBatch = dict(
1854
+ encoded_observations=query_embedding[
1855
+ : int(query_embedding.shape[0] / 2)
1856
+ ].to(dtype=torch.float32),
1857
+ encoded_next_observations=query_embedding[
1858
+ int(query_embedding.shape[0] / 2) :
1859
+ ].to(dtype=torch.float32),
1860
+ actions=actions.view(actions.shape[0], -1),
1861
+ rewards=rewards,
1862
+ mc_returns=mc_returns,
1863
+ masks=masks,
1864
+ )
1865
+ temperature_loss, policy_loss, critic_loss, log_dict = self.calql(cal_ql_batch)
1866
+
1867
+ return temperature_loss, policy_loss, critic_loss, log_dict
1868
+
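A hedged toy sketch of the doubled-batch trick used in `forward`: current and next observations are concatenated along the batch axis by `process_next_obs`, encoded in a single backbone pass, and the resulting query embeddings are split back into the (obs, next_obs) halves consumed by Cal-QL.

import torch

B, E = 4, 8
query_embedding = torch.randn(2 * B, E)   # backbone output for [obs; next_obs]
encoded_observations = query_embedding[:B]
encoded_next_observations = query_embedding[B:]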
1869
+ @jaxtyped(typechecker=typechecker)
1870
+ def select_q_actions(
1871
+ self,
1872
+ images: list[Float[Tensor, "Batch 3 224 224"]],
1873
+ img_masks: list[Bool[Tensor, " Batch"]],
1874
+ lang_tokens: Int64[Tensor, "Batch seq_len"],
1875
+ lang_masks: Bool[Tensor, "Batch seq_len"],
1876
+ noise_actions: Float[
1877
+ Tensor,
1878
+ "Batch s2_candidates_num {self.config.vqh_chunk_size} {self.config.action_feature.shape[0]}",
1879
+ ],
1880
+ ) -> tuple[Int64[Tensor, " Batch"], Float[Tensor, "Batch s2_candidates_num"]]:
1881
+ batch_size = noise_actions.shape[0]
1882
+ s2_candidates_num = noise_actions.shape[1]
1883
+ embs, pad_masks, att_masks = self.embed_prefix(
1884
+ images, img_masks, lang_tokens, lang_masks
1885
+ )
1886
+ att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
1887
+ position_ids = torch.cumsum(pad_masks, dim=1) - 1
1888
+
1889
+ suffix_out = self.vqh_backbone.forward(
1890
+ attention_mask=att_2d_masks,
1891
+ position_ids=position_ids,
1892
+ inputs_embeds=embs,
1893
+ ) # (B, S, E)
1894
+
1895
+ batch_indices = torch.arange(suffix_out.shape[0], device=suffix_out.device)
1896
+ query_embedding_idx = pad_masks.sum(-1).long() - 1
1897
+ query_embedding = suffix_out[batch_indices, query_embedding_idx]
1898
+
1899
+ noise_actions = noise_actions.reshape(batch_size, s2_candidates_num, -1)
1900
+ q_values = self.calql.get_q_values(query_embedding, noise_actions)
1901
+
1902
+ action_index = torch.argmax(q_values, dim=1)
1903
+
1904
+ print(f"MaxValues: {q_values.max(dim=1)[0].tolist()}")
1905
+ print(f"MinValues: {q_values.min(dim=1)[0].tolist()}")
1906
+ print(f"MeanValues: {q_values.mean(dim=1)[0].tolist()}")
1907
+ print(f"ActionIndex: {action_index.tolist()}")
1908
+
1909
+ return action_index, q_values
paligemma_with_expert.py ADDED
@@ -0,0 +1,444 @@
1
+ from typing import List, Optional, Union
2
+
3
+ import torch
4
+ import torch.version
5
+ from transformers.cache_utils import Cache
6
+ from torch import nn
7
+ from transformers import (
8
+ AutoConfig,
9
+ GemmaForCausalLM,
10
+ PaliGemmaForConditionalGeneration,
11
+ PretrainedConfig,
12
+ PreTrainedModel,
13
+ )
14
+ from transformers.models.auto import CONFIG_MAPPING
15
+
16
+
17
+ def apply_rope(x, positions, max_wavelength=10_000):
18
+ """
19
+ Applies RoPE positions [B, L] to x [B, L, H, D].
20
+ """
21
+ d_half = x.shape[-1] // 2
22
+ device = x.device
23
+ dtype = x.dtype
24
+ x = x.to(torch.float32)
25
+
26
+ freq_exponents = (2.0 / x.shape[-1]) * torch.arange(
27
+ d_half, dtype=torch.float32, device=device
28
+ )
29
+ timescale = max_wavelength**freq_exponents
30
+ radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(
31
+ torch.float32
32
+ )
33
+
34
+ radians = radians[..., None, :]
35
+
36
+ sin = torch.sin(radians) # .to(dtype=dtype)
37
+ cos = torch.cos(radians) # .to(dtype=dtype)
38
+
39
+ x1, x2 = x.split(d_half, dim=-1)
40
+ res = torch.empty_like(x)
41
+ res[..., :d_half] = x1 * cos - x2 * sin
42
+ res[..., d_half:] = x2 * cos + x1 * sin
43
+
44
+ return res.to(dtype)
45
+
46
+
47
+ class PaliGemmaWithExpertConfig(PretrainedConfig):
48
+ model_type = "PaliGemmaWithExpertModel"
49
+ sub_configs = {"paligemma_config": AutoConfig, "gemma_expert_config": AutoConfig}
50
+
51
+ def __init__(
52
+ self,
53
+ paligemma_config: dict | None = None,
54
+ gemma_expert_config: dict | None = None,
55
+ freeze_vision_encoder: bool = True,
56
+ train_expert_only: bool = True,
57
+ attention_implementation: str = "eager",
58
+ **kwargs,
59
+ ):
60
+ self.freeze_vision_encoder = freeze_vision_encoder
61
+ self.train_expert_only = train_expert_only
62
+ self.attention_implementation = attention_implementation
63
+
64
+ if paligemma_config is None:
65
+ self.paligemma_config = CONFIG_MAPPING["paligemma"](
66
+ transformers_version="4.48.1",
67
+ _vocab_size=257152,
68
+ bos_token_id=2,
69
+ eos_token_id=1,
70
+ hidden_size=2048,
71
+ image_token_index=257152,
72
+ model_type="paligemma",
73
+ pad_token_id=0,
74
+ projection_dim=2048,
75
+ text_config={
76
+ "hidden_activation": "gelu_pytorch_tanh",
77
+ "hidden_size": 2048,
78
+ "intermediate_size": 16384,
79
+ "model_type": "gemma",
80
+ "num_attention_heads": 8,
81
+ "num_hidden_layers": 18,
82
+ "num_image_tokens": 256,
83
+ "num_key_value_heads": 1,
84
+ "torch_dtype": "float32",
85
+ "vocab_size": 257152,
86
+ },
87
+ vision_config={
88
+ "hidden_size": 1152,
89
+ "intermediate_size": 4304,
90
+ "model_type": "siglip_vision_model",
91
+ "num_attention_heads": 16,
92
+ "num_hidden_layers": 27,
93
+ "num_image_tokens": 256,
94
+ "patch_size": 14,
95
+ "projection_dim": 2048,
96
+ "projector_hidden_act": "gelu_fast",
97
+ "torch_dtype": "float32",
98
+ "vision_use_head": False,
99
+ },
100
+ )
101
+ elif isinstance(paligemma_config, dict):
102
+ if "model_type" not in paligemma_config:
103
+ paligemma_config["model_type"] = "paligemma"
104
+
105
+ cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]]
106
+ self.paligemma_config = cfg_cls(**paligemma_config)
107
+
108
+ if gemma_expert_config is None:
109
+ self.gemma_expert_config = CONFIG_MAPPING["gemma"](
110
+ attention_bias=False,
111
+ attention_dropout=0.0,
112
+ bos_token_id=2,
113
+ eos_token_id=1,
114
+ head_dim=256,
115
+ hidden_act="gelu_pytorch_tanh",
116
+ hidden_activation="gelu_pytorch_tanh",
117
+ hidden_size=1024,
118
+ initializer_range=0.02,
119
+ intermediate_size=4096,
120
+ max_position_embeddings=8192,
121
+ model_type="gemma",
122
+ num_attention_heads=8,
123
+ num_hidden_layers=18,
124
+ num_key_value_heads=1,
125
+ pad_token_id=0,
126
+ rms_norm_eps=1e-06,
127
+ rope_theta=10000.0,
128
+ torch_dtype="float32",
129
+ transformers_version="4.48.1",
130
+ use_cache=True,
131
+ vocab_size=257152,
132
+ )
133
+ elif isinstance(gemma_expert_config, dict):
134
+ if "model_type" not in gemma_expert_config:
135
+ gemma_expert_config["model_type"] = "gemma"
136
+
137
+ cfg_cls = CONFIG_MAPPING[gemma_expert_config["model_type"]]
138
+ self.gemma_expert_config = cfg_cls(**gemma_expert_config)
139
+
140
+ super().__init__(**kwargs)
141
+
142
+ def __post_init__(self):
143
+ super().__post_init__()
144
+ if self.train_expert_only and not self.freeze_vision_encoder:
145
+ raise ValueError(
146
+ "You set `freeze_vision_encoder=False` and `train_expert_only=True` which are not compatible."
147
+ )
148
+
149
+ if self.attention_implementation not in ["eager", "fa2", "flex"]:
150
+ raise ValueError(
151
+ f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'."
152
+ )
153
+
154
+
155
+ class PaliGemmaWithExpertModel(PreTrainedModel):
156
+ config_class = PaliGemmaWithExpertConfig
157
+
158
+ def __init__(self, config: PaliGemmaWithExpertConfig):
159
+ super().__init__(config=config)
160
+ self.config = config
161
+ self.paligemma = PaliGemmaForConditionalGeneration(
162
+ config=config.paligemma_config
163
+ )
164
+ self.gemma_expert = GemmaForCausalLM(config=config.gemma_expert_config)
165
+ # Remove unused embed_tokens
166
+ self.gemma_expert.model.embed_tokens = None
167
+ self.gemma_expert.lm_head = None
168
+ self.to_bfloat16_like_physical_intelligence()
169
+ self.set_requires_grad()
170
+
171
+ def set_requires_grad(self):
172
+ if self.config.freeze_vision_encoder:
173
+ self.paligemma.vision_tower.eval()
174
+ for params in self.paligemma.vision_tower.parameters():
175
+ params.requires_grad = False
176
+
177
+ if self.config.train_expert_only:
178
+ self.paligemma.eval()
179
+ for params in self.paligemma.parameters():
180
+ params.requires_grad = False
181
+
182
+ def train(self, mode: bool = True):
183
+ super().train(mode)
184
+
185
+ if self.config.freeze_vision_encoder:
186
+ self.paligemma.vision_tower.eval()
187
+
188
+ if self.config.train_expert_only:
189
+ self.paligemma.eval()
190
+
191
+ def to_bfloat16_like_physical_intelligence(self):
192
+ self.paligemma = self.paligemma.to(dtype=torch.bfloat16)
193
+
194
+ params_to_change_dtype = [
195
+ "language_model.model.layers",
196
+ "gemma_expert.model.layers",
197
+ "vision_tower",
198
+ "multi_modal",
199
+ ]
200
+ for name, param in self.named_parameters():
201
+ if any(selector in name for selector in params_to_change_dtype):
202
+ param.data = param.data.to(dtype=torch.bfloat16)
203
+
204
+ def embed_image(self, image: torch.Tensor):
205
+ return self.paligemma.get_image_features(image)
206
+
207
+ def embed_language_tokens(self, tokens: torch.Tensor):
208
+ return self.paligemma.language_model.model.embed_tokens(tokens)
209
+
210
+ # TODO: break down this huge forward into modules or functions
211
+ def forward(
212
+ self,
213
+ attention_mask: Optional[torch.Tensor] = None,
214
+ position_ids: Optional[torch.LongTensor] = None,
215
+ past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
216
+ inputs_embeds: Optional[List[torch.FloatTensor]] = None,
217
+ use_cache: Optional[bool] = None,
218
+ fill_kv_cache: Optional[bool] = None,
219
+ ):
220
+ models = [self.paligemma.language_model.model, self.gemma_expert.model]
221
+
222
+ for hidden_states in inputs_embeds:
223
+ # TODO this is very inefficient
224
+ # dtype is always the same, batch size too (if > 1 len)
225
+ # device could be trickier in multi gpu edge cases but that's it
226
+ if hidden_states is None:
227
+ continue
228
+ batch_size = hidden_states.shape[0]
229
+
230
+ # RMSNorm
231
+ num_layers = self.paligemma.config.text_config.num_hidden_layers
232
+ head_dim = self.paligemma.config.text_config.head_dim
233
+ for layer_idx in range(num_layers):
234
+ query_states = []
235
+ key_states = []
236
+ value_states = []
237
+ for i, hidden_states in enumerate(inputs_embeds):
238
+ if hidden_states is None:
239
+ continue
240
+ layer = models[i].layers[layer_idx]
241
+ # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype)
242
+ # hidden_states = hidden_states * normalizer
243
+ hidden_states = layer.input_layernorm(hidden_states)
244
+
245
+ input_shape = hidden_states.shape[
246
+ :-1
247
+ ] # (b s e) -> layer* -> (b s e) -> mlp
248
+ hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) # (b s h d)
249
+
250
+ hidden_states = hidden_states.to(dtype=torch.bfloat16)
251
+
252
+ query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
253
+ key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
254
+ value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
255
+
256
+ query_states.append(query_state)
257
+ key_states.append(key_state)
258
+ value_states.append(value_state)
259
+
260
+ # B,L,H,D with L sequence length, H number of heads, D head dim
261
+ # concatenate on the number of embeddings/tokens
262
+ query_states = torch.cat(query_states, dim=1)
263
+ key_states = torch.cat(key_states, dim=1)
264
+ value_states = torch.cat(value_states, dim=1)
265
+
266
+ query_states = apply_rope(query_states, position_ids)
267
+ key_states = apply_rope(key_states, position_ids)
268
+
269
+ if use_cache and past_key_values is None:
270
+ past_key_values = {}
271
+
272
+ if use_cache:
273
+ if fill_kv_cache:
274
+ past_key_values[layer_idx] = {
275
+ "key_states": key_states,
276
+ "value_states": value_states,
277
+ }
278
+ else:
279
+ # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before.
280
+ # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
281
+ # the max len, then we (for instance) double the cache size. This implementation already exists
282
+ # in `transformers`. (molbap)
283
+ key_states = torch.cat(
284
+ [past_key_values[layer_idx]["key_states"], key_states], dim=1
285
+ )
286
+ value_states = torch.cat(
287
+ [past_key_values[layer_idx]["value_states"], value_states],
288
+ dim=1,
289
+ )
290
+
291
+ attention_interface = self.get_attention_interface()
292
+ att_output = attention_interface(
293
+ attention_mask,
294
+ batch_size,
295
+ head_dim,
296
+ query_states,
297
+ key_states,
298
+ value_states,
299
+ )
300
+ att_output = att_output.to(dtype=torch.bfloat16)
301
+
302
+ # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len])
303
+ outputs_embeds = []
304
+ start = 0
305
+ for i, hidden_states in enumerate(inputs_embeds):
306
+ layer = models[i].layers[layer_idx]
307
+
308
+ if hidden_states is not None:
309
+ end = start + hidden_states.shape[1]
310
+
311
+ if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
312
+ att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
313
+ out_emb = layer.self_attn.o_proj(att_output[:, start:end])
314
+
315
+ # TODO: first dropout (by default 0.0)
316
+
317
+ # first residual
318
+ out_emb += hidden_states
319
+ after_first_residual = out_emb.clone()
320
+
321
+ out_emb = layer.post_attention_layernorm(out_emb)
322
+ out_emb = layer.mlp(out_emb)
323
+
324
+ # TODO: second dropout (by default 0.0)
325
+
326
+ # second residual
327
+ out_emb += after_first_residual
328
+
329
+ outputs_embeds.append(out_emb)
330
+
331
+ start = end
332
+ else:
333
+ outputs_embeds.append(None)
334
+
335
+ inputs_embeds = outputs_embeds
336
+
337
+ # final norm
338
+ outputs_embeds = []
339
+ for i, hidden_states in enumerate(inputs_embeds):
340
+ if hidden_states is not None:
341
+ out_emb = models[i].norm(hidden_states)
342
+ outputs_embeds.append(out_emb)
343
+ else:
344
+ outputs_embeds.append(None)
345
+
346
+ return outputs_embeds, past_key_values
347
+
348
+ def get_attention_interface(self):
349
+ if self.config.attention_implementation == "fa2":
350
+ attention_interface = self.flash_attention_forward
351
+ else:
352
+ attention_interface = self.eager_attention_forward
353
+ return attention_interface
354
+
355
+ def flash_attention_forward(
356
+ self,
357
+ attention_mask,
358
+ batch_size,
359
+ head_dim,
360
+ query_states,
361
+ key_states,
362
+ value_states,
363
+ ):
364
+ raise NotImplementedError("FA2 is not implemented (yet)")
365
+
366
+ def eager_attention_forward(
367
+ self,
368
+ attention_mask,
369
+ batch_size,
370
+ head_dim,
371
+ query_states,
372
+ key_states,
373
+ value_states,
374
+ ):
375
+ num_att_heads = self.config.paligemma_config.text_config.num_attention_heads
376
+ num_key_value_heads = (
377
+ self.config.paligemma_config.text_config.num_key_value_heads
378
+ )
379
+ num_key_value_groups = num_att_heads // num_key_value_heads
380
+
381
+ # query_states: batch_size, sequence_length, num_att_head, head_dim
382
+ # key_states: batch_size, sequence_length, num_key_value_head, head_dim
383
+ # value_states: batch_size, sequence_length, num_key_value_head, head_dim
384
+ sequence_length = key_states.shape[1]
385
+
386
+ key_states = key_states[:, :, :, None, :].expand(
387
+ batch_size,
388
+ sequence_length,
389
+ num_key_value_heads,
390
+ num_key_value_groups,
391
+ head_dim,
392
+ )
393
+ key_states = key_states.reshape(
394
+ batch_size,
395
+ sequence_length,
396
+ num_key_value_heads * num_key_value_groups,
397
+ head_dim,
398
+ )
399
+
400
+ value_states = value_states[:, :, :, None, :].expand(
401
+ batch_size,
402
+ sequence_length,
403
+ num_key_value_heads,
404
+ num_key_value_groups,
405
+ head_dim,
406
+ )
407
+ value_states = value_states.reshape(
408
+ batch_size,
409
+ sequence_length,
410
+ num_key_value_heads * num_key_value_groups,
411
+ head_dim,
412
+ )
413
+
414
+ # Attention here is upcasted to float32 to match the original eager implementation.
415
+
416
+ query_states = query_states.to(dtype=torch.float32)
417
+ key_states = key_states.to(dtype=torch.float32)
418
+
419
+ query_states = query_states.transpose(1, 2)
420
+ key_states = key_states.transpose(1, 2)
421
+
422
+ att_weights = torch.matmul(query_states, key_states.transpose(2, 3))
423
+ att_weights *= head_dim**-0.5
424
+ big_neg = -2.3819763e38 # See gemma/modules.py
425
+
426
+ masked_att_weights = torch.where(
427
+ attention_mask[:, None, :, :], att_weights, big_neg
428
+ )
429
+
430
+ probs = nn.functional.softmax(masked_att_weights, dim=-1)
431
+ probs = probs.to(dtype=value_states.dtype)
432
+
433
+ # probs: batch_size, num_att_heads, sequence_length, sequence_length
434
+ # value_states: batch_size, sequence_length, num_att_heads, head_dim
435
+
436
+ att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3))
437
+
438
+ att_output = att_output.permute(0, 2, 1, 3)
439
+ # we use -1 because sequence length can change
440
+ att_output = att_output.reshape(
441
+ batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim
442
+ )
443
+
444
+ return att_output
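For reference, apply_rope above is self-contained and can be exercised on dummy tensors. A minimal sketch, assuming the module is importable as paligemma_with_expert (the import path is an assumption; the signature matches the function defined above):

import torch
from paligemma_with_expert import apply_rope  # import path assumed

B, L, H, D = 1, 6, 8, 256                  # batch, sequence length, heads, head dim
x = torch.randn(B, L, H, D)
positions = torch.arange(L).unsqueeze(0)   # RoPE positions, shape (B, L)
x_rot = apply_rope(x, positions)
print(x_rot.shape, x_rot.dtype)            # torch.Size([1, 6, 8, 256]) torch.float32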
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<image>"
4
+ ],
5
+ "bos_token": {
6
+ "content": "<bos>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "eos_token": {
13
+ "content": "<eos>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:537cbe6b94581ee7b70f7f39453d5c52f2590069aa75d76ceee458fde442523c
3
+ size 34387383
tokenizer_config.json ADDED
@@ -0,0 +1,1772 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "5": {
46
+ "content": "<2mass>",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "6": {
54
+ "content": "[@BOS@]",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "7": {
62
+ "content": "<unused0>",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "8": {
70
+ "content": "<unused1>",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "9": {
78
+ "content": "<unused2>",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "10": {
86
+ "content": "<unused3>",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "11": {
94
+ "content": "<unused4>",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "12": {
102
+ "content": "<unused5>",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "13": {
110
+ "content": "<unused6>",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "14": {
118
+ "content": "<unused7>",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "15": {
126
+ "content": "<unused8>",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "16": {
134
+ "content": "<unused9>",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "17": {
142
+ "content": "<unused10>",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "18": {
150
+ "content": "<unused11>",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "19": {
158
+ "content": "<unused12>",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "20": {
166
+ "content": "<unused13>",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "21": {
174
+ "content": "<unused14>",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "22": {
182
+ "content": "<unused15>",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "23": {
190
+ "content": "<unused16>",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "24": {
198
+ "content": "<unused17>",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "25": {
206
+ "content": "<unused18>",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "26": {
214
+ "content": "<unused19>",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "27": {
222
+ "content": "<unused20>",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "28": {
230
+ "content": "<unused21>",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "29": {
238
+ "content": "<unused22>",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "30": {
246
+ "content": "<unused23>",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "31": {
254
+ "content": "<unused24>",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "32": {
262
+ "content": "<unused25>",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "33": {
270
+ "content": "<unused26>",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "34": {
278
+ "content": "<unused27>",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "35": {
286
+ "content": "<unused28>",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "36": {
294
+ "content": "<unused29>",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "37": {
302
+ "content": "<unused30>",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "38": {
310
+ "content": "<unused31>",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "39": {
318
+ "content": "<unused32>",
319
+ "lstrip": false,
320
+ "normalized": true,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "40": {
326
+ "content": "<unused33>",
327
+ "lstrip": false,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "41": {
334
+ "content": "<unused34>",
335
+ "lstrip": false,
336
+ "normalized": true,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "42": {
342
+ "content": "<unused35>",
343
+ "lstrip": false,
344
+ "normalized": true,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "43": {
350
+ "content": "<unused36>",
351
+ "lstrip": false,
352
+ "normalized": true,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "44": {
358
+ "content": "<unused37>",
359
+ "lstrip": false,
360
+ "normalized": true,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "45": {
366
+ "content": "<unused38>",
367
+ "lstrip": false,
368
+ "normalized": true,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "46": {
374
+ "content": "<unused39>",
375
+ "lstrip": false,
376
+ "normalized": true,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "47": {
382
+ "content": "<unused40>",
383
+ "lstrip": false,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "48": {
390
+ "content": "<unused41>",
391
+ "lstrip": false,
392
+ "normalized": true,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "49": {
398
+ "content": "<unused42>",
399
+ "lstrip": false,
400
+ "normalized": true,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "50": {
406
+ "content": "<unused43>",
407
+ "lstrip": false,
408
+ "normalized": true,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "51": {
414
+ "content": "<unused44>",
415
+ "lstrip": false,
416
+ "normalized": true,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "52": {
422
+ "content": "<unused45>",
423
+ "lstrip": false,
424
+ "normalized": true,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "53": {
430
+ "content": "<unused46>",
431
+ "lstrip": false,
432
+ "normalized": true,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "54": {
438
+ "content": "<unused47>",
439
+ "lstrip": false,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "55": {
446
+ "content": "<unused48>",
447
+ "lstrip": false,
448
+ "normalized": true,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "56": {
454
+ "content": "<unused49>",
455
+ "lstrip": false,
456
+ "normalized": true,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "57": {
462
+ "content": "<unused50>",
463
+ "lstrip": false,
464
+ "normalized": true,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "58": {
470
+ "content": "<unused51>",
471
+ "lstrip": false,
472
+ "normalized": true,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "59": {
478
+ "content": "<unused52>",
479
+ "lstrip": false,
480
+ "normalized": true,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "60": {
486
+ "content": "<unused53>",
487
+ "lstrip": false,
488
+ "normalized": true,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "61": {
494
+ "content": "<unused54>",
495
+ "lstrip": false,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "62": {
502
+ "content": "<unused55>",
503
+ "lstrip": false,
504
+ "normalized": true,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "63": {
510
+ "content": "<unused56>",
511
+ "lstrip": false,
512
+ "normalized": true,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "64": {
518
+ "content": "<unused57>",
519
+ "lstrip": false,
520
+ "normalized": true,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "65": {
526
+ "content": "<unused58>",
527
+ "lstrip": false,
528
+ "normalized": true,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "66": {
534
+ "content": "<unused59>",
535
+ "lstrip": false,
536
+ "normalized": true,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "67": {
542
+ "content": "<unused60>",
543
+ "lstrip": false,
544
+ "normalized": true,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": false
548
+ },
549
+ "68": {
550
+ "content": "<unused61>",
551
+ "lstrip": false,
552
+ "normalized": true,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": false
556
+ },
557
+ "69": {
558
+ "content": "<unused62>",
559
+ "lstrip": false,
560
+ "normalized": true,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": false
564
+ },
565
+ "70": {
566
+ "content": "<unused63>",
567
+ "lstrip": false,
568
+ "normalized": true,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": false
572
+ },
573
+ "71": {
574
+ "content": "<unused64>",
575
+ "lstrip": false,
576
+ "normalized": true,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": false
580
+ },
581
+ "72": {
582
+ "content": "<unused65>",
583
+ "lstrip": false,
584
+ "normalized": true,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": false
588
+ },
589
+ "73": {
590
+ "content": "<unused66>",
591
+ "lstrip": false,
592
+ "normalized": true,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": false
596
+ },
597
+ "74": {
598
+ "content": "<unused67>",
599
+ "lstrip": false,
600
+ "normalized": true,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": false
604
+ },
605
+ "75": {
606
+ "content": "<unused68>",
607
+ "lstrip": false,
608
+ "normalized": true,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": false
612
+ },
613
+ "76": {
614
+ "content": "<unused69>",
615
+ "lstrip": false,
616
+ "normalized": true,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": false
620
+ },
621
+ "77": {
622
+ "content": "<unused70>",
623
+ "lstrip": false,
624
+ "normalized": true,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": false
628
+ },
629
+ "78": {
630
+ "content": "<unused71>",
631
+ "lstrip": false,
632
+ "normalized": true,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": false
636
+ },
637
+ "79": {
638
+ "content": "<unused72>",
639
+ "lstrip": false,
640
+ "normalized": true,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": false
644
+ },
645
+ "80": {
646
+ "content": "<unused73>",
647
+ "lstrip": false,
648
+ "normalized": true,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": false
652
+ },
653
+ "81": {
654
+ "content": "<unused74>",
655
+ "lstrip": false,
656
+ "normalized": true,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": false
660
+ },
661
+ "82": {
662
+ "content": "<unused75>",
663
+ "lstrip": false,
664
+ "normalized": true,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": false
668
+ },
669
+ "83": {
670
+ "content": "<unused76>",
671
+ "lstrip": false,
672
+ "normalized": true,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": false
676
+ },
677
+ "84": {
678
+ "content": "<unused77>",
679
+ "lstrip": false,
680
+ "normalized": true,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": false
684
+ },
685
+ "85": {
686
+ "content": "<unused78>",
687
+ "lstrip": false,
688
+ "normalized": true,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": false
692
+ },
693
+ "86": {
694
+ "content": "<unused79>",
695
+ "lstrip": false,
696
+ "normalized": true,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": false
700
+ },
701
+ "87": {
702
+ "content": "<unused80>",
703
+ "lstrip": false,
704
+ "normalized": true,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": false
708
+ },
709
+ "88": {
710
+ "content": "<unused81>",
711
+ "lstrip": false,
712
+ "normalized": true,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": false
716
+ },
717
+ "89": {
718
+ "content": "<unused82>",
719
+ "lstrip": false,
720
+ "normalized": true,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": false
724
+ },
725
+ "90": {
726
+ "content": "<unused83>",
727
+ "lstrip": false,
728
+ "normalized": true,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": false
732
+ },
733
+ "91": {
734
+ "content": "<unused84>",
735
+ "lstrip": false,
736
+ "normalized": true,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": false
740
+ },
741
+ "92": {
742
+ "content": "<unused85>",
743
+ "lstrip": false,
744
+ "normalized": true,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": false
748
+ },
749
+ "93": {
750
+ "content": "<unused86>",
751
+ "lstrip": false,
752
+ "normalized": true,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": false
756
+ },
757
+ "94": {
758
+ "content": "<unused87>",
759
+ "lstrip": false,
760
+ "normalized": true,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": false
764
+ },
765
+ "95": {
766
+ "content": "<unused88>",
767
+ "lstrip": false,
768
+ "normalized": true,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": false
772
+ },
773
+ "96": {
774
+ "content": "<unused89>",
775
+ "lstrip": false,
776
+ "normalized": true,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": false
780
+ },
781
+ "97": {
782
+ "content": "<unused90>",
783
+ "lstrip": false,
784
+ "normalized": true,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": false
788
+ },
789
+ "98": {
790
+ "content": "<unused91>",
791
+ "lstrip": false,
792
+ "normalized": true,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": false
796
+ },
797
+ "99": {
798
+ "content": "<unused92>",
799
+ "lstrip": false,
800
+ "normalized": true,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": false
804
+ },
805
+ "100": {
806
+ "content": "<unused93>",
807
+ "lstrip": false,
808
+ "normalized": true,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": false
812
+ },
813
+ "101": {
814
+ "content": "<unused94>",
815
+ "lstrip": false,
816
+ "normalized": true,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": false
820
+ },
821
+ "102": {
822
+ "content": "<unused95>",
823
+ "lstrip": false,
824
+ "normalized": true,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": false
828
+ },
829
+ "103": {
830
+ "content": "<unused96>",
831
+ "lstrip": false,
832
+ "normalized": true,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": false
836
+ },
837
+ "104": {
838
+ "content": "<unused97>",
839
+ "lstrip": false,
840
+ "normalized": true,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": false
844
+ },
845
+ "105": {
846
+ "content": "<unused98>",
847
+ "lstrip": false,
848
+ "normalized": true,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": false
852
+ },
853
+ "106": {
854
+ "content": "<start_of_turn>",
855
+ "lstrip": false,
856
+ "normalized": true,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": false
860
+ },
861
+ "107": {
862
+ "content": "<end_of_turn>",
863
+ "lstrip": false,
864
+ "normalized": true,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": false
868
+ },
869
+ "108": {
870
+ "content": "\n",
871
+ "lstrip": false,
872
+ "normalized": true,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": false
876
+ },
877
+ "109": {
878
+ "content": "\n\n",
879
+ "lstrip": false,
880
+ "normalized": true,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": false
884
+ },
885
+ "110": {
886
+ "content": "\n\n\n",
887
+ "lstrip": false,
888
+ "normalized": true,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": false
892
+ },
893
+ "111": {
894
+ "content": "\n\n\n\n",
895
+ "lstrip": false,
896
+ "normalized": true,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": false
900
+ },
901
+ "112": {
902
+ "content": "\n\n\n\n\n",
903
+ "lstrip": false,
904
+ "normalized": true,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": false
908
+ },
909
+ "113": {
910
+ "content": "\n\n\n\n\n\n",
911
+ "lstrip": false,
912
+ "normalized": true,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": false
916
+ },
917
+ "114": {
918
+ "content": "\n\n\n\n\n\n\n",
919
+ "lstrip": false,
920
+ "normalized": true,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": false
924
+ },
925
+ "115": {
926
+ "content": "\n\n\n\n\n\n\n\n",
927
+ "lstrip": false,
928
+ "normalized": true,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": false
932
+ },
933
+ "116": {
934
+ "content": "\n\n\n\n\n\n\n\n\n",
935
+ "lstrip": false,
936
+ "normalized": true,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": false
940
+ },
941
+ "117": {
942
+ "content": "\n\n\n\n\n\n\n\n\n\n",
943
+ "lstrip": false,
944
+ "normalized": true,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": false
948
+ },
949
+ "118": {
950
+ "content": "\n\n\n\n\n\n\n\n\n\n\n",
951
+ "lstrip": false,
952
+ "normalized": true,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": false
956
+ },
957
+ "119": {
958
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n",
959
+ "lstrip": false,
960
+ "normalized": true,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": false
964
+ },
965
+ "120": {
966
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n",
967
+ "lstrip": false,
968
+ "normalized": true,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": false
972
+ },
973
+ "121": {
974
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
975
+ "lstrip": false,
976
+ "normalized": true,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": false
980
+ },
981
+ "122": {
982
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
983
+ "lstrip": false,
984
+ "normalized": true,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": false
988
+ },
989
+ "123": {
990
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
991
+ "lstrip": false,
992
+ "normalized": true,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": false
996
+ },
997
+ "124": {
998
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
999
+ "lstrip": false,
1000
+ "normalized": true,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": false
1004
+ },
1005
+ "125": {
1006
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1007
+ "lstrip": false,
1008
+ "normalized": true,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": false
1012
+ },
1013
+ "126": {
1014
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1015
+ "lstrip": false,
1016
+ "normalized": true,
1017
+ "rstrip": false,
1018
+ "single_word": false,
1019
+ "special": false
1020
+ },
1021
+ "127": {
1022
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1023
+ "lstrip": false,
1024
+ "normalized": true,
1025
+ "rstrip": false,
1026
+ "single_word": false,
1027
+ "special": false
1028
+ },
1029
+ "128": {
1030
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1031
+ "lstrip": false,
1032
+ "normalized": true,
1033
+ "rstrip": false,
1034
+ "single_word": false,
1035
+ "special": false
1036
+ },
1037
+ "129": {
1038
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1039
+ "lstrip": false,
1040
+ "normalized": true,
1041
+ "rstrip": false,
1042
+ "single_word": false,
1043
+ "special": false
1044
+ },
1045
+ "130": {
1046
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1047
+ "lstrip": false,
1048
+ "normalized": true,
1049
+ "rstrip": false,
1050
+ "single_word": false,
1051
+ "special": false
1052
+ },
1053
+ "131": {
1054
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1055
+ "lstrip": false,
1056
+ "normalized": true,
1057
+ "rstrip": false,
1058
+ "single_word": false,
1059
+ "special": false
1060
+ },
1061
+ "132": {
1062
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1063
+ "lstrip": false,
1064
+ "normalized": true,
1065
+ "rstrip": false,
1066
+ "single_word": false,
1067
+ "special": false
1068
+ },
1069
+ "133": {
1070
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1071
+ "lstrip": false,
1072
+ "normalized": true,
1073
+ "rstrip": false,
1074
+ "single_word": false,
1075
+ "special": false
1076
+ },
1077
+ "134": {
1078
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1079
+ "lstrip": false,
1080
+ "normalized": true,
1081
+ "rstrip": false,
1082
+ "single_word": false,
1083
+ "special": false
1084
+ },
1085
+ "135": {
1086
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1087
+ "lstrip": false,
1088
+ "normalized": true,
1089
+ "rstrip": false,
1090
+ "single_word": false,
1091
+ "special": false
1092
+ },
1093
+ "136": {
1094
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1095
+ "lstrip": false,
1096
+ "normalized": true,
1097
+ "rstrip": false,
1098
+ "single_word": false,
1099
+ "special": false
1100
+ },
1101
+ "137": {
1102
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1103
+ "lstrip": false,
1104
+ "normalized": true,
1105
+ "rstrip": false,
1106
+ "single_word": false,
1107
+ "special": false
1108
+ },
1109
+ "138": {
1110
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1111
+ "lstrip": false,
1112
+ "normalized": true,
1113
+ "rstrip": false,
1114
+ "single_word": false,
1115
+ "special": false
1116
+ },
1117
+ "139": {
1118
+ "content": "▁▁",
1119
+ "lstrip": false,
1120
+ "normalized": true,
1121
+ "rstrip": false,
1122
+ "single_word": false,
1123
+ "special": false
1124
+ },
1125
+ "140": {
1126
+ "content": "▁▁▁",
1127
+ "lstrip": false,
1128
+ "normalized": true,
1129
+ "rstrip": false,
1130
+ "single_word": false,
1131
+ "special": false
1132
+ },
1133
+ "141": {
1134
+ "content": "▁▁▁▁",
1135
+ "lstrip": false,
1136
+ "normalized": true,
1137
+ "rstrip": false,
1138
+ "single_word": false,
1139
+ "special": false
1140
+ },
1141
+ "142": {
1142
+ "content": "▁▁▁▁▁",
1143
+ "lstrip": false,
1144
+ "normalized": true,
1145
+ "rstrip": false,
1146
+ "single_word": false,
1147
+ "special": false
1148
+ },
1149
+ "143": {
1150
+ "content": "▁▁▁▁▁▁",
1151
+ "lstrip": false,
1152
+ "normalized": true,
1153
+ "rstrip": false,
1154
+ "single_word": false,
1155
+ "special": false
1156
+ },
1157
+ "144": {
1158
+ "content": "▁▁▁▁▁▁▁",
1159
+ "lstrip": false,
1160
+ "normalized": true,
1161
+ "rstrip": false,
1162
+ "single_word": false,
1163
+ "special": false
1164
+ },
1165
+ "145": {
1166
+ "content": "▁▁▁▁▁▁▁▁",
1167
+ "lstrip": false,
1168
+ "normalized": true,
1169
+ "rstrip": false,
1170
+ "single_word": false,
1171
+ "special": false
1172
+ },
1173
+ "146": {
1174
+ "content": "▁▁▁▁▁▁▁▁▁",
1175
+ "lstrip": false,
1176
+ "normalized": true,
1177
+ "rstrip": false,
1178
+ "single_word": false,
1179
+ "special": false
1180
+ },
1181
+ "147": {
1182
+ "content": "▁▁▁▁▁▁▁▁▁▁",
1183
+ "lstrip": false,
1184
+ "normalized": true,
1185
+ "rstrip": false,
1186
+ "single_word": false,
1187
+ "special": false
1188
+ },
1189
+ "148": {
1190
+ "content": "▁▁▁▁▁▁▁▁▁▁▁",
1191
+ "lstrip": false,
1192
+ "normalized": true,
1193
+ "rstrip": false,
1194
+ "single_word": false,
1195
+ "special": false
1196
+ },
1197
+ "149": {
1198
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁",
1199
+ "lstrip": false,
1200
+ "normalized": true,
1201
+ "rstrip": false,
1202
+ "single_word": false,
1203
+ "special": false
1204
+ },
1205
+ "150": {
1206
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁",
1207
+ "lstrip": false,
1208
+ "normalized": true,
1209
+ "rstrip": false,
1210
+ "single_word": false,
1211
+ "special": false
1212
+ },
1213
+ "151": {
1214
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1215
+ "lstrip": false,
1216
+ "normalized": true,
1217
+ "rstrip": false,
1218
+ "single_word": false,
1219
+ "special": false
1220
+ },
1221
+ "152": {
1222
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1223
+ "lstrip": false,
1224
+ "normalized": true,
1225
+ "rstrip": false,
1226
+ "single_word": false,
1227
+ "special": false
1228
+ },
1229
+ "153": {
1230
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1231
+ "lstrip": false,
1232
+ "normalized": true,
1233
+ "rstrip": false,
1234
+ "single_word": false,
1235
+ "special": false
1236
+ },
1237
+ "154": {
1238
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1239
+ "lstrip": false,
1240
+ "normalized": true,
1241
+ "rstrip": false,
1242
+ "single_word": false,
1243
+ "special": false
1244
+ },
1245
+ "155": {
1246
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1247
+ "lstrip": false,
1248
+ "normalized": true,
1249
+ "rstrip": false,
1250
+ "single_word": false,
1251
+ "special": false
1252
+ },
1253
+ "156": {
1254
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1255
+ "lstrip": false,
1256
+ "normalized": true,
1257
+ "rstrip": false,
1258
+ "single_word": false,
1259
+ "special": false
1260
+ },
1261
+ "157": {
1262
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1263
+ "lstrip": false,
1264
+ "normalized": true,
1265
+ "rstrip": false,
1266
+ "single_word": false,
1267
+ "special": false
1268
+ },
1269
+ "158": {
1270
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1271
+ "lstrip": false,
1272
+ "normalized": true,
1273
+ "rstrip": false,
1274
+ "single_word": false,
1275
+ "special": false
1276
+ },
1277
+ "159": {
1278
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1279
+ "lstrip": false,
1280
+ "normalized": true,
1281
+ "rstrip": false,
1282
+ "single_word": false,
1283
+ "special": false
1284
+ },
1285
+ "160": {
1286
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1287
+ "lstrip": false,
1288
+ "normalized": true,
1289
+ "rstrip": false,
1290
+ "single_word": false,
1291
+ "special": false
1292
+ },
1293
+ "161": {
1294
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1295
+ "lstrip": false,
1296
+ "normalized": true,
1297
+ "rstrip": false,
1298
+ "single_word": false,
1299
+ "special": false
1300
+ },
1301
+ "162": {
1302
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1303
+ "lstrip": false,
1304
+ "normalized": true,
1305
+ "rstrip": false,
1306
+ "single_word": false,
1307
+ "special": false
1308
+ },
1309
+ "163": {
1310
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1311
+ "lstrip": false,
1312
+ "normalized": true,
1313
+ "rstrip": false,
1314
+ "single_word": false,
1315
+ "special": false
1316
+ },
1317
+ "164": {
1318
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1319
+ "lstrip": false,
1320
+ "normalized": true,
1321
+ "rstrip": false,
1322
+ "single_word": false,
1323
+ "special": false
1324
+ },
1325
+ "165": {
1326
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1327
+ "lstrip": false,
1328
+ "normalized": true,
1329
+ "rstrip": false,
1330
+ "single_word": false,
1331
+ "special": false
1332
+ },
1333
+ "166": {
1334
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1335
+ "lstrip": false,
1336
+ "normalized": true,
1337
+ "rstrip": false,
1338
+ "single_word": false,
1339
+ "special": false
1340
+ },
1341
+ "167": {
1342
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1343
+ "lstrip": false,
1344
+ "normalized": true,
1345
+ "rstrip": false,
1346
+ "single_word": false,
1347
+ "special": false
1348
+ },
1349
+ "168": {
1350
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1351
+ "lstrip": false,
1352
+ "normalized": true,
1353
+ "rstrip": false,
1354
+ "single_word": false,
1355
+ "special": false
1356
+ },
1357
+ "169": {
1358
+ "content": "<table>",
1359
+ "lstrip": false,
1360
+ "normalized": true,
1361
+ "rstrip": false,
1362
+ "single_word": false,
1363
+ "special": false
1364
+ },
1365
+ "170": {
1366
+ "content": "<caption>",
1367
+ "lstrip": false,
1368
+ "normalized": true,
1369
+ "rstrip": false,
1370
+ "single_word": false,
1371
+ "special": false
1372
+ },
1373
+ "171": {
1374
+ "content": "<thead>",
1375
+ "lstrip": false,
1376
+ "normalized": true,
1377
+ "rstrip": false,
1378
+ "single_word": false,
1379
+ "special": false
1380
+ },
1381
+ "172": {
1382
+ "content": "<tbody>",
1383
+ "lstrip": false,
1384
+ "normalized": true,
1385
+ "rstrip": false,
1386
+ "single_word": false,
1387
+ "special": false
1388
+ },
1389
+ "173": {
1390
+ "content": "<tfoot>",
1391
+ "lstrip": false,
1392
+ "normalized": true,
1393
+ "rstrip": false,
1394
+ "single_word": false,
1395
+ "special": false
1396
+ },
1397
+ "174": {
1398
+ "content": "<tr>",
1399
+ "lstrip": false,
1400
+ "normalized": true,
1401
+ "rstrip": false,
1402
+ "single_word": false,
1403
+ "special": false
1404
+ },
1405
+ "175": {
1406
+ "content": "<th>",
1407
+ "lstrip": false,
1408
+ "normalized": true,
1409
+ "rstrip": false,
1410
+ "single_word": false,
1411
+ "special": false
1412
+ },
1413
+ "176": {
1414
+ "content": "<td>",
1415
+ "lstrip": false,
1416
+ "normalized": true,
1417
+ "rstrip": false,
1418
+ "single_word": false,
1419
+ "special": false
1420
+ },
1421
+ "177": {
1422
+ "content": "</table>",
1423
+ "lstrip": false,
1424
+ "normalized": true,
1425
+ "rstrip": false,
1426
+ "single_word": false,
1427
+ "special": false
1428
+ },
1429
+ "178": {
1430
+ "content": "</caption>",
1431
+ "lstrip": false,
1432
+ "normalized": true,
1433
+ "rstrip": false,
1434
+ "single_word": false,
1435
+ "special": false
1436
+ },
1437
+ "179": {
1438
+ "content": "</thead>",
1439
+ "lstrip": false,
1440
+ "normalized": true,
1441
+ "rstrip": false,
1442
+ "single_word": false,
1443
+ "special": false
1444
+ },
1445
+ "180": {
1446
+ "content": "</tbody>",
1447
+ "lstrip": false,
1448
+ "normalized": true,
1449
+ "rstrip": false,
1450
+ "single_word": false,
1451
+ "special": false
1452
+ },
1453
+ "181": {
1454
+ "content": "</tfoot>",
1455
+ "lstrip": false,
1456
+ "normalized": true,
1457
+ "rstrip": false,
1458
+ "single_word": false,
1459
+ "special": false
1460
+ },
1461
+ "182": {
1462
+ "content": "</tr>",
1463
+ "lstrip": false,
1464
+ "normalized": true,
1465
+ "rstrip": false,
1466
+ "single_word": false,
1467
+ "special": false
1468
+ },
1469
+ "183": {
1470
+ "content": "</th>",
1471
+ "lstrip": false,
1472
+ "normalized": true,
1473
+ "rstrip": false,
1474
+ "single_word": false,
1475
+ "special": false
1476
+ },
1477
+ "184": {
1478
+ "content": "</td>",
1479
+ "lstrip": false,
1480
+ "normalized": true,
1481
+ "rstrip": false,
1482
+ "single_word": false,
1483
+ "special": false
1484
+ },
1485
+ "185": {
1486
+ "content": "<h1>",
1487
+ "lstrip": false,
1488
+ "normalized": true,
1489
+ "rstrip": false,
1490
+ "single_word": false,
1491
+ "special": false
1492
+ },
1493
+ "186": {
1494
+ "content": "<h2>",
1495
+ "lstrip": false,
1496
+ "normalized": true,
1497
+ "rstrip": false,
1498
+ "single_word": false,
1499
+ "special": false
1500
+ },
1501
+ "187": {
1502
+ "content": "<h3>",
1503
+ "lstrip": false,
1504
+ "normalized": true,
1505
+ "rstrip": false,
1506
+ "single_word": false,
1507
+ "special": false
1508
+ },
1509
+ "188": {
1510
+ "content": "<h4>",
1511
+ "lstrip": false,
1512
+ "normalized": true,
1513
+ "rstrip": false,
1514
+ "single_word": false,
1515
+ "special": false
1516
+ },
1517
+ "189": {
1518
+ "content": "<h5>",
1519
+ "lstrip": false,
1520
+ "normalized": true,
1521
+ "rstrip": false,
1522
+ "single_word": false,
1523
+ "special": false
1524
+ },
1525
+ "190": {
1526
+ "content": "<h6>",
1527
+ "lstrip": false,
1528
+ "normalized": true,
1529
+ "rstrip": false,
1530
+ "single_word": false,
1531
+ "special": false
1532
+ },
1533
+ "191": {
1534
+ "content": "<blockquote>",
1535
+ "lstrip": false,
1536
+ "normalized": true,
1537
+ "rstrip": false,
1538
+ "single_word": false,
1539
+ "special": false
1540
+ },
1541
+ "192": {
1542
+ "content": "</h1>",
1543
+ "lstrip": false,
1544
+ "normalized": true,
1545
+ "rstrip": false,
1546
+ "single_word": false,
1547
+ "special": false
1548
+ },
1549
+ "193": {
1550
+ "content": "</h2>",
1551
+ "lstrip": false,
1552
+ "normalized": true,
1553
+ "rstrip": false,
1554
+ "single_word": false,
1555
+ "special": false
1556
+ },
1557
+ "194": {
1558
+ "content": "</h3>",
1559
+ "lstrip": false,
1560
+ "normalized": true,
1561
+ "rstrip": false,
1562
+ "single_word": false,
1563
+ "special": false
1564
+ },
1565
+ "195": {
1566
+ "content": "</h4>",
1567
+ "lstrip": false,
1568
+ "normalized": true,
1569
+ "rstrip": false,
1570
+ "single_word": false,
1571
+ "special": false
1572
+ },
1573
+ "196": {
1574
+ "content": "</h5>",
1575
+ "lstrip": false,
1576
+ "normalized": true,
1577
+ "rstrip": false,
1578
+ "single_word": false,
1579
+ "special": false
1580
+ },
1581
+ "197": {
1582
+ "content": "</h6>",
1583
+ "lstrip": false,
1584
+ "normalized": true,
1585
+ "rstrip": false,
1586
+ "single_word": false,
1587
+ "special": false
1588
+ },
1589
+ "198": {
1590
+ "content": "</blockquote>",
1591
+ "lstrip": false,
1592
+ "normalized": true,
1593
+ "rstrip": false,
1594
+ "single_word": false,
1595
+ "special": false
1596
+ },
1597
+ "199": {
1598
+ "content": "<strong>",
1599
+ "lstrip": false,
1600
+ "normalized": true,
1601
+ "rstrip": false,
1602
+ "single_word": false,
1603
+ "special": false
1604
+ },
1605
+ "200": {
1606
+ "content": "<em>",
1607
+ "lstrip": false,
1608
+ "normalized": true,
1609
+ "rstrip": false,
1610
+ "single_word": false,
1611
+ "special": false
1612
+ },
1613
+ "201": {
1614
+ "content": "<b>",
1615
+ "lstrip": false,
1616
+ "normalized": true,
1617
+ "rstrip": false,
1618
+ "single_word": false,
1619
+ "special": false
1620
+ },
1621
+ "202": {
1622
+ "content": "<i>",
1623
+ "lstrip": false,
1624
+ "normalized": true,
1625
+ "rstrip": false,
1626
+ "single_word": false,
1627
+ "special": false
1628
+ },
1629
+ "203": {
1630
+ "content": "<u>",
1631
+ "lstrip": false,
1632
+ "normalized": true,
1633
+ "rstrip": false,
1634
+ "single_word": false,
1635
+ "special": false
1636
+ },
1637
+ "204": {
1638
+ "content": "<s>",
1639
+ "lstrip": false,
1640
+ "normalized": true,
1641
+ "rstrip": false,
1642
+ "single_word": false,
1643
+ "special": false
1644
+ },
1645
+ "205": {
1646
+ "content": "<sub>",
1647
+ "lstrip": false,
1648
+ "normalized": true,
1649
+ "rstrip": false,
1650
+ "single_word": false,
1651
+ "special": false
1652
+ },
1653
+ "206": {
1654
+ "content": "<sup>",
1655
+ "lstrip": false,
1656
+ "normalized": true,
1657
+ "rstrip": false,
1658
+ "single_word": false,
1659
+ "special": false
1660
+ },
1661
+ "207": {
1662
+ "content": "<code>",
1663
+ "lstrip": false,
1664
+ "normalized": true,
1665
+ "rstrip": false,
1666
+ "single_word": false,
1667
+ "special": false
1668
+ },
1669
+ "208": {
1670
+ "content": "</strong>",
1671
+ "lstrip": false,
1672
+ "normalized": true,
1673
+ "rstrip": false,
1674
+ "single_word": false,
1675
+ "special": false
1676
+ },
1677
+ "209": {
1678
+ "content": "</em>",
1679
+ "lstrip": false,
1680
+ "normalized": true,
1681
+ "rstrip": false,
1682
+ "single_word": false,
1683
+ "special": false
1684
+ },
1685
+ "210": {
1686
+ "content": "</b>",
1687
+ "lstrip": false,
1688
+ "normalized": true,
1689
+ "rstrip": false,
1690
+ "single_word": false,
1691
+ "special": false
1692
+ },
1693
+ "211": {
1694
+ "content": "</i>",
1695
+ "lstrip": false,
1696
+ "normalized": true,
1697
+ "rstrip": false,
1698
+ "single_word": false,
1699
+ "special": false
1700
+ },
1701
+ "212": {
1702
+ "content": "</u>",
1703
+ "lstrip": false,
1704
+ "normalized": true,
1705
+ "rstrip": false,
1706
+ "single_word": false,
1707
+ "special": false
1708
+ },
1709
+ "213": {
1710
+ "content": "</s>",
1711
+ "lstrip": false,
1712
+ "normalized": true,
1713
+ "rstrip": false,
1714
+ "single_word": false,
1715
+ "special": false
1716
+ },
1717
+ "214": {
1718
+ "content": "</sub>",
1719
+ "lstrip": false,
1720
+ "normalized": true,
1721
+ "rstrip": false,
1722
+ "single_word": false,
1723
+ "special": false
1724
+ },
1725
+ "215": {
1726
+ "content": "</sup>",
1727
+ "lstrip": false,
1728
+ "normalized": true,
1729
+ "rstrip": false,
1730
+ "single_word": false,
1731
+ "special": false
1732
+ },
1733
+ "216": {
1734
+ "content": "</code>",
1735
+ "lstrip": false,
1736
+ "normalized": true,
1737
+ "rstrip": false,
1738
+ "single_word": false,
1739
+ "special": false
1740
+ },
1741
+ "257152": {
1742
+ "content": "<image>",
1743
+ "lstrip": false,
1744
+ "normalized": false,
1745
+ "rstrip": false,
1746
+ "single_word": false,
1747
+ "special": true
1748
+ }
1749
+ },
1750
+ "additional_special_tokens": [
1751
+ "<image>"
1752
+ ],
1753
+ "bos_token": "<bos>",
1754
+ "clean_up_tokenization_spaces": false,
1755
+ "eos_token": "<eos>",
1756
+ "extra_special_tokens": {},
1757
+ "max_length": 48,
1758
+ "model_max_length": 1000000000000000019884624838656,
1759
+ "pad_to_multiple_of": null,
1760
+ "pad_token": "<pad>",
1761
+ "pad_token_type_id": 0,
1762
+ "padding_side": "right",
1763
+ "processor_class": "PaliGemmaProcessor",
1764
+ "sp_model_kwargs": {},
1765
+ "spaces_between_special_tokens": false,
1766
+ "stride": 0,
1767
+ "tokenizer_class": "GemmaTokenizer",
1768
+ "truncation_side": "right",
1769
+ "truncation_strategy": "longest_first",
1770
+ "unk_token": "<unk>",
1771
+ "use_default_system_prompt": false
1772
+ }
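The tokenizer_config.json above registers the HTML-style markup tags (<h1> through </code>) as ordinary, normalized tokens and "<image>" (id 257152) as the only extra special token, on top of a GemmaTokenizer with right padding and max_length 48. A minimal loading sketch, assuming the full tokenizer files from this upload sit in a local directory (the path below is hypothetical):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./hume-checkpoint")  # hypothetical local path
ids = tokenizer(
    "<image> place the cup on the plate",
    max_length=48,
    truncation=True,
)
# "<image>" maps to the single special id 257152; the HTML-style tags are
# ordinary tokens and are normalized like any other text.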
value_query.py ADDED
@@ -0,0 +1,1155 @@
1
+ import math
2
+ from copy import deepcopy
3
+ from functools import partial
4
+ from typing import Callable, Optional, Sequence, Tuple, Union
5
+
6
+ import array_typing as at
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ from beartype import beartype as typechecker
11
+ from jaxtyping import Float, jaxtyped
12
+ from torch.distributions import Independent, Normal, TransformedDistribution
13
+ from torch.distributions.transforms import (
14
+ AffineTransform,
15
+ ComposeTransform,
16
+ TanhTransform,
17
+ )
18
+ from torch.optim import Adam, AdamW, Optimizer
19
+ from torch.optim.lr_scheduler import (
20
+ LambdaLR,
21
+ )
22
+ from transformers import (
23
+ AutoConfig,
24
+ GemmaForCausalLM,
25
+ PretrainedConfig,
26
+ PreTrainedModel,
27
+ )
28
+ from transformers.models.auto import CONFIG_MAPPING
29
+
30
+
31
+ def extend_and_repeat(tensor: torch.Tensor, dim: int, repeat: int) -> torch.Tensor:
32
+ return tensor.unsqueeze(dim).repeat_interleave(repeat, dim=dim)
33
+
34
+
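A quick shape check for extend_and_repeat (illustrative only; assumes it is run with this module imported): it inserts a new axis at `dim` and repeats the tensor along it.

import torch

x = torch.zeros(4, 7)                      # (batch, action_dim)
y = extend_and_repeat(x, dim=1, repeat=3)  # unsqueeze then repeat_interleave
assert y.shape == (4, 3, 7)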
35
+ def init_module_weights(module: torch.nn.Module, orthogonal_init: bool = False):
36
+ if isinstance(module, nn.Linear):
37
+ if orthogonal_init:
38
+ nn.init.orthogonal_(module.weight, gain=np.sqrt(2))
39
+ nn.init.constant_(module.bias, 0.0)
40
+ else:
41
+ nn.init.xavier_uniform_(module.weight, gain=1e-2)
42
+
43
+
44
+ class VQHBackboneConfig(PretrainedConfig):
45
+ model_type = "VQHBackbone"
46
+ sub_configs = {"gemma_expert_config": AutoConfig}
47
+
48
+ def __init__(
49
+ self,
50
+ gemma_expert_config: dict | None = None,
51
+ attention_implementation: str = "eager",
52
+ **kwargs,
53
+ ):
54
+ self.attention_implementation = attention_implementation
55
+
56
+ if gemma_expert_config is None:
57
+ self.gemma_expert_config = CONFIG_MAPPING["gemma"](
58
+ attention_bias=False,
59
+ attention_dropout=0.0,
60
+ bos_token_id=2,
61
+ eos_token_id=1,
62
+ head_dim=256,
63
+ hidden_act="gelu_pytorch_tanh",
64
+ hidden_activation="gelu_pytorch_tanh",
65
+ hidden_size=2048,
66
+ initializer_range=0.02,
67
+ intermediate_size=4096,
68
+ max_position_embeddings=8192,
69
+ model_type="gemma",
70
+ num_attention_heads=8,
71
+ num_hidden_layers=4,
72
+ num_key_value_heads=1,
73
+ pad_token_id=0,
74
+ rms_norm_eps=1e-06,
75
+ rope_theta=10000.0,
76
+ torch_dtype="float32",
77
+ transformers_version="4.48.1",
78
+ use_cache=True,
79
+ vocab_size=257152,
80
+ )
81
+ elif isinstance(gemma_expert_config, dict):
82
+ if "model_type" not in gemma_expert_config:
83
+ gemma_expert_config["model_type"] = "gemma"
84
+ cfg_cls = CONFIG_MAPPING[gemma_expert_config["model_type"]]
85
+ self.gemma_expert_config = cfg_cls(**gemma_expert_config)
86
+
87
+ super().__init__(**kwargs)
88
+
89
+ def __post_init__(self):
90
+ super().__post_init__()
91
+ if self.attention_implementation not in ["eager", "fa2", "flex"]:
92
+ raise ValueError(
93
+ f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'."
94
+ )
95
+
96
+
97
+ def apply_rope(x, positions, max_wavelength=10_000):
98
+ """
99
+ Applies RoPE positions [B, L] to x [B, L, H, D].
100
+ """
101
+ d_half = x.shape[-1] // 2
102
+ device = x.device
103
+ dtype = x.dtype
104
+ x = x.to(torch.float32)
105
+
106
+ freq_exponents = (2.0 / x.shape[-1]) * torch.arange(
107
+ d_half, dtype=torch.float32, device=device
108
+ )
109
+ timescale = max_wavelength**freq_exponents
110
+ radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(
111
+ torch.float32
112
+ )
113
+
114
+ radians = radians[..., None, :]
115
+
116
+ sin = torch.sin(radians) # .to(dtype=dtype)
117
+ cos = torch.cos(radians) # .to(dtype=dtype)
118
+
119
+ x1, x2 = x.split(d_half, dim=-1)
120
+ res = torch.empty_like(x)
121
+ res[..., :d_half] = x1 * cos - x2 * sin
122
+ res[..., d_half:] = x2 * cos + x1 * sin
123
+
124
+ return res.to(dtype)
125
+
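A shape-only usage sketch for apply_rope (values are arbitrary placeholders; assumes the function is called from this module):

import torch

B, L, H, D = 2, 10, 8, 256
x = torch.randn(B, L, H, D)               # query or key states [B, L, H, D]
positions = torch.arange(L).expand(B, L)  # RoPE position ids [B, L]
out = apply_rope(x, positions)
assert out.shape == x.shape and out.dtype == x.dtype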
126
+
127
+ class VQHBackbone(PreTrainedModel):
128
+ config_class = VQHBackboneConfig
129
+
130
+ def __init__(self, config: VQHBackboneConfig):
131
+ super().__init__(config=config)
132
+ self.config = config
133
+ self.gemma_expert = GemmaForCausalLM(config=config.gemma_expert_config)
134
+
135
+ self.to_bfloat16_like_physical_intelligence()
136
+
137
+ def train(self, mode: bool = True):
138
+ super().train(mode)
139
+
140
+ def to_bfloat16_like_physical_intelligence(self):
141
+ params_to_change_dtype = [
142
+ "language_model.model.layers",
143
+ "gemma_expert.model.layers",
144
+ ]
145
+ for name, param in self.named_parameters():
146
+ if any(selector in name for selector in params_to_change_dtype):
147
+ param.data = param.data.to(dtype=torch.bfloat16)
148
+
149
+ def forward(
150
+ self,
151
+ attention_mask: Optional[torch.Tensor] = None,
152
+ position_ids: Optional[torch.LongTensor] = None,
153
+ inputs_embeds: Optional[torch.FloatTensor] = None,
154
+ ):
155
+ # RMSNorm
156
+ head_dim = self.gemma_expert.config.head_dim
157
+
158
+ hidden_states = inputs_embeds
159
+ batch_size = hidden_states.shape[0]
160
+ for layer in self.gemma_expert.model.layers[
161
+ : self.gemma_expert.config.num_hidden_layers
162
+ ]:
163
+ # normalizer = torch.tensor(model.config.hidden_size**0.5, dtype=hidden_states.dtype)
164
+ # hidden_states = hidden_states * normalizer
165
+ hidden_states = layer.input_layernorm(hidden_states)
166
+ input_shape = hidden_states.shape[:-1]
167
+ hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
168
+
169
+ # self attention
170
+ hidden_states = hidden_states.to(dtype=torch.bfloat16)
171
+ query_states = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
172
+ key_states = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
173
+ value_states = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
174
+
175
+ query_states = apply_rope(query_states, position_ids)
176
+ key_states = apply_rope(key_states, position_ids)
177
+
178
+ attention_interface = self.get_attention_interface()
179
+ att_output = attention_interface(
180
+ attention_mask,
181
+ batch_size,
182
+ head_dim,
183
+ query_states,
184
+ key_states,
185
+ value_states,
186
+ )
187
+
188
+ if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
189
+ att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
190
+
191
+ out_emb = layer.self_attn.o_proj(att_output)
192
+
193
+ # first residual
194
+ out_emb += hidden_states
195
+ after_first_residual = out_emb.clone()
196
+ out_emb = layer.post_attention_layernorm(out_emb)
197
+ out_emb = layer.mlp(out_emb)
198
+ # second residual
199
+ out_emb += after_first_residual
200
+ hidden_states = out_emb
201
+
202
+ # final norm
203
+ hidden_states = self.gemma_expert.model.norm(hidden_states)
204
+
205
+ return hidden_states
206
+
207
+ def get_attention_interface(self):
208
+ if self.config.attention_implementation == "fa2":
209
+ attention_interface = self.flash_attention_forward
210
+ else:
211
+ attention_interface = self.eager_attention_forward
212
+ return attention_interface
213
+
214
+ def eager_attention_forward(
215
+ self,
216
+ attention_mask,
217
+ batch_size,
218
+ head_dim,
219
+ query_states,
220
+ key_states,
221
+ value_states,
222
+ ):
223
+ num_att_heads = self.config.gemma_expert_config.num_attention_heads
224
+ num_key_value_heads = self.config.gemma_expert_config.num_key_value_heads
225
+ num_key_value_groups = num_att_heads // num_key_value_heads
226
+
227
+ # query_states: batch_size, sequence_length, num_att_head, head_dim
228
+ # key_states: batch_size, sequence_length, num_key_value_head, head_dim
229
+ # value_states: batch_size, sequence_length, num_key_value_head, head_dim
230
+ sequence_length = key_states.shape[1]
231
+
232
+ key_states = key_states[:, :, :, None, :].expand(
233
+ batch_size,
234
+ sequence_length,
235
+ num_key_value_heads,
236
+ num_key_value_groups,
237
+ head_dim,
238
+ )
239
+ key_states = key_states.reshape(
240
+ batch_size,
241
+ sequence_length,
242
+ num_key_value_heads * num_key_value_groups,
243
+ head_dim,
244
+ )
245
+
246
+ value_states = value_states[:, :, :, None, :].expand(
247
+ batch_size,
248
+ sequence_length,
249
+ num_key_value_heads,
250
+ num_key_value_groups,
251
+ head_dim,
252
+ )
253
+ value_states = value_states.reshape(
254
+ batch_size,
255
+ sequence_length,
256
+ num_key_value_heads * num_key_value_groups,
257
+ head_dim,
258
+ )
259
+
260
+ # Attention here is upcasted to float32 to match the original eager implementation.
261
+ query_states = query_states.to(dtype=torch.float32)
262
+ key_states = key_states.to(dtype=torch.float32)
263
+
264
+ query_states = query_states.transpose(1, 2)
265
+ key_states = key_states.transpose(1, 2)
266
+
267
+ att_weights = torch.matmul(query_states, key_states.transpose(2, 3))
268
+ att_weights *= head_dim**-0.5
269
+ big_neg = -2.3819763e38 # See gemma/modules.py
270
+
271
+ masked_att_weights = torch.where(
272
+ attention_mask[:, None, :, :], att_weights, big_neg
273
+ )
274
+
275
+ probs = nn.functional.softmax(masked_att_weights, dim=-1)
276
+ probs = probs.to(dtype=value_states.dtype)
277
+
278
+ # probs: batch_size, num_att_heads, sequence_length, sequence_length
279
+ # value_states: batch_size, sequence_length, num_att_heads, head_dim
280
+
281
+ att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3))
282
+
283
+ att_output = att_output.permute(0, 2, 1, 3)
284
+ # we use -1 because sequence length can change
285
+ att_output = att_output.reshape(
286
+ batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim
287
+ )
288
+
289
+ return att_output
290
+
291
+
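A rough forward-pass sketch for VQHBackbone under the default config, with a full (unmasked) boolean attention mask of shape [B, L, L], which is what eager_attention_forward expects. All sizes below are placeholders; note the default config carries the 257k-token Gemma vocabulary, so instantiating it is not lightweight.

import torch

config = VQHBackboneConfig()               # 4-layer Gemma expert, hidden_size 2048
backbone = VQHBackbone(config)

B, L = 1, 16
inputs_embeds = torch.randn(B, L, config.gemma_expert_config.hidden_size)
position_ids = torch.arange(L).expand(B, L)
attention_mask = torch.ones(B, L, L, dtype=torch.bool)

with torch.no_grad():
    hidden = backbone(
        attention_mask=attention_mask,
        position_ids=position_ids,
        inputs_embeds=inputs_embeds,
    )
# hidden: (B, L, hidden_size), after the per-layer attention/MLP blocks and the final RMSNorm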
292
+ class LagrangeMultiplier(nn.Module):
293
+ def __init__(
294
+ self,
295
+ init_value: float = 1.0,
296
+ constraint_shape: Tuple[int, ...] = (),
297
+ constraint_type: str = "eq", # One of ("eq", "leq", "geq")
298
+ parameterization: Optional[
299
+ str
300
+ ] = None, # One of ("softplus", "exp"), or None for equality constraints
301
+ ):
302
+ super().__init__()
303
+ self.constraint_type = constraint_type
304
+ self.parameterization = parameterization
305
+
306
+ if constraint_type != "eq":
307
+ assert (
308
+ init_value > 0
309
+ ), "Inequality constraints must have non-negative initial multiplier values"
310
+
311
+ if parameterization == "softplus":
312
+ init_value = torch.log(torch.exp(torch.tensor(init_value)) - 1).item()
313
+ elif parameterization == "exp":
314
+ init_value = torch.log(torch.tensor(init_value)).item()
315
+ else:
316
+ raise ValueError(
317
+ f"Invalid multiplier parameterization {parameterization}"
318
+ )
319
+ else:
320
+ assert (
321
+ parameterization is None
322
+ ), "Equality constraints must have no parameterization"
323
+
324
+ self.multiplier = nn.Parameter(torch.full(constraint_shape, init_value))
325
+
326
+ def forward(
327
+ self, lhs: Optional[torch.Tensor] = None, rhs: Optional[torch.Tensor] = None
328
+ ) -> torch.Tensor:
329
+ multiplier = self.multiplier
330
+
331
+ if self.constraint_type != "eq":
332
+ if self.parameterization == "softplus":
333
+ multiplier = torch.nn.functional.softplus(multiplier)
334
+ elif self.parameterization == "exp":
335
+ multiplier = torch.exp(multiplier)
336
+ else:
337
+ raise ValueError(
338
+ f"Invalid multiplier parameterization {self.parameterization}"
339
+ )
340
+
341
+ if lhs is None:
342
+ return multiplier
343
+
344
+ if rhs is None:
345
+ rhs = torch.zeros_like(lhs)
346
+
347
+ diff = lhs - rhs
348
+
349
+ assert (
350
+ diff.shape == multiplier.shape
351
+ ), f"Shape mismatch: {diff.shape} vs {multiplier.shape}"
352
+
353
+ if self.constraint_type == "eq":
354
+ return multiplier * diff
355
+ elif self.constraint_type == "geq":
356
+ return multiplier * diff
357
+ elif self.constraint_type == "leq":
358
+ return -multiplier * diff
359
+
360
+
361
+ GeqLagrangeMultiplier = partial(
362
+ LagrangeMultiplier, constraint_type="geq", parameterization="softplus"
363
+ )
364
+
365
+ LeqLagrangeMultiplier = partial(
366
+ LagrangeMultiplier, constraint_type="leq", parameterization="softplus"
367
+ )
368
+
369
+
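A small sketch of how the softplus-parameterized GeqLagrangeMultiplier acts as the SAC-style temperature, mirroring temperature_loss_fn further down (the entropy and target values here are placeholders):

import torch

temperature = GeqLagrangeMultiplier(init_value=1.0, constraint_shape=())

entropy = torch.tensor(1.3)            # placeholder policy entropy
target_entropy = torch.tensor(-70.0)   # e.g. -action_dim

alpha = temperature()                                            # current multiplier (1.0 at init)
temperature_loss = temperature(lhs=entropy, rhs=target_entropy)  # multiplier * (lhs - rhs)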
370
+ class MLP(nn.Module):
371
+ def __init__(
372
+ self,
373
+ input_dim: int,
374
+ hidden_dims: Sequence[int],
375
+ activations: Union[Callable[[torch.Tensor], torch.Tensor], str] = "silu",
376
+ activate_final: bool = False,
377
+ use_layer_norm: bool = False,
378
+ use_group_norm: bool = False,
379
+ dropout_rate: Optional[float] = None,
380
+ ):
381
+ super().__init__()
382
+
383
+ assert not (use_layer_norm and use_group_norm)
384
+
385
+ self.activate_final = activate_final
386
+ self.dropout_rate = dropout_rate
387
+ self.input_dim = input_dim
388
+ self.hidden_dims = hidden_dims
389
+
390
+ if isinstance(activations, str):
391
+ if activations == "silu" or activations == "swish":
392
+ self.activations = nn.SiLU()
393
+ else:
394
+ self.activations = getattr(nn, activations)()
395
+ else:
396
+ self.activations = activations
397
+
398
+ layers = []
399
+
400
+ for i, hidden_dim in enumerate(hidden_dims):
401
+ layers.append(nn.Linear(input_dim, hidden_dim))
402
+ nn.init.xavier_uniform_(layers[-1].weight)
403
+ nn.init.zeros_(layers[-1].bias)
404
+
405
+ input_dim = hidden_dim
406
+
407
+ if i + 1 < len(hidden_dims) or activate_final:
408
+ if dropout_rate is not None and dropout_rate > 0:
409
+ layers.append(nn.Dropout(p=dropout_rate))
410
+
411
+ if use_layer_norm:
412
+ layers.append(nn.LayerNorm(hidden_dim))
413
+ elif use_group_norm:
414
+ num_groups = min(hidden_dim, 32)
415
+ layers.append(nn.GroupNorm(num_groups, hidden_dim))
416
+ layers.append(self.activations)
417
+
418
+ self.layers = nn.ModuleList(layers)
419
+
420
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
421
+ for layer in self.layers:
422
+ x = layer(x)
423
+
424
+ return x
425
+
426
+
427
+ class TanhMultivariateNormalDiag(TransformedDistribution):
428
+ def __init__(
429
+ self,
430
+ loc: torch.Tensor,
431
+ scale_diag: torch.Tensor,
432
+ low: Optional[torch.Tensor] = None,
433
+ high: Optional[torch.Tensor] = None,
434
+ ):
435
+ self.loc = loc
436
+ self.scale_diag = scale_diag
437
+ base_distribution = Independent(Normal(loc, scale_diag), 1)
438
+
439
+ transforms = []
440
+ transforms.append(TanhTransform())
441
+ if not (low is None or high is None):
442
+ transforms.append(
443
+ AffineTransform(loc=(high + low) / 2, scale=(high - low) / 2)
444
+ )
445
+ transform = ComposeTransform(transforms)
446
+
447
+ super().__init__(base_distribution, transform)
448
+
449
+ def mode(self) -> torch.Tensor:
450
+ """返回分布的众数"""
451
+ # For a Gaussian, the mode equals the mean
452
+ mode = self.loc
453
+ # Apply the transforms to map it into the squashed action space
454
+ for transform in self.transforms:
455
+ mode = transform(mode)
456
+ return mode
457
+
458
+ def stddev(self) -> torch.Tensor:
459
+ """返回变换后的标准差(近似值)"""
460
+ # Note: this is only an approximation; the exact std after a nonlinear transform is hard to compute
461
+ return ComposeTransform(self.transforms)(self.loc + self.scale_diag) - ComposeTransform(self.transforms)(self.loc)
462
+
463
+ def log_prob(self, value: torch.Tensor) -> torch.Tensor:
464
+ eps = 1e-6
465
+ value = torch.clamp(value, -1 + eps, 1 - eps)
466
+ return super().log_prob(value)
467
+
468
+
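Sampling sketch for the tanh-squashed diagonal Gaussian (no low/high bounds passed, so samples land in (-1, 1); sizes are placeholders):

import torch

loc = torch.zeros(2, 7)
scale_diag = 0.5 * torch.ones(2, 7)
dist = TanhMultivariateNormalDiag(loc=loc, scale_diag=scale_diag)

actions = dist.rsample()         # (2, 7), squashed into (-1, 1)
log_pi = dist.log_prob(actions)  # (2,), inputs clamped away from the tanh boundary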
469
+ class Policy(nn.Module):
470
+ def __init__(
471
+ self,
472
+ obs_encoded_dim: int,
473
+ network: nn.Module,
474
+ action_dim: int,
475
+ std_parameterization: str = "exp", # "exp", "softplus", "fixed", or "uniform"
476
+ std_min: Optional[float] = 1e-5,
477
+ std_max: Optional[float] = 10.0,
478
+ tanh_squash_distribution: bool = False,
479
+ fixed_std: Optional[torch.Tensor] = None,
480
+ ):
481
+ super().__init__()
482
+
483
+ self.obs_encoded_dim = obs_encoded_dim
484
+ self.network = network
485
+ self.action_dim = action_dim
486
+ self.std_parameterization = std_parameterization
487
+ self.std_min = std_min
488
+ self.std_max = std_max
489
+ self.tanh_squash_distribution = tanh_squash_distribution
490
+ self.fixed_std = fixed_std
491
+
492
+ self.mean_layer = nn.Linear(network.hidden_dims[-1], action_dim)
493
+
494
+ if fixed_std is None:
495
+ if std_parameterization in ["exp", "softplus"]:
496
+ self.std_layer = nn.Linear(network.hidden_dims[-1], action_dim)
497
+ elif std_parameterization == "uniform":
498
+ self.log_stds = nn.Parameter(torch.zeros(action_dim))
499
+ else:
500
+ raise ValueError(
501
+ f"Invalid std_parameterization: {self.std_parameterization}"
502
+ )
503
+ else:
504
+ assert std_parameterization == "fixed"
505
+
506
+ nn.init.xavier_uniform_(self.mean_layer.weight)
507
+ nn.init.zeros_(self.mean_layer.bias)
508
+
509
+ if fixed_std is None and std_parameterization in ["exp", "softplus"]:
510
+ nn.init.xavier_uniform_(self.std_layer.weight)
511
+ nn.init.zeros_(self.std_layer.bias)
512
+
513
+ def forward(
514
+ self, encoded_observations: torch.Tensor, temperature: float = 1.0
515
+ ) -> Union[TransformedDistribution, Normal]:
516
+ outputs = self.network(encoded_observations)
517
+
518
+ means = self.mean_layer(outputs)
519
+
520
+ if self.fixed_std is None:
521
+ if self.std_parameterization == "exp":
522
+ log_stds = self.std_layer(outputs)
523
+ stds = torch.exp(log_stds)
524
+ elif self.std_parameterization == "softplus":
525
+ stds = self.std_layer(outputs)
526
+ stds = nn.functional.softplus(stds)
527
+ elif self.std_parameterization == "uniform":
528
+ stds = torch.exp(self.log_stds).expand_as(means)
529
+ else:
530
+ raise ValueError(
531
+ f"Invalid std_parameterization: {self.std_parameterization}"
532
+ )
533
+ else:
534
+ stds = self.fixed_std.to(means.device).expand_as(means)
535
+
536
+ stds = torch.clamp(stds, self.std_min, self.std_max) * torch.sqrt(
537
+ torch.tensor(temperature)
538
+ )
539
+
540
+ if self.tanh_squash_distribution:
541
+ distribution = TanhMultivariateNormalDiag(
542
+ loc=means,
543
+ scale_diag=stds,
544
+ )
545
+ else:
546
+ distribution = Normal(loc=means, scale=stds)
547
+
548
+ return distribution
549
+
550
+
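Actor-head sketch, built the same way CalQL builds it below; the 2048/70 dimensions simply echo the defaults hard-coded in CalQlConfig and are otherwise placeholders:

import torch

obs_dim, act_dim = 2048, 70
policy = Policy(
    obs_encoded_dim=obs_dim,
    network=MLP(input_dim=obs_dim, hidden_dims=[256, 256], activate_final=True),
    action_dim=act_dim,
    tanh_squash_distribution=True,
    std_parameterization="exp",
)

obs = torch.randn(4, obs_dim)
dist = policy(obs)               # TanhMultivariateNormalDiag
actions = dist.rsample()         # (4, 70)
log_pi = dist.log_prob(actions)  # (4,)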
551
+ class Critics(nn.Module):
552
+ def __init__(
553
+ self,
554
+ obs_encoded_dim: int,
555
+ networks: list[nn.Module],
556
+ num_backbones: int = 2,
557
+ init_final: Optional[float] = None,
558
+ ):
559
+ super().__init__()
560
+ assert len(networks) == num_backbones
561
+ self.obs_encoded_dim = obs_encoded_dim
562
+ self.networks = nn.ModuleList(networks)
563
+ self.num_backbones = num_backbones
564
+ self.init_final = init_final
565
+
566
+ self.backbone_output_dims = networks[0].hidden_dims[-1]
567
+
568
+ if init_final is not None:
569
+ self.output_layer = nn.Linear(self.backbone_output_dims, 1)
570
+ nn.init.uniform_(self.output_layer.weight, -init_final, init_final)
571
+ nn.init.uniform_(self.output_layer.bias, -init_final, init_final)
572
+ else:
573
+ self.output_layer = nn.Linear(self.backbone_output_dims, 1)
574
+ nn.init.xavier_uniform_(self.output_layer.weight)
575
+ nn.init.zeros_(self.output_layer.bias)
576
+
577
+ @jaxtyped(typechecker=typechecker)
578
+ def forward(
579
+ self,
580
+ encoded_observations: Float[torch.Tensor, "batch {self.obs_encoded_dim}"],
581
+ actions: Float[torch.Tensor, "batch *num_actions action_dim"],
582
+ ) -> Float[torch.Tensor, "{self.num_backbones} batch *num_actions"]:
583
+ if actions.ndim == 3:
584
+ # forward the q function with multiple actions on each state
585
+ encoded_observations = encoded_observations.unsqueeze(1).expand(
586
+ -1, actions.shape[1], -1
587
+ )
588
+ # HACK: check dimensions here
589
+ inputs = torch.cat([encoded_observations, actions], dim=-1)
590
+
591
+ backbone_outputs = []
592
+ for network in self.networks:
593
+ backbone_outputs.append(network(inputs))
594
+ backbone_outputs: Float[
595
+ torch.Tensor,
596
+ "{self.num_backbones} batch *num_actions {self.backbone_output_dims}",
597
+ ] = torch.stack(backbone_outputs, dim=0)
598
+
599
+ value = self.output_layer(backbone_outputs)
600
+ # HACK: check output shape here
601
+ # if actions.ndim == 3:
602
+ # value = value.squeeze(-1).permute(0, 2, 1)
603
+ # else:
604
+ value = value.squeeze(-1)
605
+ return value # (num_backbones, batch, *num_actions)
606
+
607
+
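Ensemble-critic sketch: the forward pass accepts either one action per state or a set of candidate actions per state, and always returns one value per ensemble member (all sizes below are placeholders):

import torch

obs_dim, act_dim, ensemble = 2048, 70, 2
critics = Critics(
    obs_encoded_dim=obs_dim,
    networks=[
        MLP(input_dim=obs_dim + act_dim, hidden_dims=[256, 256], activate_final=True)
        for _ in range(ensemble)
    ],
    num_backbones=ensemble,
)

obs = torch.randn(4, obs_dim)
q_single = critics(obs, torch.randn(4, act_dim))     # (ensemble, batch)              -> (2, 4)
q_multi = critics(obs, torch.randn(4, 10, act_dim))  # (ensemble, batch, num_actions) -> (2, 4, 10)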
608
+ class CalQlConfig(PretrainedConfig):
609
+ model_type = "calql"
610
+
611
+ def __init__(
612
+ self,
613
+ obs_encoded_dim=2048,
614
+ action_dim=70,
615
+ actor_lr=1e-4,
616
+ critic_lr=3e-4,
617
+ temp_lr=3e-4,
618
+ actor_wps=2000,
619
+ critic_wps=2000,
620
+ **kwargs,
621
+ ):
622
+ self.cql_clip_diff_min = -np.inf
623
+ self.cql_clip_diff_max = np.inf
624
+ self.cql_alpha = 5.0
625
+ self.cql_autotune_alpha = False
626
+ self.action_dim = action_dim
627
+ self.target_entropy = -self.action_dim
628
+ self.obs_encoded_dim = obs_encoded_dim
629
+ self.cql_temperature_init_value = 1.0
630
+ self.critic_ensemble_size = 2
631
+ self.cql_n_actions = 4
632
+ self.cql_max_target_backup = True
633
+ self.policy_network_kwargs = dict(
634
+ input_dim=self.obs_encoded_dim,
635
+ hidden_dims=[256, 256],
636
+ activate_final=True,
637
+ use_layer_norm=False,
638
+ )
639
+ self.critic_network_kwargs = dict(
640
+ input_dim=self.obs_encoded_dim + self.action_dim,
641
+ hidden_dims=[256, 256],
642
+ activate_final=True,
643
+ use_layer_norm=False,
644
+ )
645
+ self.policy_kwargs = dict(
646
+ tanh_squash_distribution=True,
647
+ std_parameterization="exp",
648
+ )
649
+ self.critic_subsample_size = None
650
+ self.cql_max_target_backup = True
651
+ self.backup_entropy = False
652
+ self.discount = 0.98
653
+ self.goal_conditioned = True
654
+ self.gc_kwargs = dict(
655
+ negative_proportion=0.0,
656
+ )
657
+ self.use_td_loss = True
658
+ self.cql_action_sample_method = "uniform"
659
+ self.cql_importance_sample = True
660
+ self.cql_temp = 1.0
661
+ self.use_calql = True
662
+
663
+ self.actor_optimizer_kwargs = dict(
664
+ learning_rate=actor_lr,
665
+ warmup_steps=actor_wps,
666
+ )
667
+ self.critic_optimizer_kwargs = dict(
668
+ learning_rate=critic_lr,
669
+ warmup_steps=critic_wps,
670
+ )
671
+ self.temperature_optimizer_kwargs = dict(learning_rate=temp_lr)
672
+
673
+ super().__init__(**kwargs)
674
+
675
+
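CalQlConfig keeps most Cal-QL hyper-parameters hard-coded and only exposes the dimensions and the learning-rate/warmup settings through its constructor; a construction sketch (values are the defaults shown above):

config = CalQlConfig(
    obs_encoded_dim=2048,   # matches the Gemma expert hidden size
    action_dim=70,          # padded action dimension
    actor_lr=1e-4,
    critic_lr=3e-4,
    temp_lr=3e-4,
)
# Everything else (cql_alpha=5.0, cql_n_actions=4, discount=0.98, use_calql=True, ...)
# is fixed in __init__ and would have to be edited there to change.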
676
+ class CalQL(PreTrainedModel):
677
+ config_class = CalQlConfig
678
+
679
+ def __init__(self, config: CalQlConfig):
680
+ super(CalQL, self).__init__(config=config)
681
+ self.config = config
682
+
683
+ self.temperature = GeqLagrangeMultiplier(
684
+ init_value=self.config.cql_temperature_init_value,
685
+ constraint_shape=(),
686
+ )
687
+
688
+ self.policy = Policy(
689
+ obs_encoded_dim=self.config.obs_encoded_dim,
690
+ network=MLP(**self.config.policy_network_kwargs),
691
+ action_dim=self.config.action_dim,
692
+ **self.config.policy_kwargs,
693
+ )
694
+
695
+ self.critics = Critics(
696
+ obs_encoded_dim=self.config.obs_encoded_dim,
697
+ networks=[
698
+ MLP(**self.config.critic_network_kwargs)
699
+ for _ in range(self.config.critic_ensemble_size)
700
+ ],
701
+ num_backbones=self.config.critic_ensemble_size,
702
+ )
703
+
704
+ self.target_critics = deepcopy(self.critics)
705
+
706
+ def forward_policy_and_sample(
707
+ self,
708
+ encoded_obs: Float[torch.Tensor, "batch {self.config.obs_encoded_dim}"],
709
+ repeat: int = None,
710
+ ):
711
+ action_dist = self.policy.forward(encoded_obs)
712
+ if repeat:
713
+ new_actions = action_dist.rsample(
714
+ torch.tensor([repeat])
715
+ ) # repeat, tensor, act_dim
716
+ log_pi = action_dist.log_prob(new_actions)
717
+ new_actions = new_actions.permute(1, 0, 2) # (batch, repeat, action_dim)
718
+ log_pi = log_pi.permute(1, 0) # (batch, repeat)
719
+
720
+ else:
721
+ new_actions = action_dist.rsample() # (batch, action_dim)
722
+ log_pi = action_dist.log_prob(new_actions) # (batch)
723
+ # NOTE: detach gradient here
724
+ new_actions = new_actions.detach()
725
+ log_pi = log_pi.detach()
726
+ return new_actions, log_pi
727
+
728
+ def _compute_next_actions(self, batch: at.CalQlBatch):
729
+ """
730
+ Compute the next actions, repeated cql_n_actions times.
731
+ This should only be used when computing the critic loss with
732
+ cql_max_target_backup enabled.
733
+ """
734
+ sample_n_actions = (
735
+ self.config.cql_n_actions if self.config.cql_max_target_backup else None
736
+ )
737
+
738
+ next_actions, next_actions_log_probs = self.forward_policy_and_sample(
739
+ batch["encoded_next_observations"],
740
+ repeat=sample_n_actions,
741
+ )
742
+ return next_actions, next_actions_log_probs
743
+
744
+ def temperature_loss_fn(self, batch: at.CalQlBatch):
745
+ next_actions, next_actions_log_probs = self._compute_next_actions(batch)
746
+
747
+ entropy = -next_actions_log_probs.mean()
748
+ temperature_loss = self.temperature.forward(
749
+ lhs=entropy,
750
+ rhs=self.config.target_entropy,
751
+ )
752
+ return temperature_loss, {"temperature_loss": temperature_loss}
753
+
754
+ def policy_loss_fn(self, batch: at.CalQlBatch):
755
+ batch_size = batch["rewards"].shape[0]
756
+ temperature = self.temperature.forward().detach() # detach gradient
757
+
758
+ action_distributions = self.policy.forward(batch["encoded_observations"])
759
+ actions = action_distributions.rsample()
760
+ log_probs = action_distributions.log_prob(actions)
761
+
762
+ predicted_qs = self.critics.forward(
763
+ batch["encoded_observations"],
764
+ actions,
765
+ ).detach() # NOTE: detach grads
766
+ predicted_q = predicted_qs.min(dim=0)[0]
767
+
768
+ assert predicted_q.shape == (batch_size,)
769
+ assert log_probs.shape == (batch_size,)
770
+
771
+ nll_objective = -torch.mean(
772
+ action_distributions.log_prob(torch.clip(batch["actions"], -0.99, 0.99))
773
+ )
774
+ actor_objective = predicted_q
775
+ actor_loss = -torch.mean(actor_objective) + torch.mean(temperature * log_probs)
776
+
777
+ info = {
778
+ "actor_loss": actor_loss,
779
+ "actor_nll": nll_objective,
780
+ "temperature": temperature,
781
+ "entropy": -log_probs.mean(),
782
+ "log_probs": log_probs,
783
+ "actions_mse": ((actions - batch["actions"]) ** 2).sum(dim=-1).mean(),
784
+ "dataset_rewards": batch["rewards"],
785
+ "mc_returns": batch.get("mc_returns", None),
786
+ }
787
+
788
+ return actor_loss, info
789
+
790
+ def sac_critic_loss_fn(self, batch: at.CalQlBatch):
791
+ """classes that inherit this class can change this function"""
792
+ batch_size = batch["rewards"].shape[0]
793
+ next_actions, next_actions_log_probs = self._compute_next_actions(batch)
794
+ # (batch_size, ) for sac, (batch_size, cql_n_actions) for cql
795
+
796
+ # Evaluate next Qs for all ensemble members (cheap because we're only doing the forward pass)
797
+ with torch.no_grad():
798
+ self.target_critics.eval()
799
+ target_next_qs = self.target_critics.forward(
800
+ batch["encoded_next_observations"],
801
+ next_actions,
802
+ ) # (critic_ensemble_size, batch_size, cql_n_actions)
803
+ self.target_critics.train()
804
+
805
+ # Subsample if requested
806
+ if self.config.critic_subsample_size is not None:
807
+ subsample_idcs = torch.randint(
808
+ 0,
809
+ self.config.critic_ensemble_size,
810
+ (self.config.critic_ensemble_size,),
811
+ device=target_next_qs.device,
812
+ )
813
+ target_next_qs = target_next_qs[subsample_idcs]
814
+
815
+ # Minimum Q across (subsampled) ensemble members
816
+ target_next_min_q = target_next_qs.min(dim=0)[0]
817
+ assert target_next_min_q.shape == next_actions_log_probs.shape
818
+ # (batch_size,) for sac, (batch_size, cql_n_actions) for cql
819
+
820
+ target_next_min_q = self._process_target_next_qs(
821
+ target_next_min_q,
822
+ next_actions_log_probs,
823
+ )
824
+
825
+ target_q = (
826
+ batch["rewards"] + self.config.discount * batch["masks"] * target_next_min_q
827
+ )
828
+ assert target_q.shape == (batch_size,)
829
+
830
+ predicted_qs = self.critics.forward(
831
+ batch["encoded_observations"], batch["actions"]
832
+ )
833
+ assert predicted_qs.shape == (self.config.critic_ensemble_size, batch_size)
834
+
835
+ target_qs = target_q.unsqueeze(0).expand(self.config.critic_ensemble_size, -1)
836
+ assert predicted_qs.shape == target_qs.shape
837
+ critic_loss = torch.mean((predicted_qs - target_qs) ** 2)
838
+
839
+ info = {
840
+ "td_err": critic_loss,
841
+ "online_q": torch.mean(predicted_qs),
842
+ "target_q": torch.mean(target_qs),
843
+ }
844
+
845
+ if self.config.goal_conditioned:
846
+ num_negatives = int(
847
+ self.config.gc_kwargs["negative_proportion"] * batch_size
848
+ )
849
+ info["negative_qs"] = torch.mean(predicted_qs, dim=-1)[
850
+ :num_negatives
851
+ ].mean()
852
+ info["positive_qs"] = torch.mean(predicted_qs, dim=-1)[
853
+ num_negatives:
854
+ ].mean()
855
+
856
+ return critic_loss, info
857
+
858
+ def _process_target_next_qs(self, target_next_qs, next_actions_log_probs):
859
+ """add cql_max_target_backup option"""
860
+
861
+ if self.config.cql_max_target_backup:
862
+ max_target_indices = torch.argmax(target_next_qs, dim=-1, keepdim=True)
863
+ target_next_qs = torch.gather(
864
+ target_next_qs, -1, max_target_indices
865
+ ).squeeze(-1)
866
+ next_actions_log_probs = torch.gather(
867
+ next_actions_log_probs, -1, max_target_indices
868
+ ).squeeze(-1)
869
+
870
+ target_next_qs = self.sac_process_target_next_qs(
871
+ target_next_qs,
872
+ next_actions_log_probs,
873
+ )
874
+
875
+ return target_next_qs
876
+
877
+ def sac_process_target_next_qs(self, target_next_qs, next_actions_log_probs):
878
+ """classes that inherit this class can add to this function
879
+ e.g. CQL will add the cql_max_target_backup option
880
+ """
881
+ if self.config.backup_entropy:
882
+ temperature = self.temperature.forward()
883
+ target_next_qs = target_next_qs - temperature * next_actions_log_probs
884
+
885
+ return target_next_qs
886
+
887
+ def critic_loss_fn(self, batch: at.CalQlBatch):
888
+ """add CQL loss on top of SAC loss"""
889
+ if self.config.use_td_loss:
890
+ td_loss, td_loss_info = self.sac_critic_loss_fn(batch)
891
+ else:
892
+ td_loss, td_loss_info = 0.0, {}
893
+
894
+ cql_q_diff, cql_intermediate_results = self._get_cql_q_diff(batch)
895
+
896
+ """auto tune cql alpha"""
897
+ if self.config.cql_autotune_alpha:
898
+ raise NotImplementedError
899
+ # alpha = self.forward_cql_alpha_lagrange()
900
+ # cql_loss = (cql_q_diff - self.config["cql_target_action_gap"]).mean()
901
+ else:
902
+ alpha = self.config.cql_alpha
903
+ cql_loss = torch.clip(
904
+ cql_q_diff, self.config.cql_clip_diff_min, self.config.cql_clip_diff_max
905
+ ).mean()
906
+
907
+ critic_loss = td_loss + alpha * cql_loss
908
+
909
+ info = {
910
+ **td_loss_info,
911
+ "critic_loss": critic_loss,
912
+ "td_err": td_loss,
913
+ "cql_loss": cql_loss,
914
+ "cql_alpha": alpha,
915
+ "cql_diff": cql_q_diff.mean(),
916
+ **cql_intermediate_results,
917
+ }
918
+
919
+ return critic_loss, info
920
+
921
+ def _get_cql_q_diff(self, batch: at.CalQlBatch):
922
+ """
923
+ most of the CQL loss logic is here
924
+ It is needed for both critic_loss_fn and cql_alpha_loss_fn
925
+ """
926
+ batch_size = batch["rewards"].shape[0]
927
+
928
+ q_pred = self.critics.forward(batch["encoded_observations"], batch["actions"])
929
+ # HACK: shape changed from jax implementation
930
+ assert q_pred.shape == (self.config.critic_ensemble_size, batch_size)
931
+
932
+ """sample random actions"""
933
+ action_dim = batch["actions"].shape[-1]
934
+ if self.config.cql_action_sample_method == "uniform":
935
+ cql_random_actions = (
936
+ torch.rand(
937
+ (batch_size, self.config.cql_n_actions, action_dim),
938
+ device=batch["actions"].device,
939
+ )
940
+ * 2.0
941
+ - 1.0
942
+ )
943
+ elif self.config.cql_action_sample_method == "normal":
944
+ cql_random_actions = torch.randn(
945
+ (batch_size, self.config.cql_n_actions, action_dim),
946
+ device=batch["actions"].device,
947
+ )
948
+ else:
949
+ raise NotImplementedError
950
+
951
+ cql_current_actions, cql_current_log_pis = self.forward_policy_and_sample(
952
+ batch["encoded_observations"],
953
+ repeat=self.config.cql_n_actions,
954
+ )
955
+ assert cql_current_log_pis.shape == (batch_size, self.config.cql_n_actions)
956
+
957
+ cql_next_actions, cql_next_log_pis = self.forward_policy_and_sample(
958
+ batch["encoded_next_observations"],
959
+ repeat=self.config.cql_n_actions,
960
+ )
961
+
962
+ all_sampled_actions = torch.cat(
963
+ [
964
+ cql_random_actions,
965
+ cql_current_actions,
966
+ cql_next_actions,
967
+ ],
968
+ dim=1,
969
+ )
970
+
971
+ """q values of randomly sampled actions"""
972
+ cql_q_samples = self.critics.forward(
973
+ batch["encoded_observations"], all_sampled_actions
974
+ )
975
+ # HACK: shape changed from jax implementation
976
+ assert cql_q_samples.shape == (
977
+ self.config.critic_ensemble_size,
978
+ batch_size,
979
+ self.config.cql_n_actions * 3,
980
+ )
981
+
982
+ if self.config.critic_subsample_size is not None:
983
+ subsample_idcs = torch.randint(
984
+ 0,
985
+ self.config.critic_ensemble_size,
986
+ (self.config.critic_ensemble_size,),
987
+ device=cql_q_samples.device,
988
+ )
989
+ cql_q_samples = cql_q_samples[subsample_idcs]
990
+
991
+ """Cal-QL"""
992
+ if self.config.use_calql:
993
+ # HACK: check shape of mc_returns
994
+ mc_lower_bound = (
995
+ batch["mc_returns"]
996
+ .reshape(-1, 1)
997
+ .repeat(1, self.config.cql_n_actions * 2)
998
+ )
999
+ assert mc_lower_bound.shape == (
1000
+ batch_size,
1001
+ self.config.cql_n_actions * 2,
1002
+ )
1003
+
1004
+ cql_q_pi = cql_q_samples[:, :, self.config.cql_n_actions :]
1005
+ num_vals = cql_q_pi.numel()
1006
+ calql_bound_rate = torch.sum((cql_q_pi < mc_lower_bound).float()) / num_vals
1007
+ cql_q_pi = torch.maximum(cql_q_pi, mc_lower_bound)
1008
+ cql_q_samples = torch.cat(
1009
+ [
1010
+ cql_q_samples[:, :, : self.config.cql_n_actions],
1011
+ cql_q_pi,
1012
+ ],
1013
+ dim=-1,
1014
+ )
1015
+
1016
+ if self.config.cql_importance_sample:
1017
+ random_density = torch.log(
1018
+ torch.tensor(0.5**action_dim, device=cql_q_samples.device)
1019
+ )
1020
+
1021
+ importance_prob = torch.cat(
1022
+ [
1023
+ random_density.expand(batch_size, self.config.cql_n_actions),
1024
+ cql_current_log_pis,
1025
+ cql_next_log_pis,
1026
+ ],
1027
+ dim=1,
1028
+ )
1029
+ # HACK: check dim
1030
+ cql_q_samples = cql_q_samples - importance_prob.unsqueeze(0)
1031
+ else:
1032
+ cql_q_samples = torch.cat([cql_q_samples, q_pred.unsqueeze(-1)], dim=-1)
1033
+
1034
+ cql_q_samples -= (
1035
+ torch.log(
1036
+ torch.tensor(
1037
+ cql_q_samples.shape[-1],
1038
+ dtype=torch.float,
1039
+ device=cql_q_samples.device,
1040
+ )
1041
+ )
1042
+ * self.config.cql_temp
1043
+ )
1044
+ # HACK: shape diff from jax implementation
1045
+ assert cql_q_samples.shape == (
1046
+ self.config.critic_ensemble_size,
1047
+ batch_size,
1048
+ 3 * self.config.cql_n_actions + 1,
1049
+ )
1050
+
1051
+ """log sum exp of the ood actions"""
1052
+ cql_ood_values = (
1053
+ torch.logsumexp(cql_q_samples / self.config.cql_temp, dim=-1)
1054
+ * self.config.cql_temp
1055
+ )
1056
+ assert cql_ood_values.shape == (self.config.critic_ensemble_size, batch_size)
1057
+
1058
+ cql_q_diff = cql_ood_values - q_pred
1059
+ info = {
1060
+ "cql_ood_values": cql_ood_values.mean(),
1061
+ }
1062
+ if self.config.use_calql:
1063
+ info["calql_bound_rate"] = calql_bound_rate
1064
+
1065
+ return cql_q_diff, info
1066
+
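A tiny numeric illustration (not from the file) of the Cal-QL step above: Q-values of policy-sampled actions are clipped from below by the Monte-Carlo return before the logsumexp, and calql_bound_rate records how often that clipping fires.

import torch

cql_q_pi = torch.tensor([[-5.0, 0.2, 3.0]])              # critic values for sampled actions
mc_lower_bound = torch.full_like(cql_q_pi, 1.0)          # observed discounted return
bound_rate = (cql_q_pi < mc_lower_bound).float().mean()  # 2/3 of the values get lifted
cql_q_pi = torch.maximum(cql_q_pi, mc_lower_bound)       # -> [[1.0, 1.0, 3.0]]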
1067
+ @staticmethod
1068
+ def make_optimizer(
1069
+ params: torch.nn.Module,
1070
+ learning_rate: float = 3e-4,
1071
+ warmup_steps: int = 0,
1072
+ cosine_decay_steps: Optional[int] = None,
1073
+ weight_decay: Optional[float] = None,
1074
+ return_lr_schedule: bool = True,
1075
+ ) -> Union[Optimizer, Tuple[Optimizer, LambdaLR]]:
1076
+ optimizer: Optimizer
1077
+ if weight_decay is not None:
1078
+ optimizer = AdamW(
1079
+ params=params,
1080
+ lr=learning_rate,
1081
+ weight_decay=weight_decay,
1082
+ )
1083
+ else:
1084
+ optimizer = Adam(params=params, lr=learning_rate)
1085
+
1086
+ def _lr_lambda(step: int) -> float:
1087
+ if warmup_steps > 0 and step < warmup_steps:
1088
+ return step / warmup_steps
1089
+
1090
+ if cosine_decay_steps is not None:
1091
+ decay_step = step - warmup_steps
1092
+ if decay_step < 0:
1093
+ return 0.0
1094
+ if decay_step >= cosine_decay_steps:
1095
+ return 0.0
1096
+ progress = decay_step / cosine_decay_steps
1097
+ return 0.5 * (1.0 + math.cos(math.pi * progress))
1098
+
1099
+ return 1.0
1100
+
1101
+ scheduler = LambdaLR(optimizer, lr_lambda=_lr_lambda)
1102
+
1103
+ if return_lr_schedule:
1104
+ return optimizer, scheduler
1105
+ else:
1106
+ return optimizer
1107
+
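make_optimizer pairs a plain Adam/AdamW with a LambdaLR whose multiplier ramps linearly from 0 to 1 over warmup_steps and then, with cosine_decay_steps left at None as prepare_optimizers does, stays constant. A quick sketch (run once the module is imported; the linear head is a stand-in for a real parameter group):

import torch.nn as nn

head = nn.Linear(8, 8)
opt, sched = CalQL.make_optimizer(head.parameters(), learning_rate=1e-4, warmup_steps=100)

for step in range(200):
    opt.step()    # gradients omitted in this sketch
    sched.step()  # linear ramp to 1e-4 over the first 100 steps, then constant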
1108
+ def prepare_optimizers(self):
1109
+ actor_optimizer, actor_scheduler = self.make_optimizer(
1110
+ self.policy.parameters(), **self.config.actor_optimizer_kwargs
1111
+ )
1112
+ critic_optimizer, critic_scheduler = self.make_optimizer(
1113
+ self.critics.parameters(), **self.config.critic_optimizer_kwargs
1114
+ )
1115
+ temperature_optimizer, temperature_scheduler = self.make_optimizer(
1116
+ self.temperature.parameters(), **self.config.temperature_optimizer_kwargs
1117
+ )
1118
+
1119
+ return (
1120
+ actor_optimizer,
1121
+ actor_scheduler,
1122
+ critic_optimizer,
1123
+ critic_scheduler,
1124
+ temperature_optimizer,
1125
+ temperature_scheduler,
1126
+ )
1127
+
1128
+ def forward(self, batch: at.CalQlBatch):
1129
+ temperature_loss, temperature_loss_info = self.temperature_loss_fn(batch)
1130
+ policy_loss, policy_loss_info = self.policy_loss_fn(batch)
1131
+ critic_loss, critic_loss_info = self.critic_loss_fn(batch)
1132
+
1133
+ return (
1134
+ temperature_loss,
1135
+ policy_loss,
1136
+ critic_loss,
1137
+ {
1138
+ **temperature_loss_info,
1139
+ **policy_loss_info,
1140
+ **critic_loss_info,
1141
+ },
1142
+ )
1143
+
1144
+ @jaxtyped(typechecker=typechecker)
1145
+ def get_q_values(
1146
+ self,
1147
+ encoded_observations: Float[
1148
+ torch.Tensor, "batch {self.config.obs_encoded_dim}"
1149
+ ],
1150
+ noise_actions: Float[torch.Tensor, "batch num_actions action_dim"],
1151
+ ) -> Float[torch.Tensor, "batch num_actions"]:
1152
+ # (num_backbones, batch, *num_actions)
1153
+ q_values = self.target_critics.forward(encoded_observations, noise_actions)
1154
+ q_values = q_values.min(dim=0)[0]
1155
+ return q_values
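Finally, a sketch of the value-query path itself: get_q_values scores a set of candidate action chunks with the target critics and keeps the ensemble minimum, which the inference side can then use to pick the highest-value candidate. Batch and candidate counts below are placeholders.

import torch

agent = CalQL(CalQlConfig()).eval()

obs = torch.randn(2, agent.config.obs_encoded_dim)               # encoded observations
candidates = torch.rand(2, 5, agent.config.action_dim) * 2 - 1   # 5 candidate actions per state

with torch.no_grad():
    q = agent.get_q_values(obs, candidates)  # (batch, num_actions) -> (2, 5)
best = q.argmax(dim=-1)                      # index of the highest-value candidate per state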