from typing import Any

from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2 import Qwen2Config
from transformers.models.siglip import SiglipVisionConfig


class NVILAConfig(PretrainedConfig):
    """Composite configuration holding a text sub-config and a vision sub-config.

    The text side is described by a ``Qwen2Config`` and the vision side by a
    ``SiglipVisionConfig``; both are stored as attributes alongside the
    image/video placeholder token ids.

    Args:
        text_config: Keyword arguments used to build a ``Qwen2Config``, an
            existing ``Qwen2Config`` instance (stored as-is), or ``None`` to
            use ``Qwen2Config()`` with its defaults.
        vision_config: Keyword arguments used to build a
            ``SiglipVisionConfig``, an existing ``SiglipVisionConfig``
            instance (stored as-is), or ``None`` to use
            ``SiglipVisionConfig()`` with its defaults.
        image_token_id: Token id stored as ``self.image_token_id``; ``-1`` is
            stored when ``None`` is given.
        video_token_id: Token id stored as ``self.video_token_id``; ``-1`` is
            stored when ``None`` is given.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "nvila"
    sub_configs = {
        "text_config": Qwen2Config,
        "vision_config": SiglipVisionConfig,
    }
    _auto_class = "AutoConfig"

    def __init__(
        self,
        *,
        text_config: dict[str, Any] | Qwen2Config | None = None,
        vision_config: dict[str, Any] | SiglipVisionConfig | None = None,
        image_token_id: int | None = None,
        video_token_id: int | None = None,
        **kwargs,
    ):
        # Sub-configs may arrive as plain dicts (e.g. from a serialized
        # config) or as ready-made config objects. Instances are kept as-is;
        # anything else is expanded as keyword arguments, so a non-mapping
        # still raises TypeError exactly as before.
        if text_config is None:
            text_config = Qwen2Config()
        elif not isinstance(text_config, Qwen2Config):
            text_config = Qwen2Config(**text_config)
        self.text_config = text_config

        if vision_config is None:
            vision_config = SiglipVisionConfig()
        elif not isinstance(vision_config, SiglipVisionConfig):
            vision_config = SiglipVisionConfig(**vision_config)
        self.vision_config = vision_config

        # NOTE(review): -1 presumably acts as an "unset" sentinel for the
        # placeholder token ids; confirm against the model/processor code.
        self.image_token_id = image_token_id if image_token_id is not None else -1
        self.video_token_id = video_token_id if video_token_id is not None else -1

        # The base initializer runs last and receives only the remaining kwargs.
        super().__init__(**kwargs)