"""
Molmo2 configuration
"""

from typing import Any, Dict, Optional, Tuple

from transformers import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging

logger = logging.get_logger(__name__)


class Molmo2VitConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Molmo2VisionTransformer`].
    It is used to instantiate a `Molmo2VisionTransformer` according to the specified arguments,
    defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import Molmo2VitConfig, Molmo2VisionTransformer

    >>> # Initializing a Molmo2VitConfig
    >>> configuration = Molmo2VitConfig()

    >>> # Initializing a Molmo2VisionTransformer (with random weights)
    >>> model = Molmo2VisionTransformer(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
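
    >>> # An added sanity check (a sketch, not part of the original example):
    >>> # the default 378x378 input with 14-pixel patches gives a 27x27 grid.
    >>> configuration.image_num_patch
    (27, 27)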
    ```"""

    model_type = "molmo2_vit"

    def __init__(
        self,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        num_hidden_layers: int = 27,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 72,
        hidden_act: str = "gelu_pytorch_tanh",
        layer_norm_eps: float = 1e-6,
        image_default_input_size: Tuple[int, int] = (378, 378),
        image_patch_size: int = 14,
        image_num_pos: int = 577,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        initializer_range: float = 0.02,
        float32_attention: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.image_default_input_size = image_default_input_size
        self.image_patch_size = image_patch_size
        self.image_num_pos = image_num_pos
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.initializer_range = initializer_range
        self.float32_attention = float32_attention

    @property
    def image_num_patch(self):
        h, w = self.image_default_input_size
        return h // self.image_patch_size, w // self.image_patch_size


class Molmo2AdapterConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Molmo2Adapter`]. Together with a
    [`Molmo2VitConfig`], it is used to instantiate a `Molmo2VisionBackbone` according to the specified
    arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import Molmo2VitConfig, Molmo2AdapterConfig, Molmo2VisionBackbone

    >>> # Initializing a Molmo2VitConfig and a Molmo2AdapterConfig
    >>> vit_config = Molmo2VitConfig()
    >>> adapter_config = Molmo2AdapterConfig()

    >>> # Initializing a Molmo2VisionBackbone (with random weights)
    >>> model = Molmo2VisionBackbone(vit_config, adapter_config)

    >>> # Accessing the model configuration
    >>> vit_configuration = model.vit_config
    >>> adapter_configuration = model.adapter_config
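
    >>> # An added sketch: the adapter projects pooled vision features into the
    >>> # language model's embedding space, so `text_hidden_size` defaults to
    >>> # the LLM hidden size.
    >>> adapter_config.text_hidden_size
    3584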
    ```"""

    def __init__(
        self,
        vit_layers: Tuple[int, int] = (-3, -9),
        hidden_size: int = 1152,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 72,
        float32_attention: bool = True,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        hidden_act: str = "silu",
        intermediate_size: int = 18944,
        text_hidden_size: int = 3584,
        image_feature_dropout: float = 0.0,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vit_layers = vit_layers
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.float32_attention = float32_attention
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.text_hidden_size = text_hidden_size
        self.image_feature_dropout = image_feature_dropout
        self.initializer_range = initializer_range


class Molmo2LlmConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Molmo2Llm`]. It is used to instantiate a
    `Molmo2Llm` according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import Molmo2LlmConfig, Molmo2Llm

    >>> # Initializing a Molmo2LlmConfig
    >>> configuration = Molmo2LlmConfig()

    >>> # Initializing a Molmo2Llm (with random weights)
    >>> model = Molmo2Llm(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
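
    >>> # An added sketch: with the defaults, attention is grouped-query,
    >>> # with 28 query heads sharing 4 key/value heads.
    >>> configuration.num_attention_heads // configuration.num_key_value_heads
    7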
    ```"""

    model_type = "molmo2_llm"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "blocks.*.self_attn.att_proj": "colwise",
        "blocks.*.self_attn.attn_out": "rowwise",
        "blocks.*.mlp.ff_proj": "colwise",
        "blocks.*.mlp.ff_out": "rowwise",
    }
    base_model_pp_plan = {
        "wte": (["input_ids"], ["inputs_embeds"]),
        "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "ln_f": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        hidden_size: int = 3584,
        num_attention_heads: int = 28,
        num_key_value_heads: Optional[int] = 4,
        head_dim: int = 128,
        vocab_size: int = 152064,
        additional_vocab_size: int = 128,
        qkv_bias: bool = True,
        num_hidden_layers: int = 48,
        intermediate_size: int = 18944,
        hidden_act: str = "silu",
        embedding_dropout: float = 0.0,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        max_position_embeddings: int = 4096,
        rope_theta: float = 1000000.0,
        rope_scaling: Optional[Dict[str, Any]] = None,
        use_qk_norm: bool = False,
        qk_norm_type: str = "olmo",
        layer_norm_eps: float = 1e-6,
        norm_after: bool = False,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
        **kwargs,
    ):
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
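        # If not set, fall back to standard multi-head attention
        # (one key/value head per query head).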
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.vocab_size = vocab_size
        self.additional_vocab_size = additional_vocab_size
        self.qkv_bias = qkv_bias
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.embedding_dropout = embedding_dropout
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.use_qk_norm = use_qk_norm
        self.qk_norm_type = qk_norm_type
        self.layer_norm_eps = layer_norm_eps
        self.norm_after = norm_after
        self.initializer_range = initializer_range
        self.use_cache = use_cache
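
        # An added note: `rope_config_validation` checks the `rope_scaling`
        # dict (when set) for a known `rope_type` and its required fields.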
        rope_config_validation(self)


class Molmo2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Molmo2ForConditionalGeneration`].
    It is used to instantiate a Molmo2 model according to the specified arguments, defining the model architecture.

    Example:

    ```python
    >>> from transformers import Molmo2Config, Molmo2VitConfig, Molmo2AdapterConfig, Molmo2LlmConfig

    >>> # Initializing a Molmo2VitConfig
    >>> vit_config = Molmo2VitConfig()

    >>> # Initializing a Molmo2AdapterConfig
    >>> adapter_config = Molmo2AdapterConfig()

    >>> # Initializing a Molmo2LlmConfig
    >>> llm_config = Molmo2LlmConfig()

    >>> # Initializing a Molmo2Config
    >>> configuration = Molmo2Config(vit_config, adapter_config, llm_config, image_patch_id=152069)

    >>> # Initializing a model
    >>> model = Molmo2ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
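
    >>> # An added sketch: sub-configs may also be passed as plain dicts
    >>> # (e.g. parsed from JSON); __init__ converts them to the typed classes.
    >>> configuration = Molmo2Config(vit_config={"image_patch_size": 14}, image_patch_id=152069)
    >>> type(configuration.vit_config).__name__
    'Molmo2VitConfig'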
    ```"""

    model_type = "molmo2"
    sub_configs = {
        "llm_config": Molmo2LlmConfig,
        "vit_config": Molmo2VitConfig,
        "adapter_config": Molmo2AdapterConfig,
    }

    def __init__(
        self,
        vit_config: Optional[Molmo2VitConfig] = None,
        adapter_config: Optional[Molmo2AdapterConfig] = None,
        llm_config: Optional[Molmo2LlmConfig] = None,
        image_patch_id: Optional[int] = None,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if vit_config is None:
            self.vit_config = Molmo2VitConfig()
        elif isinstance(vit_config, dict):
            self.vit_config = Molmo2VitConfig(**vit_config)
        else:
            self.vit_config = vit_config
        if adapter_config is None:
            self.adapter_config = Molmo2AdapterConfig()
        elif isinstance(adapter_config, dict):
            self.adapter_config = Molmo2AdapterConfig(**adapter_config)
        else:
            self.adapter_config = adapter_config
        if llm_config is None:
            self.llm_config = Molmo2LlmConfig()
        elif isinstance(llm_config, dict):
            self.llm_config = Molmo2LlmConfig(**llm_config)
        else:
            self.llm_config = llm_config
        self.image_patch_id = image_patch_id
        self.initializer_range = initializer_range
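
    # Convenience accessors (an added note): these forward standard fields to
    # the text sub-config so generic code that reads them off the top-level
    # config keeps working.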
    @property
    def image_num_patch(self):
        assert self.vit_config is not None
        return self.vit_config.image_num_patch

    @property
    def num_attention_heads(self):
        return self.llm_config.num_attention_heads

    @property
    def num_key_value_heads(self):
        return self.llm_config.num_key_value_heads

    @property
    def head_dim(self):
        return self.llm_config.head_dim

    @property
    def num_hidden_layers(self):
        return self.llm_config.num_hidden_layers

    @property
    def hidden_size(self):
        return self.llm_config.hidden_size

    @property
    def vocab_size(self):
        return self.llm_config.vocab_size

    @property
    def max_position_embeddings(self):
        return self.llm_config.max_position_embeddings
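

# An added note: registering for auto classes lets these configs load through
# AutoConfig when the checkpoint is used with `trust_remote_code=True`.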
Molmo2VitConfig.register_for_auto_class()
Molmo2AdapterConfig.register_for_auto_class()
Molmo2LlmConfig.register_for_auto_class()
Molmo2Config.register_for_auto_class()