quant12 committed on
Commit 40f6c31 · 1 Parent(s): 3b3c2ff

update to plamo2

Files changed (4)
  1. config.json +47 -47
  2. modeling_plamo.py +103 -95
  3. tokenization_plamo.py +1 -1
  4. tokenizer_config.json +52 -52
config.json CHANGED
@@ -1,49 +1,49 @@
1
  {
2
- "architectures": [
3
- "PlamoForCausalLM"
4
- ],
5
- "attention_window_size": 2048,
6
- "auto_map": {
7
- "AutoConfig": "modeling_plamo.PlamoConfig",
8
- "AutoModelForCausalLM": "modeling_plamo.PlamoForCausalLM"
9
- },
10
- "bos_token_id": 1,
11
- "capacity_factor": 1.0,
12
- "eos_token_id": 2,
13
- "eval_attention_n_bit": null,
14
- "eval_mlp_n_bit": null,
15
- "expert_dropout": 0.0,
16
- "fp8_accum_dtype": "bfloat16",
17
- "group_size": 1024,
18
- "hidden_size": 2048,
19
- "hidden_size_per_head": 128,
20
- "image_feature_size": null,
21
- "image_proj_type": "linear",
22
- "image_token_id": null,
23
- "intermediate_size": 8192,
24
- "k_expert": null,
25
- "linear_type": "fp8",
26
- "mamba_chunk_size": 256,
27
- "mamba_d_conv": 4,
28
- "mamba_d_state": 64,
29
- "mamba_enabled": true,
30
- "mamba_num_heads": 32,
31
- "mamba_step": 2,
32
- "max_position_embeddings": 10485760,
33
- "model_type": "plamo2",
34
- "n_expert": null,
35
- "num_attention_heads": 16,
36
- "num_hidden_layers": 16,
37
- "num_key_value_heads": 1,
38
- "rms_norm_eps": 1e-06,
39
- "shared_intermediate_size": null,
40
- "sliding_window": 2048,
41
- "sparse_intermediate_size": null,
42
- "sparse_step": null,
43
- "tokenizer_class": "PlamoTokenizer",
44
- "torch_dtype": "float32",
45
- "transformers_version": "4.44.2",
46
- "use_cache": true,
47
- "use_predefined_initial_state": false,
48
- "vocab_size": 100000
49
  }
 
1
  {
2
+ "architectures": [
3
+ "Plamo2ForCausalLM"
4
+ ],
5
+ "attention_window_size": 2048,
6
+ "auto_map": {
7
+ "AutoConfig": "modeling_plamo.Plamo2Config",
8
+ "AutoModelForCausalLM": "modeling_plamo.Plamo2ForCausalLM"
9
+ },
10
+ "bos_token_id": 1,
11
+ "capacity_factor": 1.0,
12
+ "eos_token_id": 2,
13
+ "eval_attention_n_bit": null,
14
+ "eval_mlp_n_bit": null,
15
+ "expert_dropout": 0.0,
16
+ "fp8_accum_dtype": "bfloat16",
17
+ "group_size": 1024,
18
+ "hidden_size": 2048,
19
+ "hidden_size_per_head": 128,
20
+ "image_feature_size": null,
21
+ "image_proj_type": "linear",
22
+ "image_token_id": null,
23
+ "intermediate_size": 8192,
24
+ "k_expert": null,
25
+ "linear_type": "fp8",
26
+ "mamba_chunk_size": 256,
27
+ "mamba_d_conv": 4,
28
+ "mamba_d_state": 64,
29
+ "mamba_enabled": true,
30
+ "mamba_num_heads": 32,
31
+ "mamba_step": 2,
32
+ "max_position_embeddings": 10485760,
33
+ "model_type": "plamo2",
34
+ "n_expert": null,
35
+ "num_attention_heads": 16,
36
+ "num_hidden_layers": 16,
37
+ "num_key_value_heads": 1,
38
+ "rms_norm_eps": 1e-06,
39
+ "shared_intermediate_size": null,
40
+ "sliding_window": 2048,
41
+ "sparse_intermediate_size": null,
42
+ "sparse_step": null,
43
+ "tokenizer_class": "Plamo2Tokenizer",
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.44.2",
46
+ "use_cache": true,
47
+ "use_predefined_initial_state": false,
48
+ "vocab_size": 100000
49
  }
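
The new auto_map in config.json routes AutoConfig and AutoModelForCausalLM to the renamed Plamo2Config and Plamo2ForCausalLM classes in modeling_plamo.py, so the checkpoint has to be loaded with remote code enabled. A minimal loading sketch; the checkpoint path below is a placeholder, not part of this commit:

from transformers import AutoConfig, AutoModelForCausalLM

# "path/to/plamo2-checkpoint" is a placeholder for wherever this repository is downloaded.
config = AutoConfig.from_pretrained("path/to/plamo2-checkpoint", trust_remote_code=True)
print(config.model_type)     # "plamo2"
print(config.architectures)  # ["Plamo2ForCausalLM"]

# trust_remote_code=True is required so that modeling_plamo.py from the repository is imported.
model = AutoModelForCausalLM.from_pretrained("path/to/plamo2-checkpoint", trust_remote_code=True)
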
modeling_plamo.py CHANGED
@@ -105,8 +105,8 @@ class LinearType(str, enum.Enum):
105
  Fp8Retain = "fp8-retain"
106
 
107
 
108
- class PlamoConfig(PretrainedConfig): # type: ignore
109
- model_type: str = "plamo"
110
 
111
  def __init__(
112
  self,
@@ -121,6 +121,8 @@ class PlamoConfig(PretrainedConfig): # type: ignore
121
  max_position_embeddings: int = 2048,
122
  attention_window_size: int = 2048,
123
  full_attention_idx: list[int] | None = None,
 
 
124
  # Mamba
125
  mamba_d_state: int = 64,
126
  mamba_d_conv: int = 4,
@@ -132,7 +134,7 @@ class PlamoConfig(PretrainedConfig): # type: ignore
132
  intermediate_size: int = 13312,
133
  # Tokenizer
134
  vocab_size: int = 32000,
135
- tokenizer_class: str = "PlamoTokenizer",
136
  pad_token_id: Optional[int] = None,
137
  bos_token_id: int = 1,
138
  eos_token_id: int = 2,
@@ -161,6 +163,8 @@ class PlamoConfig(PretrainedConfig): # type: ignore
161
  self.num_key_value_heads = num_key_value_heads
162
  self.attention_window_size = attention_window_size
163
  self.full_attention_idx = full_attention_idx if full_attention_idx is not None else []
 
 
164
 
165
  self.mamba_d_state = mamba_d_state
166
  self.mamba_d_conv = mamba_d_conv
@@ -196,8 +200,16 @@ class PlamoConfig(PretrainedConfig): # type: ignore
196
  **kwargs,
197
  )
198
 
 
 
 
 
 
 
 
199
 
200
- class PlamoAttentionCache(torch.nn.Module):
 
201
  def __init__(self, key: torch.Tensor, value: torch.Tensor) -> None:
202
  super().__init__()
203
  B, nh, L, c = key.shape
@@ -208,7 +220,7 @@ class PlamoAttentionCache(torch.nn.Module):
208
  self.register_parameter("value", torch.nn.Parameter(value, requires_grad=False))
209
 
210
 
211
- class PlamoMambaCache(torch.nn.Module):
212
  def __init__(self, conv_state: torch.Tensor, ssm_state: torch.Tensor) -> None:
213
  super().__init__()
214
  # conv_state: [B, C, d_conv]
@@ -220,10 +232,10 @@ class PlamoMambaCache(torch.nn.Module):
220
  self.register_parameter("ssm_state", torch.nn.Parameter(ssm_state, requires_grad=False))
221
 
222
 
223
- PlamoLayerCache = PlamoAttentionCache | PlamoMambaCache
224
 
225
 
226
- class PlamoCache(torch.nn.Module):
227
  """
228
  stores states of the model for fast decoding.
229
  `transformers` uses `transformers.Cache` for this purpose, but the interface and variable names are
@@ -233,7 +245,7 @@ class PlamoCache(torch.nn.Module):
233
  the state of Mamba properly.
234
  """
235
 
236
- def __init__(self, config: PlamoConfig) -> None:
237
  super().__init__()
238
  self.config = config
239
  self.cache = torch.nn.ModuleList([None for _ in range(config.num_hidden_layers)]) # type: ignore
@@ -242,7 +254,7 @@ class PlamoCache(torch.nn.Module):
242
  c = self.cache[layer_idx]
243
  if c is None:
244
  return key, value
245
- assert isinstance(c, PlamoAttentionCache)
246
 
247
  def _validate(cache: torch.Tensor, new_tensor: torch.Tensor) -> None:
248
  assert len(cache.shape) == 4
@@ -258,20 +270,20 @@ class PlamoCache(torch.nn.Module):
258
 
259
  def update_attention(
260
  self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int
261
- ) -> PlamoAttentionCache:
262
  full_attn = layer_idx in self.config.full_attention_idx
263
  window_size = self.config.attention_window_size
264
 
265
  if self.cache[layer_idx] is None:
266
  if full_attn:
267
- self.cache[layer_idx] = PlamoAttentionCache(key_states, value_states)
268
  else:
269
- self.cache[layer_idx] = PlamoAttentionCache(
270
  key_states[:, :, -window_size:, :], value_states[:, :, -window_size:, :]
271
  )
272
  else:
273
  c = self.cache[layer_idx]
274
- assert isinstance(c, PlamoAttentionCache)
275
  k, v = self.append_kv(key_states, value_states, layer_idx)
276
  if full_attn:
277
  c.key.data = k
@@ -281,19 +293,19 @@ class PlamoCache(torch.nn.Module):
281
  c.value.data = v[:, :, -window_size:, :]
282
  return self.cache[layer_idx] # type: ignore
283
 
284
- def update_mamba(self, conv_state: torch.Tensor, ssm_state: torch.Tensor, layer_idx: int) -> PlamoMambaCache:
285
  if self.cache[layer_idx] is None:
286
- self.cache[layer_idx] = PlamoMambaCache(conv_state, ssm_state)
287
  else:
288
  c = self.cache[layer_idx]
289
- assert isinstance(c, PlamoMambaCache)
290
  assert c.conv_state.shape == conv_state.shape
291
  assert c.ssm_state.shape == ssm_state.shape
292
  c.conv_state.data = conv_state
293
  c.ssm_state.data = ssm_state
294
  return self.cache[layer_idx] # type: ignore
295
 
296
- def __getitem__(self, layer_idx: int) -> PlamoLayerCache | None:
297
  assert layer_idx < len(self.cache)
298
  layer_cache = self.cache[layer_idx]
299
  return layer_cache # type: ignore
@@ -304,12 +316,12 @@ class PlamoCache(torch.nn.Module):
304
  def get_seq_length(self, layer_idx: Optional[int] = None) -> int:
305
  if layer_idx is not None:
306
  c = self.cache[layer_idx]
307
- assert isinstance(c, PlamoAttentionCache)
308
  return c.key.shape[2] # type: ignore
309
 
310
  sequence_length: int | None = None
311
  for layer_cache in self.cache:
312
- if isinstance(layer_cache, PlamoAttentionCache):
313
  sequence_length = (
314
  max(layer_cache.key.shape[2], sequence_length)
315
  if sequence_length is not None
@@ -333,14 +345,14 @@ class PlamoCache(torch.nn.Module):
333
  return previous_seq_length
334
 
335
  def reorder_cache(self, beam_idx: torch.Tensor) -> None:
336
- def _mamba(cache: PlamoMambaCache) -> PlamoMambaCache:
337
- return PlamoMambaCache(
338
  conv_state=cache.conv_state.index_select(0, beam_idx),
339
  ssm_state=cache.ssm_state.index_select(0, beam_idx),
340
  )
341
 
342
- def _attention(cache: PlamoAttentionCache) -> PlamoAttentionCache:
343
- return PlamoAttentionCache(
344
  key=cache.key.index_select(0, beam_idx),
345
  value=cache.value.index_select(0, beam_idx),
346
  )
@@ -349,10 +361,10 @@ class PlamoCache(torch.nn.Module):
349
  if self.cache[i] is None:
350
  continue
351
  layer_cache = self.cache[i]
352
- if isinstance(layer_cache, PlamoMambaCache):
353
  self.cache[i] = _mamba(layer_cache)
354
  else:
355
- assert isinstance(layer_cache, PlamoAttentionCache)
356
  self.cache[i] = _attention(layer_cache)
357
 
358
  @property
@@ -363,7 +375,7 @@ class PlamoCache(torch.nn.Module):
363
  class DecoderInput(NamedTuple):
364
  hidden_states: torch.Tensor
365
  attention_mask: Optional[torch.Tensor] = None
366
- past_states: Optional[PlamoCache] = None
367
  output_hidden_states: Optional[bool] = False
368
  output_attentions: Optional[bool] = False
369
  gradient_checkpointing: bool = False
@@ -810,7 +822,7 @@ def _causal_conv1d(
810
 
811
 
812
  class Mamba(torch.nn.Module):
813
- def __init__(self, config: PlamoConfig, layer_idx: int) -> None:
814
  super().__init__()
815
  self.config = config
816
  self.layer_idx = layer_idx
@@ -862,8 +874,8 @@ class Mamba(torch.nn.Module):
862
  self,
863
  hidden_states: torch.Tensor,
864
  attention_mask: Optional[torch.Tensor] = None,
865
- past_states: Optional[PlamoCache] = None,
866
- ) -> Tuple[torch.Tensor, Optional[PlamoCache]]:
867
  bsize, length, _ = hidden_states.shape
868
  is_update = length == 1 and past_states is not None
869
 
@@ -905,7 +917,7 @@ class Mamba(torch.nn.Module):
905
  )
906
  else:
907
  c = past_states[self.layer_idx]
908
- assert isinstance(c, PlamoMambaCache)
909
  conv_state = c.conv_state
910
  ssm_state = c.ssm_state
911
 
@@ -1022,7 +1034,7 @@ def swa_mask(q_len: int, kv_len: int, device: torch.device, window_size: int) ->
1022
 
1023
 
1024
  class Attention(torch.nn.Module):
1025
- def __init__(self, config: PlamoConfig, layer_idx: int) -> None:
1026
  super().__init__()
1027
  self.config = config
1028
  self.layer_idx = layer_idx
@@ -1045,15 +1057,19 @@ class Attention(torch.nn.Module):
1045
  self.q_weight = torch.nn.Parameter(torch.ones((self.q_num_heads, self.qk_dim)))
1046
  self.k_weight = torch.nn.Parameter(torch.ones((self.k_num_heads, self.qk_dim)))
1047
 
1048
- self.rotary_emb = RotaryEmbedding(self.qk_dim, max_position_embeddings=self.config.attention_window_size)
 
 
 
 
1049
 
1050
  def forward(
1051
  self,
1052
  hidden_states: torch.Tensor,
1053
  attention_mask: Optional[torch.Tensor] = None,
1054
- past_states: Optional[PlamoCache] = None,
1055
  output_attentions: bool = False,
1056
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[PlamoCache]]:
1057
  bsz, q_len, _ = hidden_states.size()
1058
 
1059
  qkv = self.qkv_proj(hidden_states)
@@ -1094,15 +1110,13 @@ class Attention(torch.nn.Module):
1094
  key_states = _expand_kv(key_states, self.n_group, self.q_num_heads)
1095
  value_states = _expand_kv(value_states, self.n_group, self.q_num_heads)
1096
 
1097
- full_attn = self.layer_idx in self.config.full_attention_idx
1098
-
1099
  query_states = query_states.to(attn_dtype)
1100
  key_states = key_states.to(attn_dtype)
1101
  value_states = value_states.to(attn_dtype)
1102
  if attention_mask is not None and attention_mask.dtype != torch.bool:
1103
  attention_mask = attention_mask.to(attn_dtype)
1104
  if attention_mask is None:
1105
- if not full_attn:
1106
  assert key_states.shape[2] <= self.config.attention_window_size + 1
1107
  attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, is_causal=True)
1108
  else:
@@ -1112,7 +1126,7 @@ class Attention(torch.nn.Module):
1112
  attention_mask = attention_mask[None, None]
1113
  assert len(attention_mask.shape) == 4
1114
 
1115
- if not full_attn:
1116
  m_swa = swa_mask(
1117
  query_states.shape[2], key_states.shape[2], query_states.device, self.config.attention_window_size
1118
  )
@@ -1142,7 +1156,7 @@ class Attention(torch.nn.Module):
1142
 
1143
 
1144
  class MLP(nn.Module):
1145
- def __init__(self, config: PlamoConfig) -> None:
1146
  super().__init__()
1147
  self.config = config
1148
  self.hidden_size = config.hidden_size
@@ -1156,14 +1170,14 @@ class MLP(nn.Module):
1156
  return self.down_proj(h) # type: ignore
1157
 
1158
 
1159
- class PlamoDecoderLayer(torch.nn.Module):
1160
- def __init__(self, config: PlamoConfig, is_mamba: bool, layer_idx: int) -> None:
1161
  super().__init__()
1162
  self.config = config
1163
  self.hidden_size = config.hidden_size
1164
- self.is_mamba = is_mamba
1165
  self.mixer: torch.nn.Module
1166
- if is_mamba:
1167
  self.mixer = Mamba(config, layer_idx)
1168
  else:
1169
  self.mixer = Attention(config, layer_idx)
@@ -1180,7 +1194,7 @@ class PlamoDecoderLayer(torch.nn.Module):
1180
  self,
1181
  hidden_states: torch.Tensor,
1182
  attention_mask: Optional[torch.Tensor] = None,
1183
- past_state: Optional[PlamoCache] = None,
1184
  output_attentions: Optional[bool] = False,
1185
  ) -> Tuple[Any, ...]:
1186
  # from LlamaDecoder
@@ -1224,7 +1238,7 @@ class PlamoDecoderLayer(torch.nn.Module):
1224
  return outputs # type: ignore
1225
 
1226
 
1227
- def is_mamba(config: PlamoConfig, i: int) -> bool:
1228
  if not config.mamba_enabled:
1229
  return False
1230
  assert config.mamba_step > 1
@@ -1236,15 +1250,12 @@ def is_mamba(config: PlamoConfig, i: int) -> bool:
1236
  return (i % config.mamba_step) != (config.mamba_step // 2)
1237
 
1238
 
1239
- class PlamoDecoder(torch.nn.Module):
1240
- def __init__(self, config: PlamoConfig) -> None:
1241
  super().__init__()
1242
 
1243
  self.layers = torch.nn.ModuleList(
1244
- [
1245
- PlamoDecoderLayer(config, is_mamba=is_mamba(config, i), layer_idx=i)
1246
- for i in range(config.num_hidden_layers)
1247
- ]
1248
  )
1249
  self.gradient_checkpointing = False
1250
 
@@ -1283,8 +1294,8 @@ class PlamoDecoder(torch.nn.Module):
1283
  return DecoderOutput(hidden_states, all_hidden_states, all_self_attns)
1284
 
1285
 
1286
- class PlamoPreTrainedModel(PreTrainedModel): # type: ignore
1287
- config_class = PlamoConfig
1288
  _no_split_modules: List[str]
1289
  base_model_prefix = "model"
1290
  supports_gradient_checkpointing = True
@@ -1304,8 +1315,8 @@ class PlamoPreTrainedModel(PreTrainedModel): # type: ignore
1304
  module.weight.data[module.padding_idx].zero_()
1305
 
1306
 
1307
- class PlamoModel(PlamoPreTrainedModel):
1308
- def __init__(self, config: PlamoConfig):
1309
  super().__init__(config)
1310
  assert config.eval_attention_n_bit is None
1311
  assert config.eval_mlp_n_bit is None
@@ -1321,7 +1332,7 @@ class PlamoModel(PlamoPreTrainedModel):
1321
  self.image_proj = nn.Linear(config.image_feature_size, config.hidden_size, bias=False) # type: ignore
1322
  else:
1323
  raise ValueError(f"Unknown image_proj_type: {config.image_proj_type}")
1324
- self.layers = PlamoDecoder(config) # type: ignore
1325
  self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1326
 
1327
  self.gradient_checkpointing = False
@@ -1376,15 +1387,16 @@ class PlamoModel(PlamoPreTrainedModel):
1376
  input_ids: Optional[torch.LongTensor] = None,
1377
  attention_mask: Optional[torch.Tensor] = None,
1378
  position_ids: Optional[torch.Tensor] = None,
1379
- past_key_values: Optional[PlamoCache] = None,
1380
  inputs_embeds: Optional[torch.Tensor] = None,
1381
  image_features: Optional[torch.Tensor] = None,
1382
  use_cache: Optional[bool] = None,
1383
  output_attentions: Optional[bool] = None,
1384
  output_hidden_states: Optional[bool] = None,
1385
  return_dict: Optional[bool] = None,
 
 
1386
  ) -> Union[Tuple, BaseModelOutputWithPast]:
1387
- assert input_ids is not None
1388
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1389
  output_hidden_states = (
1390
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1394,22 +1406,22 @@ class PlamoModel(PlamoPreTrainedModel):
1394
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1395
 
1396
  # retrieve input_ids and inputs_embeds
1397
- if input_ids is not None and inputs_embeds is not None:
1398
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
1399
- elif input_ids is not None:
1400
- batch_size, seq_length = input_ids.shape
1401
- else:
1402
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
 
 
1403
 
1404
  seq_length_with_past = seq_length
1405
  past_key_values_length = 0
1406
-
1407
  if past_key_values is not None:
1408
  past_key_values_length = past_key_values.get_seq_length()
1409
  seq_length_with_past = seq_length_with_past + past_key_values_length
1410
-
1411
- if inputs_embeds is None:
1412
- inputs_embeds = self.embed_tokens(input_ids)
1413
 
1414
  if image_features is not None:
1415
  assert self.config.image_token_id is not None
@@ -1435,12 +1447,8 @@ class PlamoModel(PlamoPreTrainedModel):
1435
 
1436
  hidden_states = inputs_embeds
1437
 
1438
- if self.gradient_checkpointing and self.training:
1439
- if use_cache:
1440
- use_cache = False
1441
-
1442
  if use_cache and past_key_values is None:
1443
- past_key_values = PlamoCache(self.config)
1444
 
1445
  # decoder layers
1446
  out = self.layers(
@@ -1477,7 +1485,7 @@ class PlamoModel(PlamoPreTrainedModel):
1477
  )
1478
 
1479
 
1480
- class PlamoForCausalLM(PlamoPreTrainedModel):
1481
  _tied_weights_keys = ["lm_head.weight"]
1482
 
1483
  # Without this, the model cannot be loaded into a meta device.
@@ -1487,9 +1495,9 @@ class PlamoForCausalLM(PlamoPreTrainedModel):
1487
  # https://github.com/pytorch/pytorch/blob/v2.4.1/torch/nn/modules/module.py#L2068
1488
  _supports_param_buffer_assignment = False
1489
 
1490
- def __init__(self, config: PlamoConfig) -> None:
1491
  super().__init__(config)
1492
- self.model = PlamoModel(config)
1493
 
1494
  self.vocab_size = config.vocab_size
1495
  vocab_size = ((self.vocab_size + 15) // 16) * 16
@@ -1510,10 +1518,10 @@ class PlamoForCausalLM(PlamoPreTrainedModel):
1510
  def set_output_embeddings(self, new_embeddings: torch.nn.Module) -> None:
1511
  self.lm_head = new_embeddings
1512
 
1513
- def set_decoder(self, decoder: PlamoModel) -> None:
1514
  self.model = decoder
1515
 
1516
- def get_decoder(self) -> PlamoModel:
1517
  return self.model
1518
 
1519
  def forward( # type: ignore
@@ -1521,7 +1529,7 @@ class PlamoForCausalLM(PlamoPreTrainedModel):
1521
  input_ids: Optional[torch.LongTensor] = None,
1522
  attention_mask: Optional[torch.Tensor] = None,
1523
  position_ids: Optional[torch.Tensor] = None,
1524
- past_key_values: Optional[PlamoCache] = None,
1525
  inputs_embeds: Optional[torch.FloatTensor] = None,
1526
  image_features: Optional[torch.Tensor] = None,
1527
  labels: Optional[torch.LongTensor] = None,
@@ -1529,6 +1537,9 @@ class PlamoForCausalLM(PlamoPreTrainedModel):
1529
  output_attentions: Optional[bool] = None,
1530
  output_hidden_states: Optional[bool] = None,
1531
  return_dict: Optional[bool] = None,
 
 
 
1532
  ) -> Union[Tuple, CausalLMOutputWithPast]:
1533
  r"""
1534
  Args:
@@ -1555,8 +1566,6 @@ class PlamoForCausalLM(PlamoPreTrainedModel):
1555
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1556
  "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
1557
  ```"""
1558
- assert input_ids is not None
1559
-
1560
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1561
  output_hidden_states = (
1562
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1575,24 +1584,23 @@ class PlamoForCausalLM(PlamoPreTrainedModel):
1575
  output_attentions=output_attentions,
1576
  output_hidden_states=output_hidden_states,
1577
  return_dict=return_dict,
 
 
1578
  )
1579
 
1580
  hidden_states = outputs[0]
1581
  logits = self.lm_head(hidden_states)
1582
- logits = logits[..., : self.vocab_size]
 
1583
 
1584
  loss = None
1585
  if labels is not None:
1586
- # Shift so that tokens < n predict n
1587
- shift_logits = logits[..., :-1, :].contiguous()
1588
- shift_labels = labels[..., 1:].contiguous()
1589
- # Flatten the tokens
1590
- loss_fct = nn.CrossEntropyLoss()
1591
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
1592
- shift_labels = shift_labels.view(-1)
1593
- # Enable model parallelism
1594
- shift_labels = shift_labels.to(shift_logits.device)
1595
- loss = loss_fct(shift_logits, shift_labels)
1596
 
1597
  if not return_dict:
1598
  output = (logits,) + outputs[1:]
@@ -1609,7 +1617,7 @@ class PlamoForCausalLM(PlamoPreTrainedModel):
1609
  def prepare_inputs_for_generation(
1610
  self,
1611
  input_ids: torch.Tensor,
1612
- past_key_values: Optional[PlamoCache] = None,
1613
  attention_mask: Optional[torch.Tensor] = None,
1614
  inputs_embeds: Optional[torch.Tensor] = None,
1615
  image_features: Optional[torch.Tensor] = None,
@@ -1646,13 +1654,13 @@ class PlamoForCausalLM(PlamoPreTrainedModel):
1646
  return model_inputs
1647
 
1648
  @staticmethod
1649
- def _reorder_cache(past_key_values: PlamoCache, beam_idx: torch.Tensor) -> PlamoCache:
1650
  past_key_values.reorder_cache(beam_idx)
1651
  return past_key_values
1652
 
1653
 
1654
  class MLPImageProjector(nn.Module):
1655
- def __init__(self, config: PlamoConfig) -> None:
1656
  super().__init__()
1657
  self.config = config
1658
 
 
105
  Fp8Retain = "fp8-retain"
106
 
107
 
108
+ class Plamo2Config(PretrainedConfig): # type: ignore
109
+ model_type: str = "plamo2"
110
 
111
  def __init__(
112
  self,
 
121
  max_position_embeddings: int = 2048,
122
  attention_window_size: int = 2048,
123
  full_attention_idx: list[int] | None = None,
124
+ rope_theta: int = 10000,
125
+ rope_local_theta: int = 10000,
126
  # Mamba
127
  mamba_d_state: int = 64,
128
  mamba_d_conv: int = 4,
 
134
  intermediate_size: int = 13312,
135
  # Tokenizer
136
  vocab_size: int = 32000,
137
+ tokenizer_class: str = "Plamo2Tokenizer",
138
  pad_token_id: Optional[int] = None,
139
  bos_token_id: int = 1,
140
  eos_token_id: int = 2,
 
163
  self.num_key_value_heads = num_key_value_heads
164
  self.attention_window_size = attention_window_size
165
  self.full_attention_idx = full_attention_idx if full_attention_idx is not None else []
166
+ self.rope_theta = rope_theta
167
+ self.rope_local_theta = rope_local_theta
168
 
169
  self.mamba_d_state = mamba_d_state
170
  self.mamba_d_conv = mamba_d_conv
 
200
  **kwargs,
201
  )
202
 
203
+ @property
204
+ def layers_block_type(self) -> list[str]:
205
+ return ["mamba" if is_mamba(self, i) else "attention" for i in range(self.num_hidden_layers)]
206
+
207
+ @property
208
+ def rope_local_base_freq(self) -> int:
209
+ return self.rope_local_theta
210
 
211
+
212
+ class Plamo2AttentionCache(torch.nn.Module):
213
  def __init__(self, key: torch.Tensor, value: torch.Tensor) -> None:
214
  super().__init__()
215
  B, nh, L, c = key.shape
 
220
  self.register_parameter("value", torch.nn.Parameter(value, requires_grad=False))
221
 
222
 
223
+ class Plamo2MambaCache(torch.nn.Module):
224
  def __init__(self, conv_state: torch.Tensor, ssm_state: torch.Tensor) -> None:
225
  super().__init__()
226
  # conv_state: [B, C, d_conv]
 
232
  self.register_parameter("ssm_state", torch.nn.Parameter(ssm_state, requires_grad=False))
233
 
234
 
235
+ Plamo2LayerCache = Plamo2AttentionCache | Plamo2MambaCache
236
 
237
 
238
+ class Plamo2Cache(torch.nn.Module):
239
  """
240
  stores states of the model for fast decoding.
241
  `transformers` uses `transformers.Cache` for this purpose, but the interface and variable names are
 
245
  the state of Mamba properly.
246
  """
247
 
248
+ def __init__(self, config: Plamo2Config) -> None:
249
  super().__init__()
250
  self.config = config
251
  self.cache = torch.nn.ModuleList([None for _ in range(config.num_hidden_layers)]) # type: ignore
 
254
  c = self.cache[layer_idx]
255
  if c is None:
256
  return key, value
257
+ assert isinstance(c, Plamo2AttentionCache)
258
 
259
  def _validate(cache: torch.Tensor, new_tensor: torch.Tensor) -> None:
260
  assert len(cache.shape) == 4
 
270
 
271
  def update_attention(
272
  self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int
273
+ ) -> Plamo2AttentionCache:
274
  full_attn = layer_idx in self.config.full_attention_idx
275
  window_size = self.config.attention_window_size
276
 
277
  if self.cache[layer_idx] is None:
278
  if full_attn:
279
+ self.cache[layer_idx] = Plamo2AttentionCache(key_states, value_states)
280
  else:
281
+ self.cache[layer_idx] = Plamo2AttentionCache(
282
  key_states[:, :, -window_size:, :], value_states[:, :, -window_size:, :]
283
  )
284
  else:
285
  c = self.cache[layer_idx]
286
+ assert isinstance(c, Plamo2AttentionCache)
287
  k, v = self.append_kv(key_states, value_states, layer_idx)
288
  if full_attn:
289
  c.key.data = k
 
293
  c.value.data = v[:, :, -window_size:, :]
294
  return self.cache[layer_idx] # type: ignore
295
 
296
+ def update_mamba(self, conv_state: torch.Tensor, ssm_state: torch.Tensor, layer_idx: int) -> Plamo2MambaCache:
297
  if self.cache[layer_idx] is None:
298
+ self.cache[layer_idx] = Plamo2MambaCache(conv_state, ssm_state)
299
  else:
300
  c = self.cache[layer_idx]
301
+ assert isinstance(c, Plamo2MambaCache)
302
  assert c.conv_state.shape == conv_state.shape
303
  assert c.ssm_state.shape == ssm_state.shape
304
  c.conv_state.data = conv_state
305
  c.ssm_state.data = ssm_state
306
  return self.cache[layer_idx] # type: ignore
307
 
308
+ def __getitem__(self, layer_idx: int) -> Plamo2LayerCache | None:
309
  assert layer_idx < len(self.cache)
310
  layer_cache = self.cache[layer_idx]
311
  return layer_cache # type: ignore
 
316
  def get_seq_length(self, layer_idx: Optional[int] = None) -> int:
317
  if layer_idx is not None:
318
  c = self.cache[layer_idx]
319
+ assert isinstance(c, Plamo2AttentionCache)
320
  return c.key.shape[2] # type: ignore
321
 
322
  sequence_length: int | None = None
323
  for layer_cache in self.cache:
324
+ if isinstance(layer_cache, Plamo2AttentionCache):
325
  sequence_length = (
326
  max(layer_cache.key.shape[2], sequence_length)
327
  if sequence_length is not None
 
345
  return previous_seq_length
346
 
347
  def reorder_cache(self, beam_idx: torch.Tensor) -> None:
348
+ def _mamba(cache: Plamo2MambaCache) -> Plamo2MambaCache:
349
+ return Plamo2MambaCache(
350
  conv_state=cache.conv_state.index_select(0, beam_idx),
351
  ssm_state=cache.ssm_state.index_select(0, beam_idx),
352
  )
353
 
354
+ def _attention(cache: Plamo2AttentionCache) -> Plamo2AttentionCache:
355
+ return Plamo2AttentionCache(
356
  key=cache.key.index_select(0, beam_idx),
357
  value=cache.value.index_select(0, beam_idx),
358
  )
 
361
  if self.cache[i] is None:
362
  continue
363
  layer_cache = self.cache[i]
364
+ if isinstance(layer_cache, Plamo2MambaCache):
365
  self.cache[i] = _mamba(layer_cache)
366
  else:
367
+ assert isinstance(layer_cache, Plamo2AttentionCache)
368
  self.cache[i] = _attention(layer_cache)
369
 
370
  @property
 
375
  class DecoderInput(NamedTuple):
376
  hidden_states: torch.Tensor
377
  attention_mask: Optional[torch.Tensor] = None
378
+ past_states: Optional[Plamo2Cache] = None
379
  output_hidden_states: Optional[bool] = False
380
  output_attentions: Optional[bool] = False
381
  gradient_checkpointing: bool = False
 
822
 
823
 
824
  class Mamba(torch.nn.Module):
825
+ def __init__(self, config: Plamo2Config, layer_idx: int) -> None:
826
  super().__init__()
827
  self.config = config
828
  self.layer_idx = layer_idx
 
874
  self,
875
  hidden_states: torch.Tensor,
876
  attention_mask: Optional[torch.Tensor] = None,
877
+ past_states: Optional[Plamo2Cache] = None,
878
+ ) -> Tuple[torch.Tensor, Optional[Plamo2Cache]]:
879
  bsize, length, _ = hidden_states.shape
880
  is_update = length == 1 and past_states is not None
881
 
 
917
  )
918
  else:
919
  c = past_states[self.layer_idx]
920
+ assert isinstance(c, Plamo2MambaCache)
921
  conv_state = c.conv_state
922
  ssm_state = c.ssm_state
923
 
 
1034
 
1035
 
1036
  class Attention(torch.nn.Module):
1037
+ def __init__(self, config: Plamo2Config, layer_idx: int) -> None:
1038
  super().__init__()
1039
  self.config = config
1040
  self.layer_idx = layer_idx
 
1057
  self.q_weight = torch.nn.Parameter(torch.ones((self.q_num_heads, self.qk_dim)))
1058
  self.k_weight = torch.nn.Parameter(torch.ones((self.k_num_heads, self.qk_dim)))
1059
 
1060
+ self.full_attn = self.layer_idx in self.config.full_attention_idx
1061
+ base = self.config.rope_theta if self.full_attn else self.config.rope_local_theta
1062
+ self.rotary_emb = RotaryEmbedding(
1063
+ self.qk_dim, max_position_embeddings=self.config.attention_window_size, base=base
1064
+ )
1065
 
1066
  def forward(
1067
  self,
1068
  hidden_states: torch.Tensor,
1069
  attention_mask: Optional[torch.Tensor] = None,
1070
+ past_states: Optional[Plamo2Cache] = None,
1071
  output_attentions: bool = False,
1072
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Plamo2Cache]]:
1073
  bsz, q_len, _ = hidden_states.size()
1074
 
1075
  qkv = self.qkv_proj(hidden_states)
 
1110
  key_states = _expand_kv(key_states, self.n_group, self.q_num_heads)
1111
  value_states = _expand_kv(value_states, self.n_group, self.q_num_heads)
1112
 
 
 
1113
  query_states = query_states.to(attn_dtype)
1114
  key_states = key_states.to(attn_dtype)
1115
  value_states = value_states.to(attn_dtype)
1116
  if attention_mask is not None and attention_mask.dtype != torch.bool:
1117
  attention_mask = attention_mask.to(attn_dtype)
1118
  if attention_mask is None:
1119
+ if not self.full_attn:
1120
  assert key_states.shape[2] <= self.config.attention_window_size + 1
1121
  attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, is_causal=True)
1122
  else:
 
1126
  attention_mask = attention_mask[None, None]
1127
  assert len(attention_mask.shape) == 4
1128
 
1129
+ if not self.full_attn:
1130
  m_swa = swa_mask(
1131
  query_states.shape[2], key_states.shape[2], query_states.device, self.config.attention_window_size
1132
  )
 
1156
 
1157
 
1158
  class MLP(nn.Module):
1159
+ def __init__(self, config: Plamo2Config) -> None:
1160
  super().__init__()
1161
  self.config = config
1162
  self.hidden_size = config.hidden_size
 
1170
  return self.down_proj(h) # type: ignore
1171
 
1172
 
1173
+ class Plamo2DecoderLayer(torch.nn.Module):
1174
+ def __init__(self, config: Plamo2Config, layer_idx: int) -> None:
1175
  super().__init__()
1176
  self.config = config
1177
  self.hidden_size = config.hidden_size
1178
+ self.is_mamba = config.layers_block_type[layer_idx] == "mamba"
1179
  self.mixer: torch.nn.Module
1180
+ if self.is_mamba:
1181
  self.mixer = Mamba(config, layer_idx)
1182
  else:
1183
  self.mixer = Attention(config, layer_idx)
 
1194
  self,
1195
  hidden_states: torch.Tensor,
1196
  attention_mask: Optional[torch.Tensor] = None,
1197
+ past_state: Optional[Plamo2Cache] = None,
1198
  output_attentions: Optional[bool] = False,
1199
  ) -> Tuple[Any, ...]:
1200
  # from LlamaDecoder
 
1238
  return outputs # type: ignore
1239
 
1240
 
1241
+ def is_mamba(config: Plamo2Config, i: int) -> bool:
1242
  if not config.mamba_enabled:
1243
  return False
1244
  assert config.mamba_step > 1
 
1250
  return (i % config.mamba_step) != (config.mamba_step // 2)
1251
 
1252
 
1253
+ class Plamo2Decoder(torch.nn.Module):
1254
+ def __init__(self, config: Plamo2Config) -> None:
1255
  super().__init__()
1256
 
1257
  self.layers = torch.nn.ModuleList(
1258
+ [Plamo2DecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
 
 
 
1259
  )
1260
  self.gradient_checkpointing = False
1261
 
 
1294
  return DecoderOutput(hidden_states, all_hidden_states, all_self_attns)
1295
 
1296
 
1297
+ class Plamo2PreTrainedModel(PreTrainedModel): # type: ignore
1298
+ config_class = Plamo2Config
1299
  _no_split_modules: List[str]
1300
  base_model_prefix = "model"
1301
  supports_gradient_checkpointing = True
 
1315
  module.weight.data[module.padding_idx].zero_()
1316
 
1317
 
1318
+ class Plamo2Model(Plamo2PreTrainedModel):
1319
+ def __init__(self, config: Plamo2Config):
1320
  super().__init__(config)
1321
  assert config.eval_attention_n_bit is None
1322
  assert config.eval_mlp_n_bit is None
 
1332
  self.image_proj = nn.Linear(config.image_feature_size, config.hidden_size, bias=False) # type: ignore
1333
  else:
1334
  raise ValueError(f"Unknown image_proj_type: {config.image_proj_type}")
1335
+ self.layers = Plamo2Decoder(config) # type: ignore
1336
  self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1337
 
1338
  self.gradient_checkpointing = False
 
1387
  input_ids: Optional[torch.LongTensor] = None,
1388
  attention_mask: Optional[torch.Tensor] = None,
1389
  position_ids: Optional[torch.Tensor] = None,
1390
+ past_key_values: Optional[Plamo2Cache] = None,
1391
  inputs_embeds: Optional[torch.Tensor] = None,
1392
  image_features: Optional[torch.Tensor] = None,
1393
  use_cache: Optional[bool] = None,
1394
  output_attentions: Optional[bool] = None,
1395
  output_hidden_states: Optional[bool] = None,
1396
  return_dict: Optional[bool] = None,
1397
+ cache_position: Optional[torch.LongTensor] = None,
1398
+ **kwargs: Any,
1399
  ) -> Union[Tuple, BaseModelOutputWithPast]:
 
1400
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1401
  output_hidden_states = (
1402
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
1406
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1407
 
1408
  # retrieve input_ids and inputs_embeds
1409
+ if (input_ids is None) ^ (inputs_embeds is not None):
1410
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1411
+
1412
+ if self.gradient_checkpointing and self.training and use_cache:
1413
+ use_cache = False
1414
+
1415
+ if inputs_embeds is None:
1416
+ inputs_embeds = self.embed_tokens(input_ids)
1417
+ batch_size, seq_length, _ = inputs_embeds.shape
1418
 
1419
  seq_length_with_past = seq_length
1420
  past_key_values_length = 0
 
1421
  if past_key_values is not None:
1422
  past_key_values_length = past_key_values.get_seq_length()
1423
  seq_length_with_past = seq_length_with_past + past_key_values_length
1424
+ assert cache_position is None, "cache_position is not supported yet"
 
 
1425
 
1426
  if image_features is not None:
1427
  assert self.config.image_token_id is not None
 
1447
 
1448
  hidden_states = inputs_embeds
1449
 
 
 
 
 
1450
  if use_cache and past_key_values is None:
1451
+ past_key_values = Plamo2Cache(self.config)
1452
 
1453
  # decoder layers
1454
  out = self.layers(
 
1485
  )
1486
 
1487
 
1488
+ class Plamo2ForCausalLM(Plamo2PreTrainedModel):
1489
  _tied_weights_keys = ["lm_head.weight"]
1490
 
1491
  # Without this, the model cannot be loaded into a meta device.
 
1495
  # https://github.com/pytorch/pytorch/blob/v2.4.1/torch/nn/modules/module.py#L2068
1496
  _supports_param_buffer_assignment = False
1497
 
1498
+ def __init__(self, config: Plamo2Config) -> None:
1499
  super().__init__(config)
1500
+ self.model = Plamo2Model(config)
1501
 
1502
  self.vocab_size = config.vocab_size
1503
  vocab_size = ((self.vocab_size + 15) // 16) * 16
 
1518
  def set_output_embeddings(self, new_embeddings: torch.nn.Module) -> None:
1519
  self.lm_head = new_embeddings
1520
 
1521
+ def set_decoder(self, decoder: Plamo2Model) -> None:
1522
  self.model = decoder
1523
 
1524
+ def get_decoder(self) -> Plamo2Model:
1525
  return self.model
1526
 
1527
  def forward( # type: ignore
 
1529
  input_ids: Optional[torch.LongTensor] = None,
1530
  attention_mask: Optional[torch.Tensor] = None,
1531
  position_ids: Optional[torch.Tensor] = None,
1532
+ past_key_values: Optional[Plamo2Cache] = None,
1533
  inputs_embeds: Optional[torch.FloatTensor] = None,
1534
  image_features: Optional[torch.Tensor] = None,
1535
  labels: Optional[torch.LongTensor] = None,
 
1537
  output_attentions: Optional[bool] = None,
1538
  output_hidden_states: Optional[bool] = None,
1539
  return_dict: Optional[bool] = None,
1540
+ cache_position: Optional[torch.LongTensor] = None,
1541
+ logits_to_keep: int | torch.Tensor = 0,
1542
+ **kwargs: Any,
1543
  ) -> Union[Tuple, CausalLMOutputWithPast]:
1544
  r"""
1545
  Args:
 
1566
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1567
  "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
1568
  ```"""
 
 
1569
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1570
  output_hidden_states = (
1571
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
1584
  output_attentions=output_attentions,
1585
  output_hidden_states=output_hidden_states,
1586
  return_dict=return_dict,
1587
+ cache_position=cache_position,
1588
+ **kwargs,
1589
  )
1590
 
1591
  hidden_states = outputs[0]
1592
  logits = self.lm_head(hidden_states)
1593
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1594
+ logits = logits[:, slice_indices, : self.vocab_size]
1595
 
1596
  loss = None
1597
  if labels is not None:
1598
+ if len(kwargs) > 0 and set(kwargs.keys()) != set(["ignore_index"]):
1599
+ warnings.warn(
1600
+ f"The following kwargs may not be supported: {', '.join(kwargs.keys())}. ",
1601
+ stacklevel=2,
1602
+ )
1603
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
 
 
 
 
1604
 
1605
  if not return_dict:
1606
  output = (logits,) + outputs[1:]
 
1617
  def prepare_inputs_for_generation(
1618
  self,
1619
  input_ids: torch.Tensor,
1620
+ past_key_values: Optional[Plamo2Cache] = None,
1621
  attention_mask: Optional[torch.Tensor] = None,
1622
  inputs_embeds: Optional[torch.Tensor] = None,
1623
  image_features: Optional[torch.Tensor] = None,
 
1654
  return model_inputs
1655
 
1656
  @staticmethod
1657
+ def _reorder_cache(past_key_values: Plamo2Cache, beam_idx: torch.Tensor) -> Plamo2Cache:
1658
  past_key_values.reorder_cache(beam_idx)
1659
  return past_key_values
1660
 
1661
 
1662
  class MLPImageProjector(nn.Module):
1663
+ def __init__(self, config: Plamo2Config) -> None:
1664
  super().__init__()
1665
  self.config = config
1666
 
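Beyond the class renames, the modeling changes above add a layers_block_type property and give each attention layer its own rotary base (rope_theta for full-attention layers, rope_local_theta for sliding-window layers). A small standalone sketch of that selection logic, using only what is visible in the diff (the elided early branches of is_mamba are omitted) and the values from config.json (mamba_step=2, num_hidden_layers=16, full_attention_idx defaulting to []):

# Assumed reconstruction for illustration; mirrors the visible return expression of is_mamba
# and the new rotary-base choice in Attention.__init__.
mamba_step = 2
num_hidden_layers = 16
full_attention_idx: list[int] = []   # Plamo2Config default shown in the diff
rope_theta = 10000
rope_local_theta = 10000             # new Plamo2Config defaults shown in the diff

def is_mamba(i: int) -> bool:
    return (i % mamba_step) != (mamba_step // 2)

layers_block_type = ["mamba" if is_mamba(i) else "attention" for i in range(num_hidden_layers)]
# -> ["mamba", "attention", "mamba", "attention", ...]: even layers use Mamba, odd layers use Attention

for i, kind in enumerate(layers_block_type):
    if kind == "attention":
        # base passed to RotaryEmbedding in the updated Attention.__init__
        base = rope_theta if i in full_attention_idx else rope_local_theta
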
tokenization_plamo.py CHANGED
@@ -237,7 +237,7 @@ class AhoCorasick:
237
  return [self._tokens[token_id] for token_id in self.encode(data)]
238
 
239
 
240
- class PlamoTokenizer(PreTrainedTokenizer): # type: ignore
241
  vocab_files_names = VOCAB_FILES_NAMES
242
  model_input_names = ["input_ids", "attention_mask"]
243
 
 
237
  return [self._tokens[token_id] for token_id in self.encode(data)]
238
 
239
 
240
+ class Plamo2Tokenizer(PreTrainedTokenizer): # type: ignore
241
  vocab_files_names = VOCAB_FILES_NAMES
242
  model_input_names = ["input_ids", "attention_mask"]
243
 
tokenizer_config.json CHANGED
@@ -1,55 +1,55 @@
1
  {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<|plamo:unk|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<|plamo:bos|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "<|plamo:eos|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "3": {
30
- "content": "<|plamo:pad|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- }
37
  },
38
- "auto_map": {
39
- "AutoTokenizer": [
40
- "tokenization_plamo.PlamoTokenizer",
41
- null
42
- ]
 
 
43
  },
44
- "bos_token": "<|plamo:bos|>",
45
- "clean_up_tokenization_spaces": false,
46
- "cls_token": null,
47
- "eos_token": "<|plamo:eos|>",
48
- "local_file_only": true,
49
- "mask_token": null,
50
- "model_max_length": 1000000000000000019884624838656,
51
- "pad_token": "<|plamo:pad|>",
52
- "sep_token": null,
53
- "tokenizer_class": "PlamoTokenizer",
54
- "unk_token": "<|plamo:unk|>"
55
- }
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|plamo:unk|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
  },
13
+ "1": {
14
+ "content": "<|plamo:bos|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
  },
21
+ "2": {
22
+ "content": "<|plamo:eos|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<|plamo:pad|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ }
37
+ },
38
+ "auto_map": {
39
+ "AutoTokenizer": [
40
+ "tokenization_plamo.Plamo2Tokenizer",
41
+ null
42
+ ]
43
+ },
44
+ "bos_token": "<|plamo:bos|>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": null,
47
+ "eos_token": "<|plamo:eos|>",
48
+ "local_file_only": true,
49
+ "mask_token": null,
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "pad_token": "<|plamo:pad|>",
52
+ "sep_token": null,
53
+ "tokenizer_class": "Plamo2Tokenizer",
54
+ "unk_token": "<|plamo:unk|>"
55
+ }
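
With tokenization_plamo.py and tokenizer_config.json now pointing AutoTokenizer at Plamo2Tokenizer, the whole stack loads through the standard auto classes. A minimal end-to-end sketch, assuming the files in this commit live at a placeholder path; the prompt is illustrative only:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# "path/to/plamo2-checkpoint" is a placeholder, not part of this commit.
tokenizer = AutoTokenizer.from_pretrained("path/to/plamo2-checkpoint", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("path/to/plamo2-checkpoint", trust_remote_code=True)

inputs = tokenizer("Hello, how are you?", return_tensors="pt")  # <|plamo:bos|> is prepended (add_bos_token: true)
with torch.no_grad():
    generated = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(generated[0], skip_special_tokens=True))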