leejunhyeok committed on
Commit 91c40ce · verified · 1 Parent(s): f362538
Files changed (1)
  1. modeling_motif.py +0 -40
modeling_motif.py CHANGED
@@ -98,18 +98,15 @@ ALL_LAYERNORM_LAYERS.append(MotifRMSNorm)
 class MotifRotaryEmbeddingWithCache(nn.Module):
     """
     Rotary positional embedding module with caching for efficiency.
-
     Args:
         dim (int): Dimensionality of the embedding.
         max_position_embeddings (int): Maximum sequence length for caching. Default is 2048.
         base (int): Base for computing inverse frequency. Default is 10000.
        device (torch.device, optional): Device for tensor storage.
-
     Methods:
         forward(x, seq_len=None):
             Computes cosine and sine embeddings for input sequence length.
             Automatically updates cache if `seq_len` exceeds cached length.
-
     Attributes:
         inv_freq (torch.Tensor): Inverse frequency tensor for position encoding.
         cos_cached (torch.Tensor): Cached cosine embeddings.
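The caching pattern this docstring describes is standard for rotary embeddings: precompute cos/sin tables up to `max_position_embeddings` and regrow them only when a longer sequence arrives. A minimal sketch of that pattern (illustrative only, not the module's actual code; the class and helper names below are made up):

```python
import torch
import torch.nn as nn


class RotaryEmbeddingWithCacheSketch(nn.Module):
    """Illustrative RoPE module with a cos/sin cache (not the actual MotifRotaryEmbeddingWithCache)."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        # inverse frequencies 1 / base^(2i/dim) for the even channel indices
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._build_cache(max_position_embeddings, device=device)

    def _build_cache(self, seq_len, device=None):
        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.max_seq_len_cached = seq_len
        self.register_buffer("cos_cached", emb.cos(), persistent=False)
        self.register_buffer("sin_cached", emb.sin(), persistent=False)

    def forward(self, x, seq_len=None):
        # regrow the cache only when a longer sequence is seen
        if seq_len is not None and seq_len > self.max_seq_len_cached:
            self._build_cache(seq_len, device=x.device)
        return (self.cos_cached[:seq_len].to(dtype=x.dtype),
                self.sin_cached[:seq_len].to(dtype=x.dtype))
```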
@@ -241,10 +238,8 @@ class MotifRotaryEmbedding(nn.Module):
 def rotate_half(x):
     """
     Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
-
     Args:
         x (torch.Tensor): The input tensor.
-
     Returns:
         torch.Tensor: A tensor where the latter half of the dimensions are negated
             and moved before the first half.
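For reference, a small sketch of the behavior described here (negate the latter half of the last dimension, then roll it in front of the first half), which is equivalent to the more common `torch.cat((-x2, x1), dim=-1)` formulation. This is an illustration, not the file's implementation:

```python
import torch


def rotate_half_sketch(x: torch.Tensor) -> torch.Tensor:
    # Illustration only: [x1, x2] -> [-x2, x1]
    half = x.shape[-1] // 2
    x = x.clone()                    # avoid mutating the caller's tensor
    x[..., half:] = -x[..., half:]   # [x1, x2] -> [x1, -x2]
    return torch.roll(x, shifts=half, dims=-1)
```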
@@ -259,7 +254,6 @@ def rotate_half(x):
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=True):
     """
     Applies rotary position embeddings to the input tensors.
-
     Args:
         q (torch.Tensor): Query tensor of shape (B, NH, S, D_KV).
         k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
@@ -270,7 +264,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fus
         fused_rope (bool, optional): If True, applies fused rotary embeddings using
             `moreh_ops.apply_rotary_emb`. If False, computes rotary embeddings manually.
             Defaults to True.
-
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
     """
@@ -322,26 +315,21 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class MotifAttention(nn.Module):
     """
     Differential Attention (DiffAttention) module.
-
     Implements the Differential Attention from
     "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
-
     Overview
         Standard transformers often over-allocate attention to irrelevant context.
         DiffAttention addresses this by computing attention as the difference between
         two separate softmax attention maps, effectively canceling noise and promoting
         sparse, structured attention patterns.
-
     Reference Implementation
         https://github.com/microsoft/unilm/tree/master/Diff-Transformer
-
     Args
         The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
         λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
         - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
         - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
         - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
-
     """
 
     def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
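As a hedged illustration of the λ reparameterization and the two-map subtraction described in this docstring (function names, shapes, and the split into two query/key sets are assumptions, not the module's actual code):

```python
import math

import torch
import torch.nn.functional as F


def lambda_init_fn(layer_idx: int) -> float:
    # λ_init = 0.8 − 0.6 · exp(−0.3 · (layer_index − 1))
    return 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))


def diff_attention_sketch(q1, k1, q2, k2, v,
                          lambda_q1, lambda_k1, lambda_q2, lambda_k2,
                          lambda_init: float) -> torch.Tensor:
    # λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init
    lam = (torch.exp(torch.sum(lambda_q1 * lambda_k1))
           - torch.exp(torch.sum(lambda_q2 * lambda_k2))
           + lambda_init)
    scale = q1.shape[-1] ** -0.5
    a1 = F.softmax(q1 @ k1.transpose(-1, -2) * scale, dim=-1)
    a2 = F.softmax(q2 @ k2.transpose(-1, -2) * scale, dim=-1)
    # attention is the *difference* of the two softmax maps, weighted by λ
    return (a1 - lam * a2) @ v
```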
@@ -964,11 +952,9 @@ MOTIF_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
     and behavior.
-
     Parameters:
         config ([`MotifConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1049,51 +1035,39 @@ MOTIF_INPUTS_DOCSTRING = r"""
     input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
         Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
         it.
-
         Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
         [`PreTrainedTokenizer.__call__`] for details.
-
         [What are input IDs?](../glossary#input-ids)
     attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
         - 1 for tokens that are **not masked**,
         - 0 for tokens that are **masked**.
-
         [What are attention masks?](../glossary#attention-mask)
-
         Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
         [`PreTrainedTokenizer.__call__`] for details.
-
         If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
         `past_key_values`).
-
         If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
         and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
         information on the default strategy.
-
         - 1 indicates the head is **not masked**,
         - 0 indicates the head is **masked**.
     position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
         config.n_positions - 1]`.
-
         [What are position IDs?](../glossary#position-ids)
     past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
         Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
         blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
         returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
         Two formats are allowed:
         - a [`~cache_utils.Cache`] instance, see our
           [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
         - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
           shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`. This is also known as the legacy
           cache format.
-
         The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
         legacy cache format will be returned.
-
         If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
         have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
         of shape `(batch_size, sequence_length)`.
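To make the two accepted `past_key_values` formats described above concrete, here is a small sketch, assuming a `transformers` version that exposes `DynamicCache` and its `from_legacy_cache` helper:

```python
import torch
from transformers import DynamicCache

batch_size, num_heads, seq_len, head_dim, n_layers = 1, 8, 4, 64, 2

# Legacy format: a tuple of length config.n_layers, each element a (key, value) pair of
# tensors shaped (batch_size, num_heads, sequence_length, embed_size_per_head).
legacy_cache = tuple(
    (torch.zeros(batch_size, num_heads, seq_len, head_dim),
     torch.zeros(batch_size, num_heads, seq_len, head_dim))
    for _ in range(n_layers)
)

# Cache-object format: the same data wrapped in a `Cache` instance.
cache = DynamicCache.from_legacy_cache(legacy_cache)
print(cache.get_seq_length())  # 4
```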
@@ -1126,7 +1100,6 @@ MOTIF_INPUTS_DOCSTRING = r"""
 class MotifModel(MotifPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MotifDecoderLayer`].
-
     Args:
         config: MotifConfig
     """
@@ -1375,7 +1348,6 @@ class MotifModel(MotifPreTrainedModel):
         """
         Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
         `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
         Args:
             attention_mask (`torch.Tensor`):
                 A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
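The expansion this docstring describes follows a common pattern: combine a lower-triangular causal mask with the 2D padding mask and convert the result to an additive mask. A hedged sketch of that pattern (not the file's exact helper):

```python
import torch


def to_causal_4d_sketch(attention_mask_2d: torch.Tensor, query_length: int,
                        dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # (batch, key_len) 0/1 padding mask -> (batch, 1, query_len, key_len) additive mask,
    # with disallowed positions set to the dtype's most negative value.
    _, key_len = attention_mask_2d.shape
    device = attention_mask_2d.device
    causal = torch.tril(
        torch.ones(query_length, key_len, dtype=torch.bool, device=device),
        diagonal=key_len - query_length,  # align the last query with the last key
    )
    allowed = causal[None, None, :, :] & attention_mask_2d[:, None, None, :].bool()
    zero = torch.zeros((), dtype=dtype, device=device)
    neg_inf = torch.full((), torch.finfo(dtype).min, dtype=dtype, device=device)
    return torch.where(allowed, zero, neg_inf)
```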
@@ -1434,11 +1406,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
         self.multi_token_heads = config.multi_token_heads
 
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        else:
-            self.tokenwise_last_layers = nn.ModuleList(
-                [MotifDecoderLayer(config, config.num_hidden_layers - 1) for _ in range(self.multi_token_heads)])
-            self.tokenwise_lm_heads = nn.ModuleList(
-                [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(self.multi_token_heads)])
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1490,25 +1457,18 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
        num_logits_to_keep (`int`, *optional*):
            Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-
        Returns:
-
        Example:
-
        ```python
        >>> from transformers import AutoTokenizer, MotifForCausalLM
-
        >>> model = MotifForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS, trust_remote_code=True)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER, trust_remote_code=True)
-
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
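        >>> # Hedged editorial sketch, continuing the example above (not part of the original docstring):
        >>> # `num_logits_to_keep`, documented earlier in this docstring, can be set to 1 so that only
        >>> # the final position's logits are computed, saving memory on long prompts.
        >>> outputs = model(**inputs, num_logits_to_keep=1)
        >>> outputs.logits.shape[1]
        1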
 