leejunhyeok committed on
Commit 80e1a1c · verified · 1 Parent(s): 72cc86d

Update modeling_motif.py

Files changed (1)
  1. modeling_motif.py +821 -101
modeling_motif.py CHANGED
@@ -1,5 +1,5 @@
1
  import math
2
- from typing import List, Optional, Tuple, Union, Callable, Dict
3
 
4
  import torch
5
  import torch.utils.checkpoint
@@ -28,25 +28,14 @@ from .configuration_motif import MotifConfig
28
  from dataclasses import dataclass
29
 
30
  import torch.nn.functional as F
31
- import time
32
-
33
- logger = logging.get_logger(__name__)
34
-
35
- if is_flash_attn_2_available():
36
- from transformers.modeling_flash_attention_utils import _flash_attention_forward
37
-
38
-
39
- _CONFIG_FOR_DOC = "MotifConfig"
40
 
41
  from transformers.activations import ACT2CLS as _ACT2CLS
42
  from transformers.activations import ClassInstantier
43
-
44
-
45
  class PolyNorm(torch.nn.Module):
46
- """
47
  A trainable activation function introduced in https://arxiv.org/html/2411.03884v1.
48
  The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md,
49
- with the change `* torch.rsqrt` => `/ torch.sqrt`
50
  """
51
 
52
  def __init__(self, eps=1e-6):
@@ -62,11 +51,52 @@ class PolyNorm(torch.nn.Module):
62
  return self.weight[0] * self._norm(x ** 3) + self.weight[1] * self._norm(
63
  x ** 2) + self.weight[2] * self._norm(x) + self.bias
64
 
65
 
66
- CUSTOM_ACT2CLS = {"poly_norm": PolyNorm}
67
  ACT2CLS = {**_ACT2CLS, **CUSTOM_ACT2CLS}
68
  ACT2FN = ClassInstantier(ACT2CLS)
69
 
70
 
71
  class MotifRMSNorm(nn.Module):
72
 
@@ -80,7 +110,8 @@ class MotifRMSNorm(nn.Module):
80
 
81
  def forward(self, hidden_states):
82
  input_dtype = hidden_states.dtype
83
- variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
 
84
  hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
85
  return self.weight * hidden_states.to(input_dtype)
86
 
@@ -88,21 +119,24 @@ class MotifRMSNorm(nn.Module):
88
  return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
89
 
90
 
91
- ALL_LAYERNORM_LAYERS.append(MotifRMSNorm)
92
 
93
 
94
  class MotifRotaryEmbeddingWithCache(nn.Module):
95
  """
96
  Rotary positional embedding module with caching for efficiency.
 
97
  Args:
98
  dim (int): Dimensionality of the embedding.
99
  max_position_embeddings (int): Maximum sequence length for caching. Default is 2048.
100
  base (int): Base for computing inverse frequency. Default is 10000.
101
  device (torch.device, optional): Device for tensor storage.
 
102
  Methods:
103
  forward(x, seq_len=None):
104
  Computes cosine and sine embeddings for input sequence length.
105
  Automatically updates cache if `seq_len` exceeds cached length.
 
106
  Attributes:
107
  inv_freq (torch.Tensor): Inverse frequency tensor for position encoding.
108
  cos_cached (torch.Tensor): Cached cosine embeddings.
@@ -138,8 +172,8 @@ class MotifRotaryEmbeddingWithCache(nn.Module):
138
  self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
139
 
140
  return (
141
- self.cos_cached[None, :seq_len].to(dtype=x.dtype),
142
- self.sin_cached[None, :seq_len].to(dtype=x.dtype),
143
  )
144
 
145
 
@@ -156,6 +190,7 @@ class MotifRotaryEmbedding(nn.Module):
156
  config: Optional[MotifConfig] = None,
157
  ):
158
  super().__init__()
 
159
  self.rope_kwargs = {}
160
  if config is None:
161
  logger.warning_once(
@@ -200,7 +235,7 @@ class MotifRotaryEmbedding(nn.Module):
200
  device,
201
  seq_len=seq_len,
202
  **self.rope_kwargs)
203
- self.register_buffer("inv_freq", inv_freq, persistent=False)
204
  self.max_seq_len_cached = seq_len
205
 
206
  if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
@@ -234,8 +269,10 @@ class MotifRotaryEmbedding(nn.Module):
234
  def rotate_half(x):
235
  """
236
  Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
 
237
  Args:
238
  x (torch.Tensor): The input tensor.
 
239
  Returns:
240
  torch.Tensor: A tensor where the latter half of the dimensions are negated
241
  and moved before the first half.
@@ -247,27 +284,60 @@ def rotate_half(x):
247
  return rotated_tensor
248
 
249
 
250
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
251
  """
252
  Applies rotary position embeddings to the input tensors.
 
253
  Args:
254
  q (torch.Tensor): Query tensor of shape (B, NH, S, D_KV).
255
  k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
256
  cos (torch.Tensor): Cosine values for rotary embedding.
257
  sin (torch.Tensor): Sine values for rotary embedding.
258
- unsqueeze_dim (int, optional): Dimension along which `cos` and `sin` are unsqueezed.
259
  Defaults to 1.
 
 
 
 
260
  Returns:
261
  Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
262
  """
263
- device = q.device
264
- return map(
265
- lambda x: (x * cos[position_ids].unsqueeze(unsqueeze_dim).to(device)) +
266
- (rotate_half(x) * sin[position_ids].unsqueeze(unsqueeze_dim).to(device)), (q, k))
267
 
268
 
269
  class MotifMLP(nn.Module):
270
-
271
  def __init__(self, config):
272
  super().__init__()
273
  self.hidden_size = config.hidden_size
@@ -277,33 +347,420 @@ class MotifMLP(nn.Module):
277
  self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
278
  self.act_fn = ACT2FN[config.hidden_act]
279
 
280
  def forward(self, hidden_state):
281
- return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
282
 
283
 
284
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
285
- return torch.repeat_interleave(hidden_states, dim=1, repeats=n_rep)
286
 
 
 
287
 
288
- # @log_timing
289
  class MotifAttention(nn.Module):
290
  """
291
  Differential Attention (DiffAttention) module.
292
- Implements the Differential Attention from
 
293
  "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
 
294
  Overview
295
  Standard transformers often over-allocate attention to irrelevant context.
296
- DiffAttention addresses this by computing attention as the difference between
297
- two separate softmax attention maps, effectively canceling noise and promoting
298
  sparse, structured attention patterns.
 
299
  Reference Implementation
300
  https://github.com/microsoft/unilm/tree/master/Diff-Transformer
 
301
  Args
302
- The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
303
  λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
304
  - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
305
  - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
306
  - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
 
307
  """
308
 
309
  def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
@@ -383,7 +840,7 @@ class MotifAttention(nn.Module):
383
  self.subln = MotifRMSNorm(2 * self.head_dim, eps=1e-5)
384
  self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
385
 
386
- self.rotary_emb = MotifRotaryEmbedding(self.head_dim,
387
  max_position_embeddings=self.max_position_embeddings,
388
  base=self.rope_theta)
389
 
@@ -429,12 +886,12 @@ class MotifAttention(nn.Module):
429
  cos, sin = (self.rotary_emb(value_states, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
430
  if use_cache else position_embeddings)
431
 
432
- query_states, key_states = apply_rotary_pos_emb(
433
- query_states,
434
- key_states,
435
- cos,
436
- sin
437
- )
438
 
439
  if past_key_value is not None:
440
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
@@ -503,7 +960,6 @@ class MotifAttention(nn.Module):
503
  return attn_output, attn_weights, past_key_value
504
 
505
 
506
- # @log_timing
507
  class MotifFlashAttention2(MotifAttention):
508
  """
509
  Motif flash attention module, following Motif attention module. This module inherits from `MotifAttention`
@@ -517,7 +973,7 @@ class MotifFlashAttention2(MotifAttention):
517
  def __init__(self, *args, **kwargs):
518
  super().__init__(*args, **kwargs)
519
 
520
- logger.info(f'flash attention True')
521
 
522
  # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
523
  # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
@@ -525,6 +981,8 @@ class MotifFlashAttention2(MotifAttention):
525
 
526
  self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
527
 
 
 
528
  def _reshape_heads(self, tensor, batch_size, seq_len):
529
  """2-way head split tensor reshape"""
530
  return tensor.reshape(batch_size, seq_len, self.num_heads, 2, self.head_dim)
@@ -534,27 +992,59 @@ class MotifFlashAttention2(MotifAttention):
534
  return tensor.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
535
 
536
  def _compute_attention(self, query_states, key_states, value_states, attention_mask, q_len, position_ids,
537
- dropout_rate, sliding_window, batch_num):
538
  """Flash Attention 2 implements"""
 
539
  scale_factor = 1.0 / math.sqrt(self.head_dim)
540
- # Copied from _flash_attention_forward
541
  if not self._flash_attn_uses_top_left_mask:
542
  causal = self.is_causal
543
  else:
544
  causal = self.is_causal and q_len != 1
 
 
 
545
 
546
- bsz = query_states.shape[0]
 
 
 
547
 
548
- return _flash_attention_forward(query_states.bfloat16(),
549
- key_states.bfloat16(),
550
- value_states.bfloat16(),
551
- attention_mask,
552
- q_len,
553
- position_ids=position_ids,
554
- dropout=dropout_rate,
555
- sliding_window=sliding_window,
556
- is_causal=self.is_causal,
557
- use_top_left_mask=self._flash_attn_uses_top_left_mask)
558
 
559
  def forward(
560
  self,
@@ -588,12 +1078,12 @@ class MotifFlashAttention2(MotifAttention):
588
  cos, sin = (self.rotary_emb(value_states, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
589
  if use_cache else position_embeddings)
590
 
591
- query_states, key_states = apply_rotary_pos_emb(
592
- query_states,
593
- key_states,
594
- cos,
595
- sin
596
- )
597
 
598
  if past_key_value is not None:
599
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
@@ -604,6 +1094,28 @@ class MotifFlashAttention2(MotifAttention):
604
  value_states = repeat_kv(value_states, self.num_key_value_groups)
605
  dropout_rate = 0.0 if not self.training else self.attention_dropout
606
 
607
  q_len = query_states.shape[-2]
608
  kv_seq_len = key_states.shape[-2]
609
 
@@ -613,7 +1125,7 @@ class MotifFlashAttention2(MotifAttention):
613
  value_states = value_states.transpose(1, 2)
614
 
615
  if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None
616
- and self.layer_idx >= self.config.max_window_layers):
617
  sliding_window = self.config.sliding_window
618
  else:
619
  sliding_window = None
@@ -633,13 +1145,14 @@ class MotifFlashAttention2(MotifAttention):
633
  k1, k2 = k1.contiguous(), k2.contiguous()
634
  v1, v2 = v1.contiguous(), v2.contiguous()
635
 
636
- attn11, attn12 = self._compute_attention(q1, k1, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window, self.batch_num), \
637
- self._compute_attention(q1, k1, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window, self.batch_num)
638
- attn21, attn22 = self._compute_attention(q2, k2, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window, self.batch_num), \
639
- self._compute_attention(q2, k2, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window, self.batch_num)
640
 
641
-
642
- attn1, attn2 = torch.cat([attn11, attn12], dim=-1).float(), torch.cat([attn21, attn22], dim=-1).float()
 
 
 
 
643
 
644
  lambda_q1 = self.lambda_q1.unsqueeze(0).expand([bsz, self.lambda_q1.shape[0]]) # bsz, num_head
645
  lambda_q2 = self.lambda_q2.unsqueeze(0).expand([bsz, self.lambda_q2.shape[0]]) # bsz, num_head
@@ -655,16 +1168,15 @@ class MotifFlashAttention2(MotifAttention):
655
  attn_output = attn_output * (1 - self.lambda_init)
656
 
657
  if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim * 2):
658
- raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
659
  f" {attn_output.size()}")
660
 
661
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).bfloat16()
662
  attn_output = self.o_proj(attn_output) * self.o_proj_alpha
663
 
664
- return attn_output.float(), None, past_key_value
665
 
666
 
667
- # @log_timing
668
  class MotifSdpaAttention(MotifAttention):
669
  """
670
  Motif attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
@@ -758,16 +1270,17 @@ class MotifSdpaAttention(MotifAttention):
758
  MOTIF_ATTENTION_CLASSES = {
759
  "eager": MotifAttention,
760
  "flash_attention_2": MotifFlashAttention2,
761
- "sdpa": MotifSdpaAttention,
762
  }
763
 
764
 
765
  class MotifDecoderLayer(nn.Module):
766
 
767
- def __init__(self, config: MotifConfig, layer_idx: int):
768
  super().__init__()
769
  self.hidden_size = config.hidden_size
770
-
 
771
  if config.sliding_window and config._attn_implementation != "flash_attention_2":
772
  logger.warning_once(
773
  f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
@@ -777,8 +1290,12 @@ class MotifDecoderLayer(nn.Module):
777
  else:
778
  self.self_attn = MOTIF_ATTENTION_CLASSES["eager"](config, layer_idx)
779
  self.mlp = MotifMLP(config)
780
-
781
- RMSNorm = MotifRMSNorm
 
 
 
 
782
  self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
783
  self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
784
 
@@ -847,7 +1364,13 @@ class MotifDecoderLayer(nn.Module):
847
  residual = hidden_states
848
  hidden_states = self.post_attention_layernorm(hidden_states) * self.post_attention_layernorm_alpha
849
 
850
- hidden_states = self.mlp(hidden_states)
851
 
852
  hidden_states = residual + hidden_states
853
 
@@ -866,9 +1389,11 @@ MOTIF_START_DOCSTRING = r"""
866
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
867
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
868
  etc.)
 
869
  This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
870
  Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
871
  and behavior.
 
872
  Parameters:
873
  config ([`MotifConfig`]):
874
  Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -918,23 +1443,26 @@ class MotifPreTrainedModel(PreTrainedModel):
918
  module_std = module_std / math.sqrt(self.config.dim_model_base_lmh) ### lmhead.. 1
919
  else:
920
  module_std = module_std
921
-
922
- torch.nn.init.trunc_normal_(module.weight.data, mean=0.0, std=module_std, a=-3*module_std, b=3*module_std)
 
923
  if module.bias is not None:
924
  module.bias.data.zero_()
925
 
926
  elif isinstance(module, nn.Embedding):
927
- torch.nn.init.trunc_normal_(module.weight.data, mean=0.0, std=module_std, a=-3*module_std, b=3*module_std)
 
 
928
  if module.padding_idx is not None:
929
  module.weight.data[module.padding_idx].zero_()
930
 
931
 
932
  @dataclass
933
  class MotifModelOutputWithPast(ModelOutput):
934
- """
935
- This augments `BaseModelOutputWithPast` in `transformers.modeling_outputs` with new optional keys: `causal_mask`, `position_embeddings`.
936
  The optional keys are currently used in the following ways:
937
- - pass information to the token-wise last attention layers in multi-token training
938
  """
939
  last_hidden_state: torch.FloatTensor = None
940
  past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
@@ -949,39 +1477,51 @@ MOTIF_INPUTS_DOCSTRING = r"""
949
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
950
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
951
  it.
 
952
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
953
  [`PreTrainedTokenizer.__call__`] for details.
 
954
  [What are input IDs?](../glossary#input-ids)
955
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
956
  Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
 
957
  - 1 for tokens that are **not masked**,
958
  - 0 for tokens that are **masked**.
 
959
  [What are attention masks?](../glossary#attention-mask)
 
960
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
961
  [`PreTrainedTokenizer.__call__`] for details.
 
962
  If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
963
  `past_key_values`).
 
964
  If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
965
  and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
966
  information on the default strategy.
 
967
  - 1 indicates the head is **not masked**,
968
  - 0 indicates the head is **masked**.
969
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
970
  Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
971
  config.n_positions - 1]`.
 
972
  [What are position IDs?](../glossary#position-ids)
973
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
974
  Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
975
  blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
976
  returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
977
  Two formats are allowed:
978
  - a [`~cache_utils.Cache`] instance, see our
979
  [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
980
  - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
981
  shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
982
  cache format.
 
983
  The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
984
  legacy cache format will be returned.
 
985
  If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
986
  have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
987
  of shape `(batch_size, sequence_length)`.
@@ -1014,6 +1554,7 @@ MOTIF_INPUTS_DOCSTRING = r"""
1014
  class MotifModel(MotifPreTrainedModel):
1015
  """
1016
  Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MotifDecoderLayer`]
 
1017
  Args:
1018
  config: MotifConfig
1019
  """
@@ -1025,14 +1566,19 @@ class MotifModel(MotifPreTrainedModel):
1025
  self.multi_token_heads = config.multi_token_heads
1026
 
1027
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
 
 
1028
 
1029
  num_hidden_layers = config.num_hidden_layers if self.multi_token_heads is None else config.num_hidden_layers - 1
1030
-
1031
- self.layers = nn.ModuleList([
1032
- MotifDecoderLayer(config = config, layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)
1033
- ])
 
 
 
1034
  self._attn_implementation = config._attn_implementation
1035
- RMSNorm = MotifRMSNorm
1036
  self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1037
  self.hidden_size = config.hidden_size
1038
  self.num_heads = config.num_attention_heads
@@ -1046,6 +1592,34 @@ class MotifModel(MotifPreTrainedModel):
1046
  self.gradient_checkpointing = False
1047
  self.post_init()
1048
 
1049
  def get_input_embeddings(self):
1050
  return self.embed_tokens
1051
 
@@ -1084,6 +1658,7 @@ class MotifModel(MotifPreTrainedModel):
1084
  "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
1085
  use_cache = False
1086
 
 
1087
  return_legacy_cache = False
1088
  if use_cache and not isinstance(past_key_values, Cache):
1089
  return_legacy_cache = True
@@ -1097,17 +1672,17 @@ class MotifModel(MotifPreTrainedModel):
1097
  "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)")
1098
 
1099
  if inputs_embeds is None:
1100
- inputs_embeds = self.embed_tokens(input_ids)
1101
 
1102
  if cache_position is None:
1103
  past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1104
  cache_position = torch.arange(past_seen_tokens,
1105
  past_seen_tokens + inputs_embeds.shape[1],
1106
  device=inputs_embeds.device)
1107
- position_ids = None
1108
  if position_ids is None:
1109
  position_ids = cache_position.unsqueeze(0)
1110
-
1111
  causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_key_values,
1112
  output_attentions)
1113
 
@@ -1150,6 +1725,10 @@ class MotifModel(MotifPreTrainedModel):
1150
  )
1151
 
1152
  hidden_states = layer_outputs[0]
 
 
 
 
1153
 
1154
  if use_cache:
1155
  next_decoder_cache = layer_outputs[2 if output_attentions else 1]
@@ -1157,8 +1736,9 @@ class MotifModel(MotifPreTrainedModel):
1157
  if output_attentions:
1158
  all_self_attns += (layer_outputs[1], )
1159
 
1160
- hidden_states = self.norm(hidden_states)
1161
-
 
1162
  # add hidden states from the last decoder layer
1163
  if output_hidden_states:
1164
  all_hidden_states += (hidden_states, )
@@ -1190,6 +1770,8 @@ class MotifModel(MotifPreTrainedModel):
1190
  output_attentions: bool,
1191
  ):
1192
  if self.config._attn_implementation == "flash_attention_2":
 
 
1193
  if attention_mask is not None and 0.0 in attention_mask:
1194
  return attention_mask
1195
  return None
@@ -1261,6 +1843,7 @@ class MotifModel(MotifPreTrainedModel):
1261
  """
1262
  Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1263
  `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
 
1264
  Args:
1265
  attention_mask (`torch.Tensor`):
1266
  A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
@@ -1318,14 +1901,33 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
1318
  self.vocab_size = config.vocab_size
1319
  self.multi_token_heads = config.multi_token_heads
1320
 
1321
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1322
 
1323
  # Initialize weights and apply final processing
1324
  self.post_init()
1325
-
1326
  if getattr(config, "tie_word_embeddings", True):
1327
  logger.info('tie embeddings')
1328
  self.tie_weights()
 
 
 
1329
 
1330
  def get_input_embeddings(self):
1331
  return self.model.embed_tokens
@@ -1345,7 +1947,101 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
1345
  def get_decoder(self):
1346
  return self.model
1347
 
1348
-
1349
  @add_start_docstrings_to_model_forward(MOTIF_INPUTS_DOCSTRING)
1350
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1351
  def forward(
@@ -1370,18 +2066,25 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
1370
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1371
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1372
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
 
1373
  num_logits_to_keep (`int`, *optional*):
1374
  Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1375
  `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
1376
  token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
 
1377
  Returns:
 
1378
  Example:
 
1379
  ```python
1380
  >>> from transformers import AutoTokenizer, MotifForCausalLM
1381
- >>> model = MotifForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS, trust_remote_code = True)
1382
- >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER, trust_remote_code = True)
 
 
1383
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
1384
  >>> inputs = tokenizer(prompt, return_tensors="pt")
 
1385
  >>> # Generate
1386
  >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1387
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -1394,6 +2097,8 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
1394
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1395
 
1396
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
 
 
1397
  outputs: MotifModelOutputWithPast = self.model(
1398
  input_ids=input_ids,
1399
  attention_mask=attention_mask,
@@ -1405,16 +2110,31 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
1405
  output_hidden_states=output_hidden_states,
1406
  return_dict=return_dict,
1407
  cache_position=cache_position,
 
 
1408
  )
1409
 
1410
  hidden_states = outputs[0]
1411
 
1412
  # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
 
1413
  logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
1414
  logits = logits.float()
1415
 
1416
  loss = None
1417
  if labels is not None:
 
1418
  # Shift so that tokens < n predict n
1419
  shift_logits = logits[..., :-1, :].contiguous()
1420
  shift_labels = labels[..., 1:].contiguous()
@@ -1436,4 +2156,4 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
1436
  past_key_values=outputs.past_key_values,
1437
  hidden_states=outputs.hidden_states,
1438
  attentions=outputs.attentions,
1439
- )
 
1
  import math
2
+ from typing import List, Optional, Tuple, Union
3
 
4
  import torch
5
  import torch.utils.checkpoint
 
28
  from dataclasses import dataclass
29
 
30
  import torch.nn.functional as F
31
 
32
  from transformers.activations import ACT2CLS as _ACT2CLS
33
  from transformers.activations import ClassInstantier
 
 
34
  class PolyNorm(torch.nn.Module):
35
+ """
36
  A trainable activation function introduced in https://arxiv.org/html/2411.03884v1.
37
  The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md,
38
+ with the change `* torch.rsqrt` => `/ torch.sqrt` to avoid a potential MAF incompatibility.
39
  """
40
 
41
  def __init__(self, eps=1e-6):
 
51
  return self.weight[0] * self._norm(x ** 3) + self.weight[1] * self._norm(
52
  x ** 2) + self.weight[2] * self._norm(x) + self.bias
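# A minimal standalone sketch of what PolyNorm computes, assuming `_norm(u)` is the
# RMS-style normalization `u / sqrt(mean(u**2, dim=-1) + eps)` used by the class above;
# names below are illustrative only, not part of the module.
import torch

def poly_norm_reference(x, weight, bias, eps=1e-6):
    def _norm(u):
        return u / torch.sqrt(u.pow(2).mean(-1, keepdim=True) + eps)
    # weighted mix of the normalized cubic, quadratic and linear terms, plus a bias
    return weight[0] * _norm(x ** 3) + weight[1] * _norm(x ** 2) + weight[2] * _norm(x) + bias

x_demo = torch.randn(2, 4, 8)
out_demo = poly_norm_reference(x_demo, torch.ones(3) / 3, torch.zeros(1))
assert out_demo.shape == x_demo.shape  # the activation preserves the input shape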
53
 
54
+ class PolyNorm_Test(torch.nn.Module):
55
+ """
56
+ A trainable activation function introduced in https://arxiv.org/html/2411.03884v1.
57
+ The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md,
58
+ with the change `* torch.rsqrt` => `/ torch.sqrt` for potential MAF incompatibility.
59
+ """
60
+
61
+ def __init__(self, eps=1e-6):
62
+ super(PolyNorm_Test, self).__init__()
63
+ self.weight = torch.nn.Parameter(torch.ones(3) / 3)
64
+ self.bias = torch.nn.Parameter(torch.zeros(1))
65
+ self.eps = eps
66
+
67
+ def forward(self, x):
68
+
69
+ #return torch.nn.SiLU(x)
70
+ return moreh_ops.poly_norm(x, self.weight, self.bias)
71
+
72
+
73
+ CUSTOM_ACT2CLS = {"poly_norm": PolyNorm, "poly_norm_test": PolyNorm_Test}
74
 
 
75
  ACT2CLS = {**_ACT2CLS, **CUSTOM_ACT2CLS}
76
  ACT2FN = ClassInstantier(ACT2CLS)
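# How the registry above is consumed (a sketch): `ClassInstantier` from transformers
# instantiates the mapped class on key access, so a config with `hidden_act = "poly_norm"`
# resolves to a ready-to-use PolyNorm module via `ACT2FN[config.hidden_act]`.
demo_act = ACT2FN["poly_norm"]           # a PolyNorm instance
demo_out = demo_act(torch.randn(2, 8))   # applied like any other activation module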
77
 
78
+ logger = logging.get_logger(__name__)
79
+
80
+ if is_flash_attn_2_available():
81
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
82
+
83
+ try:
84
+ moreh_ops = torch.ops.moreh
85
+ MorehRMSNorm = moreh_ops.T5LayerNorm
86
+ ScaledDotProductAttention = moreh_ops.scaled_dot_product_attention
87
+ MorehFlashAttention = moreh_ops.flash_attention
88
+ logger.warning_once("Using moreh ops")
89
+ except AttributeError:
90
+ MorehRMSNorm = None
91
+ ScaledDotProductAttention = None
92
+ MorehFlashAttention = None
93
+ logger.warning_once("Failed to import moreh ops")
94
+
95
+ #_CHECKPOINT_FOR_DOC = "moreh/Motif-102B"
96
+ _CONFIG_FOR_DOC = "MotifConfig"
97
+
98
+ #from .moreh_moe import MorehMoeMLP, MorehMoeFusedMLP
99
+
100
 
101
  class MotifRMSNorm(nn.Module):
102
 
 
110
 
111
  def forward(self, hidden_states):
112
  input_dtype = hidden_states.dtype
113
+ hidden_states = hidden_states.to(torch.float32)
114
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
115
  hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
116
  return self.weight * hidden_states.to(input_dtype)
117
 
 
119
  return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
120
 
121
 
122
+ ALL_LAYERNORM_LAYERS.append(MotifRMSNorm if MorehRMSNorm is None else MorehRMSNorm)
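# The normalization registered above, written out as a minimal reference (a sketch;
# `rms_norm_reference` is an illustrative name, not part of the module): cast to float32,
# divide by the root-mean-square over the last dimension, then rescale by the learned weight.
def rms_norm_reference(x, weight, eps=1e-6):
    x32 = x.float()
    out = x32 * torch.rsqrt(x32.pow(2).mean(-1, keepdim=True) + eps)
    return weight * out.to(x.dtype)

x_demo = torch.randn(2, 3, 8)
y_demo = rms_norm_reference(x_demo, torch.ones(8))
assert y_demo.shape == x_demo.shape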
123
 
124
 
125
  class MotifRotaryEmbeddingWithCache(nn.Module):
126
  """
127
  Rotary positional embedding module with caching for efficiency.
128
+
129
  Args:
130
  dim (int): Dimensionality of the embedding.
131
  max_position_embeddings (int): Maximum sequence length for caching. Default is 2048.
132
  base (int): Base for computing inverse frequency. Default is 10000.
133
  device (torch.device, optional): Device for tensor storage.
134
+
135
  Methods:
136
  forward(x, seq_len=None):
137
  Computes cosine and sine embeddings for input sequence length.
138
  Automatically updates cache if `seq_len` exceeds cached length.
139
+
140
  Attributes:
141
  inv_freq (torch.Tensor): Inverse frequency tensor for position encoding.
142
  cos_cached (torch.Tensor): Cached cosine embeddings.
 
172
  self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
173
 
174
  return (
175
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
176
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
177
  )
178
 
179
 
 
190
  config: Optional[MotifConfig] = None,
191
  ):
192
  super().__init__()
193
+ # TODO (joao): remove the `if` below, only used for BC
194
  self.rope_kwargs = {}
195
  if config is None:
196
  logger.warning_once(
 
235
  device,
236
  seq_len=seq_len,
237
  **self.rope_kwargs)
238
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
239
  self.max_seq_len_cached = seq_len
240
 
241
  if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
 
269
  def rotate_half(x):
270
  """
271
  Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
272
+
273
  Args:
274
  x (torch.Tensor): The input tensor.
275
+
276
  Returns:
277
  torch.Tensor: A tensor where the latter half of the dimensions are negated
278
  and moved before the first half.
 
284
  return rotated_tensor
285
 
286
 
287
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=False):
288
  """
289
  Applies rotary position embeddings to the input tensors.
290
+
291
  Args:
292
  q (torch.Tensor): Query tensor of shape (B, NH, S, D_KV).
293
  k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
294
  cos (torch.Tensor): Cosine values for rotary embedding.
295
  sin (torch.Tensor): Sine values for rotary embedding.
296
+ unsqueeze_dim (int, optional): Dimension along which `cos` and `sin` are unsqueezed.
297
  Defaults to 1.
298
+ fused_rope (bool, optional): If True, applies fused rotary embeddings using
299
+ `moreh_ops.apply_rotary_emb`. If False, computes rotary embeddings manually.
300
+ Defaults to False.
301
+
302
  Returns:
303
  Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
304
  """
305
+ '''
306
+ # (B, NH, S, D_KV) -> (B, S, NH, D_KV)
307
+ cos = cos.unsqueeze(unsqueeze_dim)
308
+ sin = sin.unsqueeze(unsqueeze_dim)
309
+ q_embed = (q * cos) + (rotate_half(q) * sin)
310
+ k_embed = (k * cos) + (rotate_half(k) * sin)
311
+ '''
312
+ if not fused_rope:
313
+ device = q.device
314
+ return map(
315
+ lambda x: (x * cos[position_ids].unsqueeze(unsqueeze_dim).to(device)) +
316
+ (rotate_half(x) * sin[position_ids].unsqueeze(unsqueeze_dim).to(device)), (q, k))
317
+ else:
318
+ # (B, NH, S, D_KV) -> (B, S, NH, D_KV)
319
+ cos = cos[position_ids]
320
+ sin = sin[position_ids]
321
+
322
+ q = q.transpose(1, 2)
323
+ k = k.transpose(1, 2)
324
+
325
+ # Expand 'batch' dim
326
+ cos = cos.expand(q.shape[0], *cos.shape[1:])
327
+ sin = sin.expand(q.shape[0], *sin.shape[1:])
328
+
329
+ q_embed = moreh_ops.apply_rotary_emb(q, cos, sin, opcode=1)
330
+ k_embed = moreh_ops.apply_rotary_emb(k, cos, sin, opcode=1)
331
+
332
+ # (B, S, NH, D_KV) -> (B, NH, S, D_KV)
333
+ q_embed = q_embed.transpose(1, 2)
334
+ k_embed = k_embed.transpose(1, 2)
335
+
336
+ return q_embed, k_embed
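# A usage sketch for the non-fused path above. The shapes of `cos`/`sin` are assumptions
# inferred from the indexing `cos[position_ids]`: the cached tables are (max_seq, head_dim)
# and `position_ids` is (batch_or_1, seq). All names below are illustrative only.
bsz_demo, n_heads_demo, seq_demo, head_dim_demo = 2, 4, 16, 8
q_demo = torch.randn(bsz_demo, n_heads_demo, seq_demo, head_dim_demo)
k_demo = torch.randn(bsz_demo, n_heads_demo, seq_demo, head_dim_demo)
inv_freq_demo = 1.0 / (10000 ** (torch.arange(0, head_dim_demo, 2).float() / head_dim_demo))
freqs_demo = torch.outer(torch.arange(seq_demo).float(), inv_freq_demo)
emb_demo = torch.cat((freqs_demo, freqs_demo), dim=-1)   # (seq, head_dim)
cos_demo, sin_demo = emb_demo.cos(), emb_demo.sin()
position_ids_demo = torch.arange(seq_demo).unsqueeze(0)  # (1, seq)
q_rot, k_rot = apply_rotary_pos_emb(q_demo, k_demo, cos_demo, sin_demo,
                                    position_ids=position_ids_demo, fused_rope=False)
# q_rot and k_rot keep the (batch, num_heads, seq, head_dim) layout of the inputs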
337
 
338
 
339
  class MotifMLP(nn.Module):
340
+
341
  def __init__(self, config):
342
  super().__init__()
343
  self.hidden_size = config.hidden_size
 
347
  self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
348
  self.act_fn = ACT2FN[config.hidden_act]
349
 
350
+ if config.wesar_weights:
351
+ self.gate_up_proj_alpha = nn.Parameter(torch.tensor(1) * config.gate_up_proj_alpha)
352
+ self.down_proj_alpha = nn.Parameter(torch.tensor(1) * config.down_proj_alpha)
353
+ else:
354
+ self.gate_up_proj_alpha = 1
355
+ self.down_proj_alpha = 1
356
+ if config.muP:
357
+ self.down_proj.__do_scale_tager__ = True
358
+ self.gate_proj.__do_scale_tager_mu_dim_model__ = True
359
+ self.up_proj.__do_scale_tager_mu_dim_model__ = True
360
+ self.down_proj.__do_scale_tager_mu_ffn__ = True
361
+
362
+
363
  def forward(self, hidden_state):
364
+ hidden_state = hidden_state * self.gate_up_proj_alpha
365
+ #hidden_state = self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))*
366
+ return self.down_proj_alpha * self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
367
+
368
+
369
+ class MorehMoeFusedMLP(nn.Module):
370
+ def __init__(self,
371
+ ffn_dim,
372
+ hidden_dim,
373
+ hidden_act_moe,
374
+ num_experts,
375
+ num_groups=1,
376
+ device=None,
377
+ continual_training=False):
378
+ super().__init__()
379
+ self.ffn_dim = ffn_dim
380
+ self.hidden_dim = hidden_dim
381
+ self.hidden_act_moe = hidden_act_moe
382
+
383
+ self.num_experts = num_experts
384
+ self.num_groups = num_groups
385
+
386
+ assert self.num_experts % self.num_groups == 0
387
+ self.num_experts_per_group = self.num_experts // self.num_groups
388
+
389
+ ## bsz, seq, group size, 2*ffn_size
390
+
391
+ moreh_ops = torch.ops.moreh
392
+ self.w13 = nn.ModuleList([
393
+ moreh_ops.MoeFanInLinear(self.hidden_dim,
394
+ self.ffn_dim * 2,
395
+ bias=False,
396
+ num_experts=self.num_experts_per_group,
397
+ device=device)
398
+ for _ in range(self.num_groups)
399
+ ])
400
+
401
+ self.w2 = nn.ModuleList([
402
+ moreh_ops.MoeFanOutLinear(self.ffn_dim,
403
+ self.hidden_dim,
404
+ bias=False,
405
+ num_experts=self.num_experts_per_group,
406
+ device=device)
407
+ for _ in range(self.num_groups)
408
+ ])
409
+
410
+ ## use silu?
411
+ self.act_fn = ACT2FN[self.hidden_act_moe]
412
+
413
+ if continual_training:
414
+ logger.info('two options: 1. zero-init all weights, 2. add scaling param to moe output.')
415
+ self._zero_init()
416
+
417
+ def _zero_init(self):
418
+ for module in self.w2:
419
+ for n,param in module.named_parameters():
420
+ logger.info(f'{n} {param.shape}')
421
+ param.data.zero_()
422
+
423
+
424
+ def forward(self, hidden_states, selected_experts, routing_weights):
425
+ w13_final_output = None
426
+ for group_idx in range(self.num_groups):
427
+ w13_output_in_group = self._get_w13_output(hidden_states,
428
+ selected_experts,
429
+ group_idx)
430
+ if w13_final_output is None:
431
+ w13_final_output = w13_output_in_group
432
+ else:
433
+ w13_final_output += w13_output_in_group
434
+
435
+ current_hidden_states = self.act_fn(
436
+ w13_final_output[:, :, :, :self.ffn_dim]
437
+ ) * w13_final_output[:, :, :, self.ffn_dim:]
438
+
439
+ final_hidden_states = None
440
+ for group_idx in range(self.num_groups):
441
+ w2_output_in_group = self._get_w2_output(current_hidden_states,
442
+ selected_experts,
443
+ routing_weights, group_idx)
444
+ if final_hidden_states is None:
445
+ final_hidden_states = w2_output_in_group
446
+ else:
447
+ final_hidden_states += w2_output_in_group
448
+ return final_hidden_states
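# Note on the fused layout used above (descriptive only): each grouped w13 projection
# emits 2 * ffn_dim features per selected expert; the first ffn_dim columns act as the
# gate (passed through act_fn) and the last ffn_dim as the up projection, mirroring
# gate_proj/up_proj in MotifMLP, before w2 projects the product back to hidden_dim and
# applies the routing weights.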
449
+
450
+ def _get_w13_output(self, hidden_states, selected_experts, group_idx):
451
+ selected_experts_in_group = selected_experts - (
452
+ group_idx * self.num_experts_per_group)
453
+
454
+ w13_output = self.w13[group_idx](hidden_states,
455
+ selected_experts_in_group)
456
+ return w13_output
457
+
458
+ def _get_w2_output(self, hidden_states, selected_experts, routing_weights,
459
+ group_idx):
460
+ selected_experts_in_group = selected_experts - (
461
+ group_idx * self.num_experts_per_group)
462
+ output = self.w2[group_idx](hidden_states, selected_experts_in_group,
463
+ routing_weights)
464
+ return output
465
+
466
+
467
+ class MoEGate(nn.Module):
468
+
469
+ def __init__(self, config):
470
+ super().__init__()
471
+ self.config = config
472
+ self.top_k = config.num_experts_per_tok
473
+ self.n_routed_experts = config.n_routed_experts
474
+ self.routed_scaling_factor = config.routed_scaling_factor
475
+ self.scoring_func = config.scoring_func
476
+ self.seq_aux = config.seq_aux
477
+ self.topk_method = config.topk_method
478
+ self.n_group = config.n_group
479
+ self.topk_group = config.topk_group
480
+
481
+ # topk selection algorithm
482
+ self.norm_topk_prob = config.norm_topk_prob
483
+ self.gating_dim = config.hidden_size
484
+ self.weight = nn.Parameter(
485
+ torch.empty((self.n_routed_experts, self.gating_dim)))
486
+ if self.topk_method == "noaux_tc":
487
+ self.e_score_correction_bias = nn.Parameter(
488
+ torch.empty((self.n_routed_experts)))
489
+ self.reset_parameters()
490
+
491
+ def reset_parameters(self) -> None:
492
+ import torch.nn.init as init
493
+
494
+ init.kaiming_uniform_(self.weight, a=math.sqrt(5))
495
+
496
+ def forward(self, hidden_states):
497
+ bsz, seq_len, h = hidden_states.shape
498
+ ### compute gating score
499
+ hidden_states = hidden_states.view(-1, h)
500
+ logits = F.linear(hidden_states.type(torch.float32),
501
+ self.weight.type(torch.float32), None)
502
+ if self.scoring_func == "sigmoid":
503
+ scores = logits.sigmoid()
504
+ else:
505
+ raise NotImplementedError(
506
+ f"insupportable scoring function for MoE gating: {self.scoring_func}"
507
+ )
508
+
509
+ ### select top-k experts
510
+ if self.topk_method == "greedy":
511
+ topk_weight, topk_idx = torch.topk(scores,
512
+ k=self.top_k,
513
+ dim=-1,
514
+ sorted=False)
515
+ elif self.topk_method == "group_limited_greedy":
516
+ group_scores = (scores.view(bsz * seq_len, self.n_group,
517
+ -1).max(dim=-1).values) # [n, n_group]
518
+ group_idx = torch.topk(group_scores,
519
+ k=self.topk_group,
520
+ dim=-1,
521
+ sorted=False)[1] # [n, top_k_group]
522
+ group_mask = torch.zeros_like(group_scores) # [n, n_group]
523
+ group_mask.scatter_(1, group_idx, 1) # [n, n_group]
524
+ score_mask = (group_mask.unsqueeze(-1).expand(
525
+ bsz * seq_len, self.n_group,
526
+ self.n_routed_experts // self.n_group).reshape(
527
+ bsz * seq_len, -1)) # [n, e]
528
+ tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
529
+ topk_weight, topk_idx = torch.topk(tmp_scores,
530
+ k=self.top_k,
531
+ dim=-1,
532
+ sorted=False)
533
+ elif self.topk_method == "noaux_tc":
534
+ ### will be used. ###
535
+ scores_for_choice = scores.view(
536
+ bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
537
+ group_scores = (scores_for_choice.view(
538
+ bsz * seq_len, self.n_group,
539
+ -1).topk(2, dim=-1)[0].sum(dim=-1)) # [n, n_group]
540
+ group_idx = torch.topk(group_scores,
541
+ k=self.topk_group,
542
+ dim=-1,
543
+ sorted=False)[1] # [n, top_k_group]
544
+ group_mask = torch.zeros_like(group_scores) # [n, n_group]
545
+ group_mask.scatter_(1, group_idx, 1) # [n, n_group]
546
+ score_mask = (group_mask.unsqueeze(-1).expand(
547
+ bsz * seq_len, self.n_group,
548
+ self.n_routed_experts // self.n_group).reshape(
549
+ bsz * seq_len, -1)) # [n, e]
550
+ tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(),
551
+ 0.0) # [n, e]
552
+ _, topk_idx = torch.topk(tmp_scores,
553
+ k=self.top_k,
554
+ dim=-1,
555
+ sorted=False)
556
+ topk_weight = scores.gather(1, topk_idx)
557
+ else:
558
+ raise NotImplementedError(
559
+ f"insupportable TopK function for MoE gating: {self.topk_method}"
560
+ )
561
+
562
+ ### norm gate to sum 1
563
+ if self.top_k > 1 and self.norm_topk_prob:
564
+ denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
565
+ topk_weight = topk_weight / denominator
566
+ topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
567
+
568
+ return topk_idx, topk_weight
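# The core of the "greedy" routing branch above, reduced to plain tensor ops so the
# returned shapes are easy to see (a sketch with illustrative values; the module may also
# renormalize the weights and multiply by routed_scaling_factor before returning):
bsz_g, seq_g, hidden_g, n_experts_g, top_k_g = 2, 3, 8, 4, 2
hs_g = torch.randn(bsz_g, seq_g, hidden_g)
gate_w_g = torch.randn(n_experts_g, hidden_g)
scores_g = F.linear(hs_g.view(-1, hidden_g).float(), gate_w_g.float()).sigmoid()
topk_weight_g, topk_idx_g = torch.topk(scores_g, k=top_k_g, dim=-1, sorted=False)
# topk_idx_g and topk_weight_g are both (bsz * seq_len, top_k), matching the return above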
569
+
570
+
571
+ class MotifMoE(nn.Module):
572
+ """
573
+ A mixed expert module containing shared experts.
574
+ """
575
+ def __init__(self, config):
576
+ super().__init__()
577
+ self.config = config
578
+ self.num_experts_per_tok = config.num_experts_per_tok
579
+ self.use_moreh_moe = config.use_moreh_moe
580
+ self.use_fused_mlp = config.use_fused_mlp
581
+
582
+ if hasattr(config, "ep_size") and config.ep_size > 1:
583
+ assert config.ep_size == dist.get_world_size()
584
+ assert not config.use_moreh_moe
585
+ self.ep_size = config.ep_size
586
+ self.experts_per_rank = config.n_routed_experts // config.ep_size
587
+ self.ep_rank = dist.get_rank()
588
+ self.experts = nn.ModuleList([
589
+ (DeepseekV3MLP(config,
590
+ intermediate_size=config.moe_intermediate_size)
591
+ if i >= self.ep_rank * self.experts_per_rank and i <
592
+ (self.ep_rank + 1) * self.experts_per_rank else None)
593
+ for i in range(config.n_routed_experts)
594
+ ])
595
+ else:
596
+ self.ep_size = 1
597
+ self.experts_per_rank = config.n_routed_experts
598
+ self.ep_rank = 0
599
+ if self.use_moreh_moe:
600
+ if not self.use_fused_mlp:
601
+ self.experts = MorehMoeMLP(
602
+ ffn_dim=config.moe_intermediate_size,
603
+ hidden_dim=config.hidden_size,
604
+ hidden_act_moe=config.hidden_act_moe,
605
+ num_experts=config.n_routed_experts,
606
+ device=None)
607
+ else:
608
+ ## group expert.
609
+ self.experts = MorehMoeFusedMLP(
610
+ ffn_dim=config.moe_intermediate_size,
611
+ hidden_dim=config.hidden_size,
612
+ hidden_act_moe=config.hidden_act_moe,
613
+ num_experts=config.n_routed_experts,
614
+ num_groups=config.n_group,
615
+ device=None,
616
+ continual_training=config.continual_training,
617
+ )
618
+ else:
619
+ self.experts = nn.ModuleList([
620
+ DeepseekV3MLP(
621
+ config, intermediate_size=config.moe_intermediate_size)
622
+ for i in range(config.n_routed_experts)
623
+ ])
624
+
625
+ self.gate = MoEGate(config)
626
+
627
+ def forward(self, hidden_states):
628
+ identity = hidden_states
629
+ orig_shape = hidden_states.shape
630
+ topk_idx, topk_weight = self.gate(hidden_states)
631
+ if self.use_moreh_moe:
632
+ y = self.experts(hidden_states, topk_idx.view(*orig_shape[:-1], -1),
633
+ topk_weight.view(*orig_shape[:-1], -1))
634
+ y = y.type(hidden_states.dtype)
635
+ else:
636
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
637
+ flat_topk_idx = topk_idx.view(-1)
638
+ if self.training:
639
+ hidden_states = hidden_states.repeat_interleave(
640
+ self.num_experts_per_tok, dim=0)
641
+ y = torch.empty_like(hidden_states)
642
+ for i, expert in enumerate(self.experts):
643
+ y[flat_topk_idx == i] = expert(
644
+ hidden_states[flat_topk_idx == i])
645
+ y = (y.view(*topk_weight.shape, -1) *
646
+ topk_weight.unsqueeze(-1)).sum(dim=1)
647
+ y = y.type(hidden_states.dtype)
648
+ y = y.view(*orig_shape)
649
+ # y = AddAuxiliaryLoss.apply(y, aux_loss)
650
+ else:
651
+ y = self.moe_infer(hidden_states, topk_idx,
652
+ topk_weight).view(*orig_shape)
653
+ return y, identity
654
+
655
+ @torch.no_grad()
656
+ def moe_infer(self, x, topk_ids, topk_weight):
657
+ cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
658
+ cnts.scatter_(1, topk_ids, 1)
659
+ tokens_per_expert = cnts.sum(dim=0)
660
+ idxs = topk_ids.view(-1).argsort()
661
+ sorted_tokens = x[idxs // topk_ids.shape[1]]
662
+ sorted_tokens_shape = sorted_tokens.shape
663
+ if self.ep_size > 1:
664
+ tokens_per_ep_rank = tokens_per_expert.view(self.ep_size,
665
+ -1).sum(dim=1)
666
+ tokens_per_expert_group = tokens_per_expert.new_empty(
667
+ tokens_per_expert.shape[0])
668
+ dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
669
+ output_splits = (tokens_per_expert_group.view(
670
+ self.ep_size, -1).sum(1).cpu().numpy().tolist())
671
+ gathered_tokens = sorted_tokens.new_empty(
672
+ tokens_per_expert_group.sum(dim=0).cpu().item(),
673
+ sorted_tokens.shape[1])
674
+ input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
675
+ dist.all_to_all(
676
+ list(gathered_tokens.split(output_splits)),
677
+ list(sorted_tokens.split(input_split_sizes)),
678
+ )
679
+ tokens_per_expert_post_gather = tokens_per_expert_group.view(
680
+ self.ep_size, self.experts_per_rank).sum(dim=0)
681
+ gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],),
682
+ dtype=np.int32)
683
+ s = 0
684
+ for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
685
+ gatherd_idxs[s:s + k] = i % self.experts_per_rank
686
+ s += k
687
+ gatherd_idxs = gatherd_idxs.argsort()
688
+ sorted_tokens = gathered_tokens[gatherd_idxs]
689
+ tokens_per_expert = tokens_per_expert_post_gather
690
+ tokens_per_expert = tokens_per_expert.cpu().numpy()
691
+
692
+ outputs = []
693
+ start_idx = 0
694
+ for i, num_tokens in enumerate(tokens_per_expert):
695
+ end_idx = start_idx + num_tokens
696
+ if num_tokens == 0:
697
+ continue
698
+ expert = self.experts[i + self.ep_rank * self.experts_per_rank]
699
+ tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
700
+ expert_out = expert(tokens_for_this_expert)
701
+ outputs.append(expert_out)
702
+ start_idx = end_idx
703
+
704
+ outs = torch.cat(outputs,
705
+ dim=0) if len(outputs) else sorted_tokens.new_empty(0)
706
+ if self.ep_size > 1:
707
+ new_x = torch.empty_like(outs)
708
+ new_x[gatherd_idxs] = outs
709
+ gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
710
+ dist.all_to_all(
711
+ list(gathered_tokens.split(input_split_sizes)),
712
+ list(new_x.split(output_splits)),
713
+ )
714
+ outs = gathered_tokens
715
+
716
+ new_x = torch.empty_like(outs)
717
+ new_x[idxs] = outs
718
+ final_out = (new_x.view(
719
+ *topk_ids.shape, -1).type(topk_weight.dtype).mul_(
720
+ topk_weight.unsqueeze(dim=-1)).sum(dim=1).type(new_x.dtype))
721
+ return final_out
722
 
723
 
724
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
725
+
726
+
727
+ """
728
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
729
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
730
+
731
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
732
+ if n_rep == 1:
733
+ return hidden_states
734
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
735
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
736
+ """
737
 
738
+ return torch.repeat_interleave(hidden_states, dim=1, repeats=n_rep)
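# Sanity check for the equivalence quoted in the docstring above: repeating each KV head
# n_rep times along dim 1 matches the expand/reshape variant (illustrative shapes only).
kv_demo = torch.randn(2, 4, 5, 8)          # (batch, num_key_value_heads, seqlen, head_dim)
n_rep_demo = 3
a_demo = torch.repeat_interleave(kv_demo, dim=1, repeats=n_rep_demo)
b_demo = kv_demo[:, :, None, :, :].expand(2, 4, n_rep_demo, 5, 8).reshape(2, 4 * n_rep_demo, 5, 8)
assert torch.equal(a_demo, b_demo)         # both: (batch, num_kv_heads * n_rep, seqlen, head_dim)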
739
+
740
 
 
741
  class MotifAttention(nn.Module):
742
  """
743
  Differential Attention (DiffAttention) module.
744
+
745
+ Implements the Differential Attention from
746
  "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
747
+
748
  Overview
749
  Standard transformers often over-allocate attention to irrelevant context.
750
+ DiffAttention addresses this by computing attention as the difference between
751
+ two separate softmax attention maps, effectively canceling noise and promoting
752
  sparse, structured attention patterns.
753
+
754
  Reference Implementation
755
  https://github.com/microsoft/unilm/tree/master/Diff-Transformer
756
+
757
  Args
758
+ The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
759
  λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
760
  - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
761
  - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
762
  - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
763
+
764
  """
765
 
766
  def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
 
840
  self.subln = MotifRMSNorm(2 * self.head_dim, eps=1e-5)
841
  self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
842
 
843
+ self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
844
  max_position_embeddings=self.max_position_embeddings,
845
  base=self.rope_theta)
846
 
 
886
  cos, sin = (self.rotary_emb(value_states, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
887
  if use_cache else position_embeddings)
888
 
889
+ query_states, key_states = apply_rotary_pos_emb(query_states,
890
+ key_states,
891
+ cos,
892
+ sin,
893
+ position_ids=position_ids,
894
+ fused_rope=self.config.fused_rope)
895
 
896
  if past_key_value is not None:
897
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
 
960
  return attn_output, attn_weights, past_key_value
961
 
962
 
 
963
  class MotifFlashAttention2(MotifAttention):
964
  """
965
  Motif flash attention module, following Motif attention module. This module inherits from `MotifAttention`
 
973
  def __init__(self, *args, **kwargs):
974
  super().__init__(*args, **kwargs)
975
 
976
+
977
 
978
  # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
979
  # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
 
981
 
982
  self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
983
 
984
+ logger.info(f'flash attention is used {not self._flash_attn_uses_top_left_mask}')
985
+
986
  def _reshape_heads(self, tensor, batch_size, seq_len):
987
  """2-way head split tensor reshape"""
988
  return tensor.reshape(batch_size, seq_len, self.num_heads, 2, self.head_dim)
 
992
  return tensor.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
993
 
994
  def _compute_attention(self, query_states, key_states, value_states, attention_mask, q_len, position_ids,
995
+ dropout_rate, sliding_window, is_moreh_attention, batch_num):
996
  """Flash Attention 2 implements"""
997
+
998
  scale_factor = 1.0 / math.sqrt(self.head_dim)
999
+ # Copied from _flash_attention_forward
1000
  if not self._flash_attn_uses_top_left_mask:
1001
  causal = self.is_causal
1002
  else:
1003
  causal = self.is_causal and q_len != 1
1004
+
1005
+ if is_moreh_attention:
1006
+ bsz = query_states.shape[0]
1007
 
1008
+ if batch_num:
1009
+ query_states = query_states.reshape(bsz*q_len,self.num_heads,self.head_dim)
1010
+ key_states = key_states.reshape(bsz*q_len,self.num_heads,self.head_dim)
1011
+ value_states = value_states.reshape(bsz*q_len,self.num_heads,self.head_dim)
1012
 
1013
+ attn_out = moreh_ops.flash_attention_varlen_dp(query_states,
1014
+ key_states,
1015
+ value_states,
1016
+ attention_mask,
1017
+ attention_mask,
1018
+ max_seqlen_q=q_len,
1019
+ max_seqlen_kv=q_len,
1020
+ dropout_p=dropout_rate,
1021
+ softmax_scale=scale_factor,
1022
+ is_causal=causal,
1023
+ batch_num=batch_num)
1024
+ attn_out = attn_out.reshape(bsz, q_len, self.num_heads, -1)
1025
+ else:
1026
+ return MorehFlashAttention(query_states,
1027
+ key_states,
1028
+ value_states,
1029
+ padding_mask=attention_mask,
1030
+ dropout_p=dropout_rate,
1031
+ softmax_scale=scale_factor,
1032
+ causal=causal)
1033
+ return attn_out
1034
+ else:
1035
+ attn_out = _flash_attention_forward(query_states,
1036
+ key_states,
1037
+ value_states,
1038
+ attention_mask,
1039
+ q_len,
1040
+ position_ids=position_ids,
1041
+ dropout=dropout_rate,
1042
+ sliding_window=sliding_window,
1043
+ is_causal=True,
1044
+ softmax_scale=scale_factor,
1045
+ use_top_left_mask=self._flash_attn_uses_top_left_mask)
1046
+ #logger.info(attn_out)
1047
+ return attn_out
1048
 
1049
  def forward(
1050
  self,
 
1078
  cos, sin = (self.rotary_emb(value_states, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
1079
  if use_cache else position_embeddings)
1080
 
1081
+ query_states, key_states = apply_rotary_pos_emb(query_states,
1082
+ key_states,
1083
+ cos,
1084
+ sin,
1085
+ position_ids=position_ids,
1086
+ fused_rope=False)
1087
 
1088
  if past_key_value is not None:
1089
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
 
1094
  value_states = repeat_kv(value_states, self.num_key_value_groups)
1095
  dropout_rate = 0.0 if not self.training else self.attention_dropout
1096
 
1097
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
1098
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
1099
+ # cast them back in float16 just to be sure everything works as expected.
1100
+ input_dtype = query_states.dtype
1101
+ if input_dtype == torch.float32 and MorehFlashAttention is None:
1102
+ if torch.is_autocast_enabled():
1103
+ target_dtype = torch.get_autocast_gpu_dtype()
1104
+ # Handle the case where the model is quantized
1105
+ elif hasattr(self.config, "_pre_quantization_dtype"):
1106
+ target_dtype = self.config._pre_quantization_dtype
1107
+ else:
1108
+ target_dtype = self.q_proj.weight.dtype
1109
+
1110
+ logger.warning_once(
1111
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
1112
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
1113
+ f" {target_dtype}.")
1114
+
1115
+ query_states = query_states.to(target_dtype)
1116
+ key_states = key_states.to(target_dtype)
1117
+ value_states = value_states.to(target_dtype)
1118
+
1119
  q_len = query_states.shape[-2]
1120
  kv_seq_len = key_states.shape[-2]
1121
 
 
1125
  value_states = value_states.transpose(1, 2)
1126
 
1127
  if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None
1128
+ and self.layer_idx >= self.config.max_window_layers and MorehFlashAttention is None):
1129
  sliding_window = self.config.sliding_window
1130
  else:
1131
  sliding_window = None
 
1145
  k1, k2 = k1.contiguous(), k2.contiguous()
1146
  v1, v2 = v1.contiguous(), v2.contiguous()
1147
 
1148
+ is_moreh_attention = MorehFlashAttention is not None
 
 
 
1149
 
1150
+ attn11, attn12 = self._compute_attention(q1, k1, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window, is_moreh_attention, self.batch_num), \
1151
+ self._compute_attention(q1, k1, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window, is_moreh_attention, self.batch_num)
1152
+ attn21, attn22 = self._compute_attention(q2, k2, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window, is_moreh_attention, self.batch_num), \
1153
+ self._compute_attention(q2, k2, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window, is_moreh_attention, self.batch_num)
1154
+
1155
+ attn1, attn2 = torch.cat([attn11, attn12], dim=-1), torch.cat([attn21, attn22], dim=-1)
1156
 
1157
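+ # Differential-attention combine: the queries/keys were split into two groups, each attended against
+ # both value halves and concatenated above. The lambda_* parameters below reweight the two maps,
+ # presumably following the Differential Transformer recipe (https://arxiv.org/abs/2410.05258), where
+ # the second map is subtracted with a learned lambda before the final (1 - lambda_init) scaling.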
  lambda_q1 = self.lambda_q1.unsqueeze(0).expand([bsz, self.lambda_q1.shape[0]]) # bsz, num_head
1158
  lambda_q2 = self.lambda_q2.unsqueeze(0).expand([bsz, self.lambda_q2.shape[0]]) # bsz, num_head
 
1168
  attn_output = attn_output * (1 - self.lambda_init)
1169
 
1170
  if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim * 2):
1171
+ raise ValueError(f"`attn_output` should be of size {(bsz, q_len, self.num_heads, 2*self.head_dim)}, but is"
1172
  f" {attn_output.size()}")
1173
 
1174
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
1175
  attn_output = self.o_proj(attn_output) * self.o_proj_alpha
1176
 
1177
+ return attn_output, None, past_key_value
1178
 
1179
 
 
1180
  class MotifSdpaAttention(MotifAttention):
1181
  """
1182
  Motif attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
 
1270
  MOTIF_ATTENTION_CLASSES = {
1271
  "eager": MotifAttention,
1272
  "flash_attention_2": MotifFlashAttention2,
1273
+ "sdpa": MotifAttention,
1274
  }
1275
 
1276
 
1277
  class MotifDecoderLayer(nn.Module):
1278
 
1279
+ def __init__(self, config: MotifConfig, moe_layer: bool, layer_idx: int):
1280
  super().__init__()
1281
  self.hidden_size = config.hidden_size
1282
+ if config.use_moreh_attention:
1283
+ config._attn_implementation = "flash_attention_2"
1284
  if config.sliding_window and config._attn_implementation != "flash_attention_2":
1285
  logger.warning_once(
1286
  f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
 
1290
  else:
1291
  self.self_attn = MOTIF_ATTENTION_CLASSES["eager"](config, layer_idx)
1292
  self.mlp = MotifMLP(config)
1293
+ # Optional Mixture-of-Experts (MoE) block
1294
+ self.moe = None
1295
+ if moe_layer:
1296
+ self.moe = MotifMoE(config)
1297
+
1298
+ RMSNorm = MorehRMSNorm if MorehRMSNorm is not None else MotifRMSNorm
1299
  self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1300
  self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1301
 
 
1364
  residual = hidden_states
1365
  hidden_states = self.post_attention_layernorm(hidden_states) * self.post_attention_layernorm_alpha
1366
 
1367
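+ # Shared-expert MoE layout (as used below): `self.moe` routes tokens to the small experts and also
+ # returns `identity`, the pre-MoE input, which goes through the dense `self.mlp` acting as the
+ # always-on shared expert.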
+ if self.moe is not None:
1368
+ hidden_states, identity = self.moe(hidden_states)
1369
+ ## Add the output of the shared expert (self.mlp) to the output of the small MoE experts.
1370
+ ## hidden_states must be a zero tensor on the first forward pass.
1371
+ hidden_states += self.mlp(identity)
1372
+ else:
1373
+ hidden_states = self.mlp(hidden_states)
1374
 
1375
  hidden_states = residual + hidden_states
1376
 
 
1389
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1390
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1391
  etc.)
1392
+
1393
  This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1394
  Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
1395
  and behavior.
1396
+
1397
  Parameters:
1398
  config ([`MotifConfig`]):
1399
  Model configuration class with all the parameters of the model. Initializing with a config file does not
 
1443
  module_std = module_std / math.sqrt(self.config.dim_model_base_lmh)  # lm_head scaling
1444
  else:
1445
  module_std = module_std
1446
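+ # Approximate truncated-normal init: sample N(0, module_std) and zero out entries beyond
+ # 3 * module_std (unlike trunc_normal_, outliers are zeroed rather than resampled).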
+ module.weight.data.normal_(mean=0.0, std=module_std)
1447
+ module.weight.data = torch.where(abs(module.weight.data) > module_std*3, 0, module.weight.data)
1448
+ #torch.nn.init.trunc_normal_(module.weight.data, mean=0.0, std=module_std, a=-3*module_std, b=3*module_std)
1449
  if module.bias is not None:
1450
  module.bias.data.zero_()
1451
 
1452
  elif isinstance(module, nn.Embedding):
1453
+ module.weight.data.normal_(mean=0.0, std=module_std)
1454
+ module.weight.data = torch.where(abs(module.weight.data) > module_std*3, 0, module.weight.data)
1455
+ #torch.nn.init.trunc_normal_(module.weight.data, mean=0.0, std=module_std, a=-3*module_std, b=3*module_std)
1456
  if module.padding_idx is not None:
1457
  module.weight.data[module.padding_idx].zero_()
1458
 
1459
 
1460
  @dataclass
1461
  class MotifModelOutputWithPast(ModelOutput):
1462
+ """
1463
+ This augments `BaseModelOutputWithPast` in `transformers.modeling_outputs` with new optional keys: `causal_mask`, `position_embeddings`.
1464
  The optional keys are currently used in the following ways:
1465
+ - pass information to the token-wise last attention layers in multi-token training
1466
  """
1467
  last_hidden_state: torch.FloatTensor = None
1468
  past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
 
1477
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1478
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1479
  it.
1480
+
1481
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1482
  [`PreTrainedTokenizer.__call__`] for details.
1483
+
1484
  [What are input IDs?](../glossary#input-ids)
1485
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1486
  Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1487
+
1488
  - 1 for tokens that are **not masked**,
1489
  - 0 for tokens that are **masked**.
1490
+
1491
  [What are attention masks?](../glossary#attention-mask)
1492
+
1493
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1494
  [`PreTrainedTokenizer.__call__`] for details.
1495
+
1496
  If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
1497
  `past_key_values`).
1498
+
1499
  If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1500
  and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1501
  information on the default strategy.
1502
+
1503
  - 1 indicates the head is **not masked**,
1504
  - 0 indicates the head is **masked**.
1505
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1506
  Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1507
  config.n_positions - 1]`.
1508
+
1509
  [What are position IDs?](../glossary#position-ids)
1510
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1511
  Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1512
  blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1513
  returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1514
+
1515
  Two formats are allowed:
1516
  - a [`~cache_utils.Cache`] instance, see our
1517
  [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
1518
  - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1519
  shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1520
  cache format.
1521
+
1522
  The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1523
  legacy cache format will be returned.
1524
+
1525
  If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1526
  have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1527
  of shape `(batch_size, sequence_length)`.
 
1554
  class MotifModel(MotifPreTrainedModel):
1555
  """
1556
  Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MotifDecoderLayer`]
1557
+
1558
  Args:
1559
  config: MotifConfig
1560
  """
 
1566
  self.multi_token_heads = config.multi_token_heads
1567
 
1568
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1569
+ # NOTE: For multi-token models, the last decoder layers (one for each token index)
1570
+ # are implemented as a part of `MotifModelForCausalLM` to enable a custom forward-backward procedure.
1571
 
1572
  num_hidden_layers = config.num_hidden_layers if self.multi_token_heads is None else config.num_hidden_layers - 1
1573
+ if config.moe:
1574
+ moe_layer = [True for i in range(num_hidden_layers)]
1575
+ else:
1576
+ moe_layer = [False for i in range(num_hidden_layers)]
1577
+ logger.info(f'MoE layer flags: {moe_layer}')
1578
+ self.layers = nn.ModuleList([MotifDecoderLayer(config=config, moe_layer=moe_layer[layer_idx],
1579
+ layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)])
1580
  self._attn_implementation = config._attn_implementation
1581
+ RMSNorm = MorehRMSNorm if MorehRMSNorm is not None else MotifRMSNorm
1582
  self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1583
  self.hidden_size = config.hidden_size
1584
  self.num_heads = config.num_attention_heads
 
1592
  self.gradient_checkpointing = False
1593
  self.post_init()
1594
 
1595
+ self.use_pipeline = config.use_pipeline
1596
+ if self.use_pipeline:
1597
+ logger.info('Using reinforced pipeline parallelism.')
1598
+ if config.num_stages == 2:
1599
+ ### MoE version
1600
+ if config.decontam_attn:
1601
+ self.split_layers = [15]
1602
+ else:
1603
+ if num_hidden_layers == 32:
1604
+ self.split_layers = [14] # 14: 15,17 # 13: 14:18
1605
+ else:
1606
+ self.split_layers = [6]
1607
+ elif config.num_stages == 3:
1608
+ self.split_layers = [9, 20] ## 10, 11, 11
1609
+ else:
1610
+ self.split_layers = [6, 15, 24] #7(0,7),9(6,15),9(15,24),7(24,31)
1611
+ logger.info(f'Pipeline split layers (MoE): {self.split_layers}')
1612
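+ # `split_layers` holds the decoder-layer indices at which hidden states are handed to the next
+ # pipeline stage (see the `torch.moreh.pipeline_assign` call in `forward`), splitting the stack
+ # into `num_stages` roughly balanced chunks.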
+
1613
+ self.scale_emb = 1
1614
+
1615
+ # Reparameterization <|_1_|>
1616
+ if config.wesar_weights:
1617
+ logger.info(f'config.wesar_weights {config.wesar_weights}')
1618
+ self.norm_alpha = nn.Parameter(torch.tensor(1).float())
1619
+ self.scale_emb = 10
1620
+ else:
1621
+ self.norm_alpha = 1
1622
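+ # With `wesar_weights`, the residual stream is reparameterized: embeddings are scaled by
+ # `scale_emb` (10) and the final norm output by the learnable `norm_alpha`; MotifForCausalLM
+ # adds a matching `lm_head_alpha` on the lm head input.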
+
1623
  def get_input_embeddings(self):
1624
  return self.embed_tokens
1625
 
 
1658
  "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
1659
  use_cache = False
1660
 
1661
+ # kept for BC (non `Cache` `past_key_values` inputs)
1662
  return_legacy_cache = False
1663
  if use_cache and not isinstance(past_key_values, Cache):
1664
  return_legacy_cache = True
 
1672
  "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)")
1673
 
1674
  if inputs_embeds is None:
1675
+ inputs_embeds = self.embed_tokens(input_ids) * self.scale_emb
1676
 
1677
  if cache_position is None:
1678
  past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1679
  cache_position = torch.arange(past_seen_tokens,
1680
  past_seen_tokens + inputs_embeds.shape[1],
1681
  device=inputs_embeds.device)
1682
  if position_ids is None:
1684
  position_ids = cache_position.unsqueeze(0)
1685
+
1686
  causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_key_values,
1687
  output_attentions)
1688
 
 
1725
  )
1726
 
1727
  hidden_states = layer_outputs[0]
1728
+
1729
+
1730
+ if self.use_pipeline and idx in self.split_layers:
1731
+ hidden_states = torch.moreh.pipeline_assign(hidden_states)
1732
 
1733
  if use_cache:
1734
  next_decoder_cache = layer_outputs[2 if output_attentions else 1]
 
1736
  if output_attentions:
1737
  all_self_attns += (layer_outputs[1], )
1738
 
1739
+ # <|_2_|>
1740
+ hidden_states = self.norm(hidden_states)* self.norm_alpha
1741
+
1742
  # add hidden states from the last decoder layer
1743
  if output_hidden_states:
1744
  all_hidden_states += (hidden_states, )
 
1770
  output_attentions: bool,
1771
  ):
1772
  if self.config._attn_implementation == "flash_attention_2":
1773
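+ # Moreh flash attention consumes the 2D padding mask directly, so it is returned unchanged
+ # instead of being dropped for the fully-unmasked case below.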
+ if MorehFlashAttention is not None:
1774
+ return attention_mask
1775
  if attention_mask is not None and 0.0 in attention_mask:
1776
  return attention_mask
1777
  return None
 
1843
  """
1844
  Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1845
  `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
1846
+
1847
  Args:
1848
  attention_mask (`torch.Tensor`):
1849
  A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
 
1901
  self.vocab_size = config.vocab_size
1902
  self.multi_token_heads = config.multi_token_heads
1903
 
1904
+ if self.multi_token_heads is None:
1905
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1906
+ else:
1907
+ self.tokenwise_last_layers = nn.ModuleList(
1908
+ # moe_layer=False: the token-wise last layers are assumed to be dense (no MoE).
+ [MotifDecoderLayer(config, moe_layer=False, layer_idx=config.num_hidden_layers - 1) for _ in range(self.multi_token_heads)])
1909
+ self.tokenwise_lm_heads = nn.ModuleList(
1910
+ [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(self.multi_token_heads)])
1911
+ self.should_skip_separate_backward_pass = self.multi_token_heads is not None
1912
 
1913
  # Initialize weights and apply final processing
1914
  self.post_init()
1915
+
1916
+ # <|_3_|>
1917
+ if config.muP:
1918
+ self.lm_head.__do_scale_tager_mu_dim_base_model__ = True
1919
+
1920
+ # <|_4_|>
1921
+ self.lm_head_alpha = 1
1922
+ if config.wesar_weights:
1923
+ self.lm_head_alpha = nn.Parameter(torch.tensor(1).float())
1924
+
1925
  if getattr(config, "tie_word_embeddings", True):
1926
  logger.info('tie embeddings')
1927
  self.tie_weights()
1928
+ else:
1929
+ # <|_5_|>
1930
+ self.lm_head.__do_scale_tager_mu_dim_base_model__ = False
1931
 
1932
  def get_input_embeddings(self):
1933
  return self.model.embed_tokens
 
1947
  def get_decoder(self):
1948
  return self.model
1949
 
1950
+ def multi_token_forward_backward(self,
1951
+ hidden_states: torch.FloatTensor,
1952
+ outputs: MotifModelOutputWithPast,
1953
+ labels: torch.LongTensor,
1954
+ position_ids: Optional[torch.LongTensor],
1955
+ output_attentions: Optional[bool],
1956
+ use_cache: Optional[bool],
1957
+ cache_position: Optional[torch.LongTensor],
1958
+ return_dict: Optional[bool],
1959
+ num_logits_to_keep: int = 0) -> CausalLMOutputWithPast:
1960
+ """
1961
+ This implements the main forward-backward procedure for multi-token model training proposed in
1962
+ the paper https://arxiv.org/abs/2404.19737.
1963
+ Essentially,
1964
+ - The multi-token model tries to predict n (instead of 1) tokens at a time.
1965
+ - Applying this only during training and using first-token prediction during inference is still helpful.
1966
+ - The change in architecture: when using n-token prediction, each token index (between 1 and n) has its own
1967
+ (1) last attention layer and (2) lm head.
1968
+ - The change in loss: sum of cross-entropy losses corresponding to each token index.
1969
+ - Custom forward-backward procedure for memory efficiency: refer to the implementation of `multi_head_forward_backward`.
1970
+ """
1971
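+ # Objective sketch: head i (0-indexed) has its own last decoder layer and lm head, predicts the
+ # token (i + 1) positions ahead, and contributes CE(logits_i[:, :-(i + 1)], labels[:, (i + 1):]);
+ # `multi_head_forward_backward` combines the per-head losses (and, for memory efficiency, handles
+ # their backward passes) as described in the docstring above.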
+ if not return_dict:
1972
+ raise NotImplementedError("return_dict must be True for multi-token training")
1973
+
1974
+ past_key_values = outputs.past_key_values
1975
+ causal_mask = outputs.causal_mask
1976
+ position_embeddings = outputs.position_embeddings
1977
+
1978
+ if labels is not None:
1979
+ labels = labels.to(hidden_states.device)
1980
+
1981
+ def _tokenwise_forward(hidden_states: torch.Tensor, token_idx):
1982
+ ## Model forward
1983
+ layer = self.tokenwise_last_layers[token_idx]
1984
+ lm_head = self.tokenwise_lm_heads[token_idx]
1985
+
1986
+ layer_outputs = layer(
1987
+ hidden_states,
1988
+ attention_mask=causal_mask,
1989
+ position_ids=position_ids,
1990
+ past_key_values=past_key_values, # TODO: update past_key_values?
1991
+ output_attentions=output_attentions,
1992
+ use_cache=use_cache,
1993
+ cache_position=cache_position,
1994
+ position_embeddings=position_embeddings,
1995
+ )
1996
+ last_hidden_states = layer_outputs[0]
1997
+ if num_logits_to_keep > 0:
1998
+ assert labels is None
1999
+ last_hidden_states = last_hidden_states[:, -num_logits_to_keep:, :]
2000
+ tokenwise_logits = lm_head(last_hidden_states)
2001
+
2002
+ if labels is None:
2003
+ return {
2004
+ "loss": None,
2005
+ "logits": tokenwise_logits,
2006
+ }
2007
+
2008
+ ## Compute loss
2009
+ shift_n = token_idx + 1
2010
+ shift_logits = tokenwise_logits[..., :-shift_n, :].contiguous()
2011
+ shift_labels = labels[..., shift_n:].contiguous()
2012
+
2013
+ loss_fct = CrossEntropyLoss()
2014
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
2015
+ shift_labels = shift_labels.view(-1)
2016
+
2017
+ tokenwise_loss = loss_fct(shift_logits, shift_labels)
2018
+
2019
+ return {
2020
+ "loss": tokenwise_loss,
2021
+ "logits": tokenwise_logits,
2022
+ }
2023
+
2024
+ head_fns = [
2025
+ lambda hidden_states, token_idx=token_idx: _tokenwise_forward(hidden_states, token_idx)
2026
+ for token_idx in range(self.multi_token_heads)
2027
+ ]
2028
+ loss, logits = multi_head_forward_backward(hidden_states,
2029
+ head_fns,
2030
+ return_keys=("loss", "logits"),
2031
+ return_only_first_head=True)
2032
+
2033
+ if not return_dict:
2034
+ output = (logits, ) + outputs[1:]
2035
+ return (loss, ) + output
2036
+
2037
+ return CausalLMOutputWithPast(
2038
+ loss=loss,
2039
+ logits=logits,
2040
+ past_key_values=outputs.past_key_values,
2041
+ hidden_states=outputs.hidden_states,
2042
+ attentions=outputs.attentions,
2043
+ )
2044
+
2045
  @add_start_docstrings_to_model_forward(MOTIF_INPUTS_DOCSTRING)
2046
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
2047
  def forward(
 
2066
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
2067
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
2068
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
2069
+
2070
  num_logits_to_keep (`int`, *optional*):
2071
  Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
2072
  `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
2073
  token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
2074
+
2075
  Returns:
2076
+
2077
  Example:
2078
+
2079
  ```python
2080
  >>> from transformers import AutoTokenizer, MotifForCausalLM
2081
+
2082
+ >>> model = MotifForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
2083
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
2084
+
2085
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
2086
  >>> inputs = tokenizer(prompt, return_tensors="pt")
2087
+
2088
  >>> # Generate
2089
  >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
2090
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
2097
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2098
 
2099
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
2100
+ outputs_include_causal_mask = self.multi_token_heads is not None
2101
+ outputs_include_position_embeddings = self.multi_token_heads is not None
2102
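+ # For multi-token training, the token-wise last layers reuse the base model's causal mask and
+ # position embeddings, so ask the base model to return them.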
  outputs: MotifModelOutputWithPast = self.model(
2103
  input_ids=input_ids,
2104
  attention_mask=attention_mask,
 
2110
  output_hidden_states=output_hidden_states,
2111
  return_dict=return_dict,
2112
  cache_position=cache_position,
2113
+ outputs_include_causal_mask=outputs_include_causal_mask,
2114
+ outputs_include_position_embeddings=outputs_include_position_embeddings,
2115
  )
2116
 
2117
  hidden_states = outputs[0]
2118
 
2119
+ if self.multi_token_heads is not None:
2120
+ return self.multi_token_forward_backward(hidden_states,
2121
+ outputs,
2122
+ labels,
2123
+ position_ids,
2124
+ output_attentions,
2125
+ use_cache,
2126
+ cache_position,
2127
+ return_dict,
2128
+ num_logits_to_keep=num_logits_to_keep)
2129
+
2130
  # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
2131
+ hidden_states = hidden_states * self.lm_head_alpha
2132
  logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
2133
  logits = logits.float()
2134
 
2135
  loss = None
2136
  if labels is not None:
2137
  # Shift so that tokens < n predict n
2139
  shift_logits = logits[..., :-1, :].contiguous()
2140
  shift_labels = labels[..., 1:].contiguous()
 
2156
  past_key_values=outputs.past_key_values,
2157
  hidden_states=outputs.hidden_states,
2158
  attentions=outputs.attentions,
2159
+ )