leejunhyeok and eunhwanpark-motiftech committed on
Commit 0a87b16 · 1 Parent(s): cc0f6ea

Update modeling_motif.py (#1)


- Update modeling_motif.py (f362538a86e469b8b3e07d66dd1d2f23aa4ef81b)
- bugfix (91c40cee07724f4fa12fec66cafa1a2546fd2dbd)
- Update modeling_motif.py (a55dcfd62edeef5966ab77bd04f3622b24cb4175)
- Update modeling_motif.py (2a76ec89b014032a79e5b67a9af06cd960efb51b)
- Update config.json (c28133974bedd8beb8e4d50db3c466e58798d139)
- Update modeling_motif.py (6230f7725953151d5c33872ba28180d75e9ecc7c)
- Update modeling_motif.py (db404ffc2c810b0325e72b181b96bb67535ec217)
- Update config.json (0d851ca4ecccf20f0e1368c2ff83c6e802d23e6d)
- Update modeling_motif.py (607612fcbf1067b65f29c92a4326ea261a97e0e1)
- Update modeling_motif.py (9b405398aa85b7621786d0793cd05f1343e661d9)
- Update modeling_motif.py (bdd0329e7e0cf8c3e2095f665244351a7e1e4c55)
- Update modeling_motif.py (38eae03f20a4d46b13ef774fab49a4043b2a5697)
- Update modeling_motif.py (8855d030415e3f65179c8aad8b3733a6d0ad644d)
- Update modeling_motif.py (20b97f12bd6cfbd1115d7c8d2e3ba19a27d55471)
- Update modeling_motif.py (72cc86d5a3b4d20faf06536fa16a841811c7b475)
- Update modeling_motif.py (80e1a1c498842e2d58fd8e7abf320753eb758ddc)
- Update config.json (9be8a4a64fa7d9cdcb7ec479918fa03034efde8a)
- Update modeling_motif.py (6d0fba540a217fb66acc35aca103c1981aa79ef0)
- Update modeling_motif.py (bd7180cf47aa477313e74d5b739105f2700bdddf)
- Update modeling_motif.py (5ac76cbcfa8efa114a15d46288178215c919dd7f)
- Update modeling_motif.py (5bd1ac1f84c476bf5b1a4fe7535db793809906de)
- Update modeling_motif.py (a4c14b0328abf4e3ae6b0213d777f75e0f3432e7)
- Update config.json (22c1361731f8565e0776c7aee2a0aa460657aa09)
- Update modeling_motif.py (7ebd625f70f9cce13df77f0520364d6f3b26994d)
- Update modeling_motif.py (d48c60537b21d0c3757b2292b47f6b1055e7ef1b)
- Update config.json (756b7c09f36b5a5bb689e8efbd894fd20c5699e5)
- Update config.json (8f9e2731310565ac33e8f6960b19c56be99698d2)
- Update config.json (4321f887f6e1fa1161e114d91ccacb2b8361e750)
- Update config.json (da650a07d5165dde18b47a5b07f5f8de47365f79)
- Update config.json (76d2431ef87346a08c6187122cd60964c3d035db)
- Update modeling_motif.py (cce6844530ccb252625158c7e4bfd5d603f0e475)
- Update modeling_motif.py (c4b7f5e79b10d819c7ced148c4aa2340bb4b82c8)
- Update config.json (498f8bc0c383405c33be3a06a1182ada10ff212d)
- Update modeling_motif.py (f76fc654662109d4549c8ed558fc641137e8a91d)
- Update config.json (8dbdeff6683e4e6555c9c4cf703eee8473de8059)
- Update config.json (1d63261c89ce9d31071d93ac43201d94937fe089)
- Update modeling_motif.py (cfa11bc205bf18888fe7f8dff1ecdce292a615a8)
- Update config.json (14c71d590d1b610bca1480c9066fdd9e18cdfdcb)
- Update config.json (a95fee7e4e0105080cf758bec90d2f6a16205702)
- Update modeling_motif.py (83d1f84e9a8c3bde8109b2a66e38d06f04972b39)
- Update config.json (37ed6924ae8c26c9fe130435bb1c23399dc9a51d)
- Update config.json (eb7a67bdbdc31bcc931a7eacc44f4a4a07751b17)
- Update modeling_motif.py (27913c020c6192fcde29c9e18165a9db8280bc8d)
- Update modeling_motif.py (ba7c5761e3eba518b64edf32d46422fec2175554)
- Update modeling_motif.py (7ec81c6e7fea3393085d6d9d94017bd4d8cbbfd6)
- Update configuration_motif.py (7d2479ad9374145522214574abfda2d012c6d5a4)
- Update modeling_motif.py (d56ef752fd1db815e87b88e6bf4791f558d6363b)
- Update modeling_motif.py (6187f7768db6406e4ba951b99281eb2274c92130)
- Update modeling_motif.py (a162402203d84f6c89c8159070e1457f5613005c)
- Update modeling_motif.py (0ff3917f6694c08e6768a2940b4ba8f508b209b8)
- Update modeling_motif.py (4293a010c114394bff10c64a621d22937df61f46)
- Update modeling_motif.py (7d405b99a275c6241c34279f548dc9a7680c6906)
- Update modeling_motif.py (03a80ebc5d1df4a3f81667d6a2d54608957cd57e)
- Update config.json (c8cabde5806c7009379fb69d1adfe066d2dab70f)
- Update modeling_motif.py (8bdf2ec91a74f9cfda0557ad56999915cb7ac9d1)
- Update modeling_motif.py (097873e443d8e170c89992527e03615d15a71a2e)
- Update modeling_motif.py (95a3a69d05f9284a6241310ba17a92d8db544c02)
- Update README.md (a14cdf2cc082f3be9d297d14287bb9ae7d4d3f88)
- Update configuration_motif.py (7ed4264a3a35b2b61b29dc31c9833135f4b812cd)
- Update generation_config.json (8d10c7cefc977930e3dc46c50b67409bcc28e650)
- Update README.md (48ae3c671d073dfff611e9e1c209fb2ae5f78b43)
- Update README.md (96a8455fc6078a63ad4bb3f340a76ff1c21bc7d7)
- Update README.md (9a018e3364b5300685f90cb5ceebbeb63fb78799)
- Update configuration_motif.py (a77c948640d4559106e5c0e0b59fb8c58a9af0f6)


Co-authored-by: Eunhwan Park <[email protected]>

Files changed (5)
  1. README.md +37 -1
  2. config.json +5 -59
  3. configuration_motif.py +5 -89
  4. generation_config.json +1 -1
  5. modeling_motif.py +108 -1001
README.md CHANGED
@@ -195,4 +195,40 @@ The benchmarks and corresponding scores listed in the table below are taken dire
  |MBPP|0-shot|53.9|62.2|60.3|+11.87%|-3.05%|
  |MBPP+|0-shot|44.4|50.6|50.8|+14.41%|+0.40%|
  |MultiPL-E|0-shot|22.6|34.9|-|-|-|
- |||||**Average**|**+18.55%**|**+1.12%**|
+ |||||**Average**|**+18.55%**|**+1.12%**|
+
+
+ ## How to use
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "Motif-Technologies/Motif-2.6B",
+     trust_remote_code = True,
+     _attn_implementation = "eager", # also supports flash_attention_2
+ ).cuda()
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     "Motif-Technologies/Motif-2.6B",
+     trust_remote_code = True,
+ )
+
+ query = "What is the capital city of South Korea?"
+ input_ids = tokenizer.apply_chat_template(
+     [
+         {'role': 'system', 'content': 'you are a helpful assistant'},
+         {'role': 'user', 'content': query},
+     ],
+     add_generation_prompt = True,
+     return_tensors='pt',
+ ).cuda()
+
+ output = model.generate(input_ids, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id)
+ output = tokenizer.decode(output[0, input_ids.shape[-1]:], skip_special_tokens = True)
+ print(output)
+
+ """
+ The capital city of South Korea is Seoul. Located in the southern part of the country, Seoul is not only the largest city in South Korea but also one of the largest metropolitan areas in the world.
+ It is a vibrant and dynamic city known for its rich history, cultural heritage, and modern amenities. Seoul is a major economic, cultural, and political center in East Asia, and it plays a crucial role in the region's politics, economy, and culture.
+ The city is divided into different administrative districts, each with its own unique characteristics and attractions.
+ """
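For illustration, a minimal variant of the loading call above that takes the FlashAttention-2 path mentioned in the comment. This is a sketch, assuming the `flash-attn` package and a compatible GPU are available:

```python
import torch
from transformers import AutoModelForCausalLM

# Same checkpoint as above, but with the flash_attention_2 implementation and
# bfloat16 weights (matching the torch_dtype declared in config.json below).
model = AutoModelForCausalLM.from_pretrained(
    "Motif-Technologies/Motif-2.6B",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2",
).cuda()
```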
config.json CHANGED
@@ -8,82 +8,28 @@
  "AutoConfig": "configuration_motif.MotifConfig",
  "AutoModelForCausalLM": "modeling_motif.MotifForCausalLM"
  },
- "bfloat16": true,
  "bos_token_id": 219396,
- "continual_training": false,
- "decoder_split_layers": [],
- "decontam_attn": false,
- "dim_model_base": 2048,
- "dim_model_base_attn": 128,
- "dim_model_base_init": 2048,
- "dim_model_base_lmh": 1,
- "dim_model_base_logits": 2048,
- "dim_model_base_lr": 256,
- "down_proj_alpha": 0.15625,
- "embed_tokens_alpha": null,
- "encoder_split_layers": [],
  "eos_token_id": 219395,
- "first_expansion": false,
- "fused_rope": true,
- "gate_up_proj_alpha": 0.15625,
  "hidden_act": "poly_norm",
- "hidden_act_moe": null,
  "hidden_size": 2048,
- "hidden_states_shrink": 0.17677669529663687,
- "init_scale_o": 1,
  "initializer_range": 2e-05,
- "input_layernorm_alpha": null,
  "intermediate_size": 8192,
- "k_proj_alpha": 0.15625,
- "lm_head_alpha": null,
  "loss_reduction": "mean",
  "max_position_embeddings": 16384,
  "max_window_layers": 28,
- "mix_attn": false,
  "model_type": "Motif",
- "moe": false,
- "moe_intermediate_size": null,
- "moe_layer": false,
- "muP": false,
- "multi_token_heads": null,
- "n_group": null,
- "n_routed_experts": null,
- "norm_alpha": null,
- "norm_topk_prob": null,
  "num_attention_heads": 16,
  "num_hidden_layers": 32,
  "num_key_value_heads": 16,
- "num_stages": false,
- "o_proj_alpha": 0.15625,
- "post_attention_layernorm_alpha": null,
- "q_proj_alpha": 0.15625,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 500000.0,
- "routed_scaling_factor": null,
- "scale_emb": 1,
- "scoring_func": null,
- "seq_aux": null,
  "sliding_window": null,
- "tensor_parallel": true,
  "tie_word_embeddings": true,
- "topk_group": null,
- "topk_method": null,
- "torch_dtype": "float32",
- "transformers_version": "4.51.3",
- "use_advanced_parallelization": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.46.3",
  "use_bias": false,
- "use_cache": false,
- "use_emb_alpha": false,
- "use_fused_mlp": null,
- "use_moreh_attention": true,
- "use_moreh_moe": false,
- "use_mrope": false,
- "use_norm_alpha": false,
- "use_pipeline": false,
- "use_qk_norm": false,
+ "use_cache": true,
  "use_sliding_window": false,
- "v_proj_alpha": 0.15625,
- "vocab_size": 219520,
- "wesar_weights": false
- }
+ "vocab_size": 219520
+ }

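As a quick sanity check, the trimmed config can be loaded and inspected with `AutoConfig`; this is a sketch (not part of the repository), and the expected values in the comments simply mirror the keys shown in the diff above:

```python
from transformers import AutoConfig

# trust_remote_code is required because config.json maps AutoConfig to the
# custom configuration_motif.MotifConfig class.
config = AutoConfig.from_pretrained("Motif-Technologies/Motif-2.6B", trust_remote_code=True)

print(config.model_type)   # "Motif"
print(config.hidden_act)   # "poly_norm"
print(config.torch_dtype)  # torch.bfloat16
print(config.use_cache)    # True
```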
configuration_motif.py CHANGED
@@ -1,8 +1,9 @@
+ import math
+ from typing import Optional
+
  from transformers.configuration_utils import PretrainedConfig
  from transformers.modeling_rope_utils import rope_config_validation
  from transformers.utils import logging
- from typing import Optional
- import math

  logger = logging.get_logger(__name__)

@@ -13,11 +14,8 @@ class MotifConfig(PretrainedConfig):
  Motif model according to the specified arguments, defining the model architecture. Instantiating a configuration
  with the defaults will yield a similar configuration to that of
  Motif-102B [moreh/Motif-102B](https://huggingface.co/moreh/Motif-102B).
-
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
  documentation from [`PretrainedConfig`] for more information.
-
-
  Args:
  vocab_size (`int`, *optional*, defaults to 151936):
  Vocabulary size of the Motif model. Defines the number of different tokens that can be represented by the
@@ -97,16 +95,12 @@ class MotifConfig(PretrainedConfig):
  The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
  attention_dropout (`float`, *optional*, defaults to 0.0):
  The dropout ratio for the attention probabilities.
-
  ```python
  >>> from transformers import MotifModel, MotifConfig
-
  >>> # Initializing a Motif style configuration
  >>> configuration = MotifConfig()
-
  >>> # Initializing a model from the Motif-102B style configuration
  >>> model = MotifModel(configuration)
-
  >>> # Accessing the model configuration
  >>> configuration = model.config
  ```"""
@@ -134,13 +128,8 @@ class MotifConfig(PretrainedConfig):
  sliding_window=4096,
  max_window_layers=28,
  attention_dropout=0.0,
- multi_token_heads: Optional[int] = None,
  **kwargs,
  ):
- """
- Arguments:
- multi_token_heads: If not None, use multi-token heads as in the paper https://arxiv.org/pdf/2404.19737
- """

  self.vocab_size = vocab_size
  self.max_position_embeddings = max_position_embeddings
@@ -165,87 +154,14 @@ class MotifConfig(PretrainedConfig):
  self.rope_scaling = rope_scaling
  self.attention_dropout = attention_dropout

- ###kwargs
-
- # some scale factors
-
- self.scale_emb = getattr(kwargs, "scale_emb", 1)
- self.init_scale_o = getattr(kwargs, "init_scale_o", 1)
-
- # muparam
- self.hidden_states_shrink = 1 / math.sqrt(num_hidden_layers)
- self.dim_model_base = hidden_size
- self.dim_model_base_attn = (hidden_size // num_attention_heads)
- self.dim_model_base_init = hidden_size
- self.dim_model_base_lr = getattr(kwargs, "dim_model_base_lr", hidden_size//8)
- self.dim_model_base_lmh = 1
- self.dim_model_base_logits = hidden_size
-
- self.muP = getattr(kwargs, "muP", False)
- # proxy hidden size ( following YuLan-Mini )
- # reparameterization(wesar_weights)
- logger.info(kwargs)
- self.wesar_weights = getattr(kwargs, "wesar_weights", False)
- logger.info(f'initial wesar reparameterization : {self.wesar_weights}')
-
- # alpha (scale factor)
- self.embed_tokens_alpha = getattr(kwargs, "embed_tokens_alpha", None)
- self.q_proj_alpha = getattr(kwargs, "q_proj_alpha", None)
- self.k_proj_alpha = getattr(kwargs, "k_proj_alpha", None)
- self.v_proj_alpha = getattr(kwargs, "v_proj_alpha", None)
- self.o_proj_alpha = getattr(kwargs, "o_proj_alpha", None)
- self.down_proj_alpha = getattr(kwargs, "down_proj_alpha", None)
- self.gate_up_proj_alpha = getattr(kwargs, "gate_up_proj_alpha", None)
- self.input_layernorm_alpha = getattr(kwargs, "input_layernorm_alpha", None)
- self.post_attention_layernorm_alpha = getattr(kwargs, "post_attention_layernorm_alpha", None)
- self.norm_alpha = getattr(kwargs, "norm_alpha", None)
- self.lm_head_alpha = getattr(kwargs, "lm_head_alpha", None)
- self.use_norm_alpha = getattr(kwargs, "use_norm_alpha", False)
- self.use_emb_alpha = getattr(kwargs, "use_emb_alpha", False)
-
  # Validate the correctness of rotary position embeddings parameters
  # BC: if there is a 'type' field, move it to 'rope_type'.
  if self.rope_scaling is not None and "type" in self.rope_scaling:
  self.rope_scaling["rope_type"] = self.rope_scaling["type"]
  rope_config_validation(self)
-
- self.multi_token_heads = multi_token_heads
- self.multi_token_config_validation()
-
-
-
- # moe
- self.topk_method = getattr(kwargs, "topk_method", None)
- self.scoring_func = getattr(kwargs, "scoring_func", None)
- self.routed_scaling_factor = getattr(kwargs, "routed_scaling_factor", None)
- self.norm_topk_prob = getattr(kwargs, "norm_topk_prob", None)
- self.seq_aux = getattr(kwargs, "seq_aux", None)
- self.hidden_act_moe = getattr(kwargs, "hidden_act_moe", None)
-
-
- self.n_group = getattr(kwargs, "n_group", None)
- self.n_routed_experts = getattr(kwargs, "n_routed_experts", None)
- self.moe_intermediate_size = getattr(kwargs, "moe_intermediate_size", None)
- self.topk_group = getattr(kwargs, "topk_group", None)
-
-
- self.use_fused_mlp = getattr(kwargs, "use_fused_mlp", None)
- self.use_moreh_moe = getattr(kwargs, "use_moreh_moe", False)
- self.continual_training = getattr(kwargs, "continual_training", False)
-
- # external
- self.first_expansion = getattr(kwargs, "first_expansion", False)
- self.moe_layer = getattr(kwargs, "moe_layer", False)
-
-
-
+
  super().__init__(
  tie_word_embeddings=tie_word_embeddings,
  **kwargs,
  )
- logger.info(f' kwargs : {kwargs}')
- logger.info(f'after wesar reparameterization : {self.wesar_weights}')
-
- def multi_token_config_validation(self):
- if self.multi_token_heads is not None:
- assert isinstance(self.multi_token_heads, int) and self.multi_token_heads >= 1
+ logger.info(f' kwargs : {kwargs}')

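For reference, a small sketch of the backward-compatibility step kept in the constructor above: a legacy `rope_scaling` dict that still uses the old `type` key is mirrored into `rope_type` before `rope_config_validation` runs. This is a standalone illustration, not the repository's code:

```python
# Legacy-style rope_scaling as a user might still pass it.
rope_scaling = {"type": "linear", "factor": 2.0}

# BC shim mirrored from MotifConfig.__init__ above.
if rope_scaling is not None and "type" in rope_scaling:
    rope_scaling["rope_type"] = rope_scaling["type"]

print(rope_scaling)
# {'type': 'linear', 'factor': 2.0, 'rope_type': 'linear'}
```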
generation_config.json CHANGED
@@ -6,5 +6,5 @@
  219405
  ],
  "transformers_version": "4.51.3",
- "use_cache": false
+ "use_cache": true
  }
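With `use_cache` now enabled in the generation config, `generate` reuses the KV cache by default; it can still be overridden per call. A short sketch, reusing the `model` and `input_ids` from the README example above:

```python
# KV cache on (the new default from generation_config.json) ...
output = model.generate(input_ids, max_new_tokens=128, use_cache=True)

# ... or explicitly disabled for a single call, e.g. to compare memory use.
output_no_cache = model.generate(input_ids, max_new_tokens=128, use_cache=False)
```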
modeling_motif.py CHANGED
@@ -1,178 +1,32 @@
1
  import math
 
2
  from typing import List, Optional, Tuple, Union
3
 
4
  import torch
 
5
  import torch.utils.checkpoint
6
  from torch import nn
7
  from torch.nn import CrossEntropyLoss
 
 
8
  from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
9
  from transformers.generation import GenerationMixin
10
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
11
  from transformers.modeling_flash_attention_utils import _flash_attention_forward
12
- from transformers.modeling_outputs import (
13
- CausalLMOutputWithPast,
14
- ModelOutput,
15
- )
16
  from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
17
  from transformers.modeling_utils import PreTrainedModel
18
  from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
19
- from transformers.utils import (
20
- add_start_docstrings,
21
- add_start_docstrings_to_model_forward,
22
- is_flash_attn_greater_or_equal_2_10,
23
- is_flash_attn_2_available,
24
- logging,
25
- replace_return_docstrings,
26
- )
27
- from .configuration_motif import MotifConfig
28
- from dataclasses import dataclass
29
-
30
- import torch.nn.functional as F
31
- import time
32
-
33
- logger = logging.get_logger(__name__)
34
-
35
- if is_flash_attn_2_available():
36
- from transformers.modeling_flash_attention_utils import _flash_attention_forward
37
-
38
- try:
39
- moreh_ops = torch.ops.moreh
40
- MorehRMSNorm = moreh_ops.T5LayerNorm
41
- ScaledDotProductAttention = moreh_ops.scaled_dot_product_attention
42
- MorehFlashAttention = moreh_ops.flash_attention
43
- logger.warning_once("Using moreh ops")
44
- except AttributeError:
45
- MorehRMSNorm = None
46
- ScaledDotProductAttention = None
47
- MorehFlashAttention = None
48
- logger.warning_once("Failed to import moreh ops")
49
-
50
-
51
-
52
- # DEBUG = False
53
- # logger.info(f"DEBUG: {DEBUG} : will log timing")
54
- # def log_timing(obj):
55
- # """Decorator to log timing of function or class execution"""
56
- # if isinstance(obj, type):
57
- # # If decorating a class
58
- # class TimedClass(obj):
59
- # def __getattribute__(self, name):
60
- # attr = super().__getattribute__(name)
61
- # if callable(attr) and not name.startswith('__'):
62
- # def timed_method(*args, **kwargs):
63
- # if not DEBUG:
64
- # return attr(*args, **kwargs)
65
- # if name != "forward":
66
- # return attr(*args, **kwargs)
67
-
68
- # start_time = time.time()
69
- # logger.info(f"Entering {obj.__name__}.{name}")
70
- # result = attr(*args, **kwargs)
71
- # end_time = time.time()
72
- # logger.info(f"Exiting {obj.__name__}.{name}, took {end_time - start_time:.4f} seconds")
73
- # return result
74
- # return timed_method
75
- # return attr
76
- # return TimedClass
77
- # else:
78
- # # If decorating a function
79
- # def wrapper(*args, **kwargs):
80
- # if not DEBUG:
81
- # return obj(*args, **kwargs)
82
-
83
- # start_time = time.time()
84
- # logger.info(f"Entering {obj.__name__}")
85
- # result = obj(*args, **kwargs)
86
- # end_time = time.time()
87
- # logger.info(f"Exiting {obj.__name__}, took {end_time - start_time:.4f} seconds")
88
- # return result
89
- # return wrapper
90
-
91
-
92
-
93
- #_CHECKPOINT_FOR_DOC = "moreh/Motif-102B"
94
- _CONFIG_FOR_DOC = "MotifConfig"
95
-
96
- #from .moreh_moe import MorehMoeMLP, MorehMoeFusedMLP
97
-
98
- import torch
99
- from transformers.activations import ACT2CLS as _ACT2CLS
100
- from transformers.activations import ClassInstantier
101
- moreh_ops = torch.ops.moreh
102
-
103
- from typing import Callable, Dict, List, Tuple
104
-
105
- import torch
106
-
107
-
108
- # @log_timing
109
- def multi_head_forward_backward(shared_activation: torch.Tensor,
110
- head_fns: List[Callable[[torch.Tensor], Dict[str, torch.Tensor]]],
111
- return_keys=("loss", ),
112
- return_only_first_head=True) -> Tuple[torch.Tensor, ...]:
113
- """
114
- The forward-backward pattern introduced in the paper https://arxiv.org/abs/2404.19737
115
- to reduce memory overhead due to activations from multiple heads.
116
-
117
- Args:
118
- - shared_activation: the shared activation across all heads
119
- - head_fns: the head-wise forward computations that start from `shared_activation`.
120
- it should output a dictionary of tensors with keys matching `return_keys`
121
- - return_keys: the keys to return in order
122
- - return_only_first_head: whether to return only the values from the first head
123
-
124
- Returns:
125
- - a tuple of return tensors
126
-
127
- Side effect:
128
- - (only when `torch.is_grad_enabled()`)
129
- the gradients accumulated as if `sum(head_fn(shared_activation)["loss"] for head_fn in head_fns).backward()` had been called
130
- """
131
- if not return_only_first_head:
132
- raise NotImplementedError
133
-
134
- return_key_set = set(return_keys)
135
- if "loss" not in return_key_set:
136
- raise Exception("'loss' is a required return key.")
137
-
138
- detached_shared_activation = shared_activation.detach()
139
- detached_shared_activation.requires_grad = True
140
- return_values = {key: None for key in return_keys}
141
- for head_idx, head_fn in enumerate(head_fns):
142
- if head_idx > 0 and not torch.is_grad_enabled():
143
- continue
144
-
145
- # forward pass for the head
146
- headwise_outputs = head_fn(detached_shared_activation)
147
- if set(headwise_outputs.keys()) != return_key_set:
148
- raise Exception(f"Headwise output keys {headwise_outputs.keys()} do not match return keys {return_keys}.")
149
-
150
- # backward pass for the head
151
- # effect 1: the parameters of the head
152
- # effect 2: gradient accumulated in `detached_shared_activation.grad`
153
- if torch.is_grad_enabled():
154
- headwise_loss = headwise_outputs["loss"]
155
- headwise_loss.backward(
156
- ) # NOTE: You do not need to retain graph since no graph is shared across backward passes
157
-
158
- if head_idx == 0:
159
- for key in return_keys:
160
- return_values[key] = headwise_outputs[key]
161
-
162
- assert all(value is not None for value in return_values.values())
163
-
164
- # backward pass for the shared part
165
- if torch.is_grad_enabled():
166
- shared_activation.backward(detached_shared_activation.grad)
167
 
168
- return tuple(return_values[key] for key in return_keys)
169
 
170
 
171
  class PolyNorm(torch.nn.Module):
172
- """
173
  A trainable activation function introduced in https://arxiv.org/html/2411.03884v1.
174
- The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md,
175
- with the change `* torch.rsqrt` => `/ torch.sqrt` for potential MAF incompatibility.
176
  """
177
 
178
  def __init__(self, eps=1e-6):
@@ -189,29 +43,16 @@ class PolyNorm(torch.nn.Module):
189
  x ** 2) + self.weight[2] * self._norm(x) + self.bias
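For reference, a self-contained sketch of the PolyNorm activation described above (a weighted sum of RMS-normalized powers of the input). This is an illustration based on the docstring and the partial forward shown in this hunk, not the repository's exact code:

```python
import torch


class PolyNormSketch(torch.nn.Module):
    """Trainable activation from arXiv:2411.03884: w0*norm(x^3) + w1*norm(x^2) + w2*norm(x) + b."""

    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(3) / 3)
        self.bias = torch.nn.Parameter(torch.zeros(1))
        self.eps = eps

    def _norm(self, x):
        # RMS-style normalization over the last dimension (the `/ torch.sqrt` form noted above).
        return x / torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)

    def forward(self, x):
        return (self.weight[0] * self._norm(x ** 3)
                + self.weight[1] * self._norm(x ** 2)
                + self.weight[2] * self._norm(x)
                + self.bias)
```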
190
 
191
 
192
- class PolyNorm_Test(torch.nn.Module):
193
- """
194
- A trainable activation function introduced in https://arxiv.org/html/2411.03884v1.
195
- The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md,
196
- with the change `* torch.rsqrt` => `/ torch.sqrt` for potential MAF incompatibility.
197
- """
198
-
199
- def __init__(self, eps=1e-6):
200
- super(PolyNorm_Test, self).__init__()
201
- self.weight = torch.nn.Parameter(torch.ones(3) / 3)
202
- self.bias = torch.nn.Parameter(torch.zeros(1))
203
- self.eps = eps
204
-
205
- def forward(self, x):
206
-
207
- #return torch.nn.SiLU(x)
208
- return moreh_ops.poly_norm(x, self.weight, self.bias)
209
-
210
-
211
- CUSTOM_ACT2CLS = {"poly_norm": PolyNorm_Test, "poly_norm_test": PolyNorm_Test}
212
  ACT2CLS = {**_ACT2CLS, **CUSTOM_ACT2CLS}
213
  ACT2FN = ClassInstantier(ACT2CLS)
214
 
 
 
 
 
 
 
215
 
216
 
217
  class MotifRMSNorm(nn.Module):
@@ -235,7 +76,7 @@ class MotifRMSNorm(nn.Module):
235
  return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
236
 
237
 
238
- ALL_LAYERNORM_LAYERS.append(MotifRMSNorm if MorehRMSNorm is None else MorehRMSNorm)
239
 
240
 
241
  class MotifRotaryEmbeddingWithCache(nn.Module):
@@ -267,7 +108,6 @@ class MotifRotaryEmbeddingWithCache(nn.Module):
267
  inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
268
  self.register_buffer("inv_freq", inv_freq, persistent=False)
269
 
270
- # Build here to make `torch.jit.trace` work.
271
  self._set_cos_sin_cache(seq_len=max_position_embeddings,
272
  device=self.inv_freq.device,
273
  dtype=torch.get_default_dtype())
@@ -288,12 +128,11 @@ class MotifRotaryEmbeddingWithCache(nn.Module):
288
  self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
289
 
290
  return (
291
- self.cos_cached[None, :seq_len].to(dtype=x.dtype),
292
- self.sin_cached[None, :seq_len].to(dtype=x.dtype),
293
  )
294
 
295
 
296
- # @log_timing
297
  class MotifRotaryEmbedding(nn.Module):
298
 
299
  def __init__(
@@ -324,7 +163,6 @@ class MotifRotaryEmbedding(nn.Module):
324
  self.max_seq_len_cached = max_position_embeddings
325
  self.original_max_seq_len = max_position_embeddings
326
  else:
327
- # BC: "rope_type" was originally "type"
328
  if config.rope_scaling is not None:
329
  self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
330
  else:
@@ -386,10 +224,10 @@ class MotifRotaryEmbedding(nn.Module):
386
  def rotate_half(x):
387
  """
388
  Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
389
-
390
  Args:
391
  x (torch.Tensor): The input tensor.
392
-
393
  Returns:
394
  torch.Tensor: A tensor where the latter half of the dimensions are negated
395
  and moved before the first half.
@@ -401,8 +239,7 @@ def rotate_half(x):
401
  return rotated_tensor
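A minimal equivalent of the rotation described in the docstring above, written with `torch.cat` for clarity; the repository's version achieves the same result with `torch.roll` and in-place negation:

```python
import torch


def rotate_half_sketch(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half, negate the second half and move it in front:
    # (x1, x2) -> (-x2, x1), which is the form rotary position embeddings need.
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat((-x2, x1), dim=-1)
```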
402
 
403
 
404
- # @log_timing
405
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=True):
406
  """
407
  Applies rotary position embeddings to the input tensors.
408
 
@@ -411,438 +248,47 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fus
411
  k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
412
  cos (torch.Tensor): Cosine values for rotary embedding.
413
  sin (torch.Tensor): Sine values for rotary embedding.
414
- unsqueeze_dim (int, optional): Dimension along which `cos` and `sin` are unsqueezed.
415
  Defaults to 1.
416
- fused_rope (bool, optional): If True, applies fused rotary embeddings using
417
- `moreh_ops.apply_rotary_emb`. If False, computes rotary embeddings manually.
418
- Defaults to False.
419
 
420
  Returns:
421
  Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
422
  """
423
  '''
424
- # (B, NH, S, D_KV) -> (B, S, NH, D_KV)
425
  cos = cos.unsqueeze(unsqueeze_dim)
426
  sin = sin.unsqueeze(unsqueeze_dim)
427
  q_embed = (q * cos) + (rotate_half(q) * sin)
428
  k_embed = (k * cos) + (rotate_half(k) * sin)
429
  '''
430
- #cos = cos[position_ids]
431
- #sin = sin[position_ids]
432
-
433
- #cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim]
434
- #sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim]
435
-
436
- q = q.transpose(1, 2)
437
- k = k.transpose(1, 2)
438
-
439
- # Expand 'batch' dim
440
- cos = cos.expand(q.shape[0], *cos.shape[1:])
441
- sin = sin.expand(q.shape[0], *sin.shape[1:])
442
-
443
- q_embed = moreh_ops.apply_rotary_emb(q, cos, sin, opcode=1)
444
- k_embed = moreh_ops.apply_rotary_emb(k, cos, sin, opcode=1)
445
 
446
- # (B, S, NH, D_KV) -> (B, NH, S, D_KV)
447
- q_embed = q_embed.transpose(1, 2)
448
- k_embed = k_embed.transpose(1, 2)
449
 
450
- return q_embed, k_embed
451
-
452
-
453
- # @log_timing
454
  class MotifMLP(nn.Module):
455
 
456
  def __init__(self, config):
457
  super().__init__()
458
  self.hidden_size = config.hidden_size
459
  self.intermediate_size = config.intermediate_size
460
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
461
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
462
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
463
  self.act_fn = ACT2FN[config.hidden_act]
464
 
465
- if config.wesar_weights:
466
- self.gate_up_proj_alpha = nn.Parameter(torch.tensor(1) *config.gate_up_proj_alpha)
467
- self.down_proj_alpha = nn.Parameter(torch.tensor(1) * config.down_proj_alpha)
468
- else:
469
- self.gate_up_proj_alpha=1
470
- self.down_proj_alpha=1
471
- if config.muP:
472
- self.down_proj.__do_scale_tager__ = True
473
- self.gate_proj.__do_scale_tager_mu_dim_model__ = True
474
- self.up_proj.__do_scale_tager_mu_dim_model__ = True
475
- self.down_proj.__do_scale_tager_mu_ffn__ = True
476
-
477
-
478
  def forward(self, hidden_state):
479
- hidden_state = hidden_state*self.gate_up_proj_alpha
480
- #hidden_state = self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))*
481
- return self.down_proj_alpha*self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
482
-
483
-
484
- class MorehMoeFusedMLP(nn.Module):
485
- def __init__(self,
486
- ffn_dim,
487
- hidden_dim,
488
- hidden_act_moe,
489
- num_experts,
490
- num_groups=1,
491
- device=None,
492
- continual_training=False):
493
- super().__init__()
494
- self.ffn_dim = ffn_dim
495
- self.hidden_dim = hidden_dim
496
- self.hidden_act_moe = hidden_act_moe
497
-
498
- self.num_experts = num_experts
499
- self.num_groups = num_groups
500
-
501
- assert self.num_experts % self.num_groups == 0
502
- self.num_experts_per_group = self.num_experts // self.num_groups
503
-
504
- ## bsz, seq, group size, 2*ffn_size
505
-
506
- moreh_ops = torch.ops.moreh
507
- self.w13 = nn.ModuleList([
508
- moreh_ops.MoeFanInLinear(self.hidden_dim,
509
- self.ffn_dim * 2,
510
- bias=False,
511
- num_experts=self.num_experts_per_group,
512
- device=device)
513
- for _ in range(self.num_groups)
514
- ])
515
-
516
- self.w2 = nn.ModuleList([
517
- moreh_ops.MoeFanOutLinear(self.ffn_dim,
518
- self.hidden_dim,
519
- bias=False,
520
- num_experts=self.num_experts_per_group,
521
- device=device)
522
- for _ in range(self.num_groups)
523
- ])
524
-
525
- ## use silu?
526
- self.act_fn = ACT2FN[self.hidden_act_moe]
527
-
528
- if continual_training:
529
- logger.info('two optipons 1. zero init all weights, 2. add scaling param to moe output.')
530
- self._zero_init()
531
-
532
- def _zero_init(self):
533
- for module in self.w2:
534
- for n,param in module.named_parameters():
535
- logger.info(f'{n} {param.shape}')
536
- param.data.zero_()
537
-
538
-
539
- def forward(self, hidden_states, selected_experts, routing_weights):
540
- w13_final_output = None
541
- for group_idx in range(self.num_groups):
542
- w13_output_in_group = self._get_w13_output(hidden_states,
543
- selected_experts,
544
- group_idx)
545
- if w13_final_output is None:
546
- w13_final_output = w13_output_in_group
547
- else:
548
- w13_final_output += w13_output_in_group
549
-
550
- current_hidden_states = self.act_fn(
551
- w13_final_output[:, :, :, :self.ffn_dim]
552
- ) * w13_final_output[:, :, :, self.ffn_dim:]
553
-
554
- final_hidden_states = None
555
- for group_idx in range(self.num_groups):
556
- w2_output_in_group = self._get_w2_output(current_hidden_states,
557
- selected_experts,
558
- routing_weights, group_idx)
559
- if final_hidden_states is None:
560
- final_hidden_states = w2_output_in_group
561
- else:
562
- final_hidden_states += w2_output_in_group
563
- return final_hidden_states
564
-
565
- def _get_w13_output(self, hidden_states, selected_experts, group_idx):
566
- selected_experts_in_group = selected_experts - (
567
- group_idx * self.num_experts_per_group)
568
-
569
- w13_output = self.w13[group_idx](hidden_states,
570
- selected_experts_in_group)
571
- return w13_output
572
-
573
- def _get_w2_output(self, hidden_states, selected_experts, routing_weights,
574
- group_idx):
575
- selected_experts_in_group = selected_experts - (
576
- group_idx * self.num_experts_per_group)
577
- output = self.w2[group_idx](hidden_states, selected_experts_in_group,
578
- routing_weights)
579
- return output
580
-
581
-
582
- class MoEGate(nn.Module):
583
-
584
- def __init__(self, config):
585
- super().__init__()
586
- self.config = config
587
- self.top_k = config.num_experts_per_tok
588
- self.n_routed_experts = config.n_routed_experts
589
- self.routed_scaling_factor = config.routed_scaling_factor
590
- self.scoring_func = config.scoring_func
591
- self.seq_aux = config.seq_aux
592
- self.topk_method = config.topk_method
593
- self.n_group = config.n_group
594
- self.topk_group = config.topk_group
595
-
596
- # topk selection algorithm
597
- self.norm_topk_prob = config.norm_topk_prob
598
- self.gating_dim = config.hidden_size
599
- self.weight = nn.Parameter(
600
- torch.empty((self.n_routed_experts, self.gating_dim)))
601
- if self.topk_method == "noaux_tc":
602
- self.e_score_correction_bias = nn.Parameter(
603
- torch.empty((self.n_routed_experts)))
604
- self.reset_parameters()
605
-
606
- def reset_parameters(self) -> None:
607
- import torch.nn.init as init
608
-
609
- init.kaiming_uniform_(self.weight, a=math.sqrt(5))
610
-
611
- def forward(self, hidden_states):
612
- bsz, seq_len, h = hidden_states.shape
613
- ### compute gating score
614
- hidden_states = hidden_states.view(-1, h)
615
- logits = F.linear(hidden_states.type(torch.float32),
616
- self.weight.type(torch.float32), None)
617
- if self.scoring_func == "sigmoid":
618
- scores = logits.sigmoid()
619
- else:
620
- raise NotImplementedError(
621
- f"insupportable scoring function for MoE gating: {self.scoring_func}"
622
- )
623
-
624
- ### select top-k experts
625
- if self.topk_method == "greedy":
626
- topk_weight, topk_idx = torch.topk(scores,
627
- k=self.top_k,
628
- dim=-1,
629
- sorted=False)
630
- elif self.topk_method == "group_limited_greedy":
631
- group_scores = (scores.view(bsz * seq_len, self.n_group,
632
- -1).max(dim=-1).values) # [n, n_group]
633
- group_idx = torch.topk(group_scores,
634
- k=self.topk_group,
635
- dim=-1,
636
- sorted=False)[1] # [n, top_k_group]
637
- group_mask = torch.zeros_like(group_scores) # [n, n_group]
638
- group_mask.scatter_(1, group_idx, 1) # [n, n_group]
639
- score_mask = (group_mask.unsqueeze(-1).expand(
640
- bsz * seq_len, self.n_group,
641
- self.n_routed_experts // self.n_group).reshape(
642
- bsz * seq_len, -1)) # [n, e]
643
- tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
644
- topk_weight, topk_idx = torch.topk(tmp_scores,
645
- k=self.top_k,
646
- dim=-1,
647
- sorted=False)
648
- elif self.topk_method == "noaux_tc":
649
- ### will be used. ###
650
- scores_for_choice = scores.view(
651
- bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
652
- group_scores = (scores_for_choice.view(
653
- bsz * seq_len, self.n_group,
654
- -1).topk(2, dim=-1)[0].sum(dim=-1)) # [n, n_group]
655
- group_idx = torch.topk(group_scores,
656
- k=self.topk_group,
657
- dim=-1,
658
- sorted=False)[1] # [n, top_k_group]
659
- group_mask = torch.zeros_like(group_scores) # [n, n_group]
660
- group_mask.scatter_(1, group_idx, 1) # [n, n_group]
661
- score_mask = (group_mask.unsqueeze(-1).expand(
662
- bsz * seq_len, self.n_group,
663
- self.n_routed_experts // self.n_group).reshape(
664
- bsz * seq_len, -1)) # [n, e]
665
- tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(),
666
- 0.0) # [n, e]
667
- _, topk_idx = torch.topk(tmp_scores,
668
- k=self.top_k,
669
- dim=-1,
670
- sorted=False)
671
- topk_weight = scores.gather(1, topk_idx)
672
- else:
673
- raise NotImplementedError(
674
- f"insupportable TopK function for MoE gating: {self.topk_method}"
675
- )
676
-
677
- ### norm gate to sum 1
678
- if self.top_k > 1 and self.norm_topk_prob:
679
- denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
680
- topk_weight = topk_weight / denominator
681
- topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
682
-
683
- return topk_idx, topk_weight
684
-
685
-
686
- class MotifMoE(nn.Module):
687
- """
688
- A mixed expert module containing shared experts.
689
- """
690
- def __init__(self, config):
691
- super().__init__()
692
- self.config = config
693
- self.num_experts_per_tok = config.num_experts_per_tok
694
- self.use_moreh_moe = config.use_moreh_moe
695
- self.use_fused_mlp = config.use_fused_mlp
696
-
697
- if hasattr(config, "ep_size") and config.ep_size > 1:
698
- assert config.ep_size == dist.get_world_size()
699
- assert not config.use_moreh_moe
700
- self.ep_size = config.ep_size
701
- self.experts_per_rank = config.n_routed_experts // config.ep_size
702
- self.ep_rank = dist.get_rank()
703
- self.experts = nn.ModuleList([
704
- (DeepseekV3MLP(config,
705
- intermediate_size=config.moe_intermediate_size)
706
- if i >= self.ep_rank * self.experts_per_rank and i <
707
- (self.ep_rank + 1) * self.experts_per_rank else None)
708
- for i in range(config.n_routed_experts)
709
- ])
710
- else:
711
- self.ep_size = 1
712
- self.experts_per_rank = config.n_routed_experts
713
- self.ep_rank = 0
714
- if self.use_moreh_moe:
715
- if not self.use_fused_mlp:
716
- self.experts = MorehMoeMLP(
717
- ffn_dim=config.moe_intermediate_size,
718
- hidden_dim=config.hidden_size,
719
- hidden_act_moe=config.hidden_act_moe,
720
- num_experts=config.n_routed_experts,
721
- device=None)
722
- else:
723
- ## group expert.
724
- self.experts = MorehMoeFusedMLP(
725
- ffn_dim=config.moe_intermediate_size,
726
- hidden_dim=config.hidden_size,
727
- hidden_act_moe=config.hidden_act_moe,
728
- num_experts=config.n_routed_experts,
729
- num_groups=config.n_group,
730
- device=None,
731
- continual_training=config.continual_training,
732
- )
733
- else:
734
- self.experts = nn.ModuleList([
735
- DeepseekV3MLP(
736
- config, intermediate_size=config.moe_intermediate_size)
737
- for i in range(config.n_routed_experts)
738
- ])
739
-
740
- self.gate = MoEGate(config)
741
-
742
- def forward(self, hidden_states):
743
- identity = hidden_states
744
- orig_shape = hidden_states.shape
745
- topk_idx, topk_weight = self.gate(hidden_states)
746
- if self.use_moreh_moe:
747
- y = self.experts(hidden_states, topk_idx.view(*orig_shape[:-1], -1),
748
- topk_weight.view(*orig_shape[:-1], -1))
749
- y = y.type(hidden_states.dtype)
750
- else:
751
- hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
752
- flat_topk_idx = topk_idx.view(-1)
753
- if self.training:
754
- hidden_states = hidden_states.repeat_interleave(
755
- self.num_experts_per_tok, dim=0)
756
- y = torch.empty_like(hidden_states)
757
- for i, expert in enumerate(self.experts):
758
- y[flat_topk_idx == i] = expert(
759
- hidden_states[flat_topk_idx == i])
760
- y = (y.view(*topk_weight.shape, -1) *
761
- topk_weight.unsqueeze(-1)).sum(dim=1)
762
- y = y.type(hidden_states.dtype)
763
- y = y.view(*orig_shape)
764
- # y = AddAuxiliaryLoss.apply(y, aux_loss)
765
- else:
766
- y = self.moe_infer(hidden_states, topk_idx,
767
- topk_weight).view(*orig_shape)
768
- return y, identity
769
-
770
- @torch.no_grad()
771
- def moe_infer(self, x, topk_ids, topk_weight):
772
- cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
773
- cnts.scatter_(1, topk_ids, 1)
774
- tokens_per_expert = cnts.sum(dim=0)
775
- idxs = topk_ids.view(-1).argsort()
776
- sorted_tokens = x[idxs // topk_ids.shape[1]]
777
- sorted_tokens_shape = sorted_tokens.shape
778
- if self.ep_size > 1:
779
- tokens_per_ep_rank = tokens_per_expert.view(self.ep_size,
780
- -1).sum(dim=1)
781
- tokens_per_expert_group = tokens_per_expert.new_empty(
782
- tokens_per_expert.shape[0])
783
- dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
784
- output_splits = (tokens_per_expert_group.view(
785
- self.ep_size, -1).sum(1).cpu().numpy().tolist())
786
- gathered_tokens = sorted_tokens.new_empty(
787
- tokens_per_expert_group.sum(dim=0).cpu().item(),
788
- sorted_tokens.shape[1])
789
- input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
790
- dist.all_to_all(
791
- list(gathered_tokens.split(output_splits)),
792
- list(sorted_tokens.split(input_split_sizes)),
793
- )
794
- tokens_per_expert_post_gather = tokens_per_expert_group.view(
795
- self.ep_size, self.experts_per_rank).sum(dim=0)
796
- gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],),
797
- dtype=np.int32)
798
- s = 0
799
- for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
800
- gatherd_idxs[s:s + k] = i % self.experts_per_rank
801
- s += k
802
- gatherd_idxs = gatherd_idxs.argsort()
803
- sorted_tokens = gathered_tokens[gatherd_idxs]
804
- tokens_per_expert = tokens_per_expert_post_gather
805
- tokens_per_expert = tokens_per_expert.cpu().numpy()
806
-
807
- outputs = []
808
- start_idx = 0
809
- for i, num_tokens in enumerate(tokens_per_expert):
810
- end_idx = start_idx + num_tokens
811
- if num_tokens == 0:
812
- continue
813
- expert = self.experts[i + self.ep_rank * self.experts_per_rank]
814
- tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
815
- expert_out = expert(tokens_for_this_expert)
816
- outputs.append(expert_out)
817
- start_idx = end_idx
818
-
819
- outs = torch.cat(outputs,
820
- dim=0) if len(outputs) else sorted_tokens.new_empty(0)
821
- if self.ep_size > 1:
822
- new_x = torch.empty_like(outs)
823
- new_x[gatherd_idxs] = outs
824
- gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
825
- dist.all_to_all(
826
- list(gathered_tokens.split(input_split_sizes)),
827
- list(new_x.split(output_splits)),
828
- )
829
- outs = gathered_tokens
830
-
831
- new_x = torch.empty_like(outs)
832
- new_x[idxs] = outs
833
- final_out = (new_x.view(
834
- *topk_ids.shape, -1).type(topk_weight.dtype).mul_(
835
- topk_weight.unsqueeze(dim=-1)).sum(dim=1).type(new_x.dtype))
836
- return final_out
837
 
838
 
839
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
840
-
841
-
842
  """
843
  This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
844
  num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
845
-
846
  batch, num_key_value_heads, slen, head_dim = hidden_states.shape
847
  if n_rep == 1:
848
  return hidden_states
@@ -851,32 +297,31 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
851
  """
852
 
853
  return torch.repeat_interleave(hidden_states, dim=1, repeats=n_rep)
 
854
 
855
-
856
- # @log_timing
857
  class MotifAttention(nn.Module):
858
  """
859
  Differential Attention (DiffAttention) module.
860
 
861
- Implements the Differential Attention from
862
  "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
863
 
864
  Overview
865
  Standard transformers often over-allocate attention to irrelevant context.
866
- DiffAttention addresses this by computing attention as the difference between
867
- two separate softmax attention maps, effectively canceling noise and promoting
868
  sparse, structured attention patterns.
869
 
870
  Reference Implementation
871
  https://github.com/microsoft/unilm/tree/master/Diff-Transformer
872
 
873
  Args
874
- The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
875
  λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
876
  - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
877
  - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
878
  - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
879
-
880
  """
881
 
882
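For reference, a compact sketch of the λ re-parameterization and the differential combination spelled out in the docstring above. Illustration only; the shapes follow the description, with `attn_weights` holding the two softmax maps stacked on dim 2:

```python
import math

import torch


def differential_combine(attn_weights, lambda_q1, lambda_k1, lambda_q2, lambda_k2, layer_idx):
    # lambda_init = 0.8 - 0.6 * exp(-0.3 * (layer_index - 1)), as stated above.
    lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
    # lambda = exp(lambda_q1 . lambda_k1) - exp(lambda_q2 . lambda_k2) + lambda_init
    lambda_1 = torch.exp(torch.sum(lambda_q1 * lambda_k1, dim=-1).float()).type_as(attn_weights)
    lambda_2 = torch.exp(torch.sum(lambda_q2 * lambda_k2, dim=-1).float()).type_as(attn_weights)
    lambda_full = lambda_1 - lambda_2 + lambda_init
    # attn_weights: (bsz, num_heads, 2, q_len, kv_len); subtract the second map from the first.
    return attn_weights[:, :, 0] - lambda_full * attn_weights[:, :, 1]
```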
  def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
@@ -899,11 +344,7 @@ class MotifAttention(nn.Module):
899
  self.rope_theta = config.rope_theta
900
  self.is_causal = True
901
  self.attention_dropout = config.attention_dropout
902
- try:
903
- self.batch_num = config.batch_num
904
- logger.info(f'self.batcn_num : {self.batch_num}')
905
- except:
906
- self.batch_num = None
907
  if (self.head_dim * self.num_heads) != self.hidden_size:
908
  raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
909
  f" and `num_heads`: {self.num_heads}).")
@@ -912,61 +353,22 @@ class MotifAttention(nn.Module):
912
  self.num_key_value_heads //= 2
913
  self.n_rep = self.num_heads // self.num_key_value_heads
914
 
915
- ##mix attn
916
-
917
- self.mix_attn = config.mix_attn
918
-
919
- if self.mix_attn:
920
-
921
- self.cq, self.ck = 6, 11
922
- self.ch = 2
923
-
924
- self.key_query_conv = nn.Conv2d(
925
- in_channels=self.num_heads*2,
926
- out_channels=self.num_heads*2,
927
- kernel_size=(self.cq, self.ck),
928
- padding="same",
929
- groups=self.num_heads*2
930
- )
931
-
932
- self.head_conv = nn.Conv1d(
933
- in_channels=self.num_heads,
934
- out_channels=self.num_heads,
935
- kernel_size=1,
936
- padding=0,
937
- groups=self.num_heads // self.ch
938
- )
939
-
940
- self.group_norm = nn.GroupNorm(num_groups=self.num_heads, num_channels=self.num_heads)
941
-
942
-
943
-
944
- # re-init projections
945
  self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
946
  self.k_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
947
  self.v_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
948
  self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
949
 
950
- # init lambdas
951
  for name in ["lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"]:
952
  setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
953
  getattr(self, name).data.normal_(mean=0.0, std=0.1)
954
 
955
- # Uses same norm as motif norm, without elementwise_affine option
956
  self.subln = MotifRMSNorm(2 * self.head_dim, eps=1e-5)
957
  self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
958
 
959
- self.rotary_emb = MotifRotaryEmbedding(self.head_dim,
960
  max_position_embeddings=self.max_position_embeddings,
961
  base=self.rope_theta)
962
 
963
- for param in ["q_proj_alpha", "k_proj_alpha", "v_proj_alpha", "o_proj_alpha"]:
964
- setattr(
965
- self, param,
966
- nn.Parameter(torch.tensor(getattr(config, param, 1.0), dtype=torch.float))
967
- if config.wesar_weights else 1.0)
968
-
969
-
970
  def forward(
971
  self,
972
  hidden_states: torch.Tensor,
@@ -976,15 +378,13 @@ class MotifAttention(nn.Module):
976
  output_attentions: bool = False,
977
  use_cache: bool = False,
978
  cache_position: Optional[torch.LongTensor] = None,
979
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
980
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
981
  bsz, q_len, _ = hidden_states.size()
982
 
983
- query_states = self.q_proj(hidden_states) * self.q_proj_alpha
984
- key_states = self.k_proj(hidden_states) * self.k_proj_alpha
985
- value_states = self.v_proj(hidden_states) * self.v_proj_alpha
986
-
987
- ## bsz, seq, n_heads, head_dim
988
 
989
  query_states = query_states.view(bsz, q_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
990
  key_states = key_states.view(bsz, q_len, 2 * self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -1006,17 +406,15 @@ class MotifAttention(nn.Module):
1006
  key_states,
1007
  cos,
1008
  sin,
1009
- fused_rope=self.config.fused_rope)
1010
 
1011
  if past_key_value is not None:
1012
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
1013
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1014
 
1015
- # repeat k/v heads if n_kv_heads < n_heads
1016
  key_states = repeat_kv(key_states, self.num_key_value_groups)
1017
  value_states = repeat_kv(value_states, self.num_key_value_groups)
1018
 
1019
- ## bsz, #haead, q_len, head_dim -> bsz, head, q_len, q_len
1020
  attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
1021
 
1022
  kv_seq_len = key_states.shape[-2]
@@ -1025,49 +423,31 @@ class MotifAttention(nn.Module):
1025
  attention_mask = torch.triu(
1026
  torch.full((q_len, kv_seq_len), float("-inf"), dtype=attn_weights.dtype, device=attn_weights.device),
1027
  1 + offset)
1028
- ##attn weights conv2d, softmax and add attention_mask
1029
- if self.mix_attn:
1030
- ## condition mask==0, value : 0
1031
- attn_weights = attn_weights.masked_fill( attention_mask == 0, 0)
1032
- attn_weights = self.key_query_conv(attn_weights)
1033
- attn_weights = attn_weights[:, :, :kv_seq_len, :kv_seq_len]
1034
-
1035
- ###add attn
1036
  attn_weights = attn_weights + attention_mask
1037
 
1038
- # upcast attention to fp32
1039
  attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
1040
  attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
1041
 
1042
- # differential transformer lambdas
1043
  lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(attn_weights)
1044
  lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(attn_weights)
1045
  lambda_full = lambda_1 - lambda_2 + self.lambda_init
1046
  attn_weights = attn_weights.view(bsz, self.num_heads, 2, q_len, -1)
1047
  attn_weights = attn_weights[:, :, 0] - lambda_full * attn_weights[:, :, 1]
1048
- ##head_conv
1049
- if self.mix_attn:
1050
- attn_weights = attn_weights.view(bsz, self.num_heads, -1).contiguous()
1051
- attn_weights = self.head_conv(attn_weights)
1052
- attn_weights = attn_weights.view(bsz, self.num_heads, q_len, -1).contiguous()
1053
 
1054
- ##shape : bsz, #heads, seq, head_dim
1055
  attn_output = torch.matmul(attn_weights, value_states)
1056
 
1057
-
1058
  attn_output = self.subln(attn_output)
1059
  attn_output = attn_output * (1 - self.lambda_init)
1060
 
1061
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim * 2):
1062
  raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
1063
  f" {attn_output.size()}")
1064
- if self.mix_attn:
1065
- attn_output = self.group_norm(attn_output)
1066
-
1067
  attn_output = attn_output.transpose(1, 2).contiguous()
1068
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
1069
 
1070
- attn_output = self.o_proj(attn_output) * self.o_proj_alpha
1071
 
1072
  if not output_attentions:
1073
  attn_weights = None
@@ -1075,7 +455,6 @@ class MotifAttention(nn.Module):
1075
  return attn_output, attn_weights, past_key_value
1076
 
1077
 
1078
- # @log_timing
1079
  class MotifFlashAttention2(MotifAttention):
1080
  """
1081
  Motif flash attention module, following Motif attention module. This module inherits from `MotifAttention`
@@ -1085,18 +464,16 @@ class MotifFlashAttention2(MotifAttention):
1085
  config.max_window_layers layers.
1086
  """
1087
 
1088
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
1089
  def __init__(self, *args, **kwargs):
1090
  super().__init__(*args, **kwargs)
1091
-
1092
- logger.info(f'flash attention True')
1093
-
1094
  # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
1095
  # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
1096
  # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
1097
 
1098
  self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
1099
 
 
 
1100
  def _reshape_heads(self, tensor, batch_size, seq_len):
1101
  """2-way head split tensor reshape"""
1102
  return tensor.reshape(batch_size, seq_len, self.num_heads, 2, self.head_dim)
@@ -1106,55 +483,27 @@ class MotifFlashAttention2(MotifAttention):
1106
  return tensor.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
1107
 
1108
  def _compute_attention(self, query_states, key_states, value_states, attention_mask, q_len, position_ids,
1109
- dropout_rate, sliding_window, is_moreh_attention, batch_num):
1110
  """Flash Attention 2 implements"""
1111
- if is_moreh_attention:
1112
- scale_factor = 1.0 / math.sqrt(self.head_dim)
1113
- # Copied from _flash_attention_forward
1114
- if not self._flash_attn_uses_top_left_mask:
1115
- causal = self.is_causal
1116
- else:
1117
- causal = self.is_causal and q_len != 1
1118
-
1119
- bsz = query_states.shape[0]
1120
-
1121
- if batch_num:
1122
- query_states = query_states.reshape(bsz*q_len,self.num_heads,self.head_dim)
1123
- key_states = key_states.reshape(bsz*q_len,self.num_heads,self.head_dim)
1124
- value_states = value_states.reshape(bsz*q_len,self.num_heads,self.head_dim)
1125
-
1126
- attn_out = moreh_ops.flash_attention_varlen_dp(query_states,
1127
- key_states,
1128
- value_states,
1129
- attention_mask,
1130
- attention_mask,
1131
- max_seqlen_q=q_len,
1132
- max_seqlen_kv=q_len,
1133
- dropout_p=dropout_rate,
1134
- softmax_scale=scale_factor,
1135
- is_causal=causal,
1136
- batch_num=batch_num)
1137
- attn_out = attn_out.reshape(bsz, q_len, self.num_heads, -1)
1138
- else:
1139
- return MorehFlashAttention(query_states,
1140
- key_states,
1141
- value_states,
1142
- padding_mask=attention_mask,
1143
- dropout_p=dropout_rate,
1144
- softmax_scale=scale_factor,
1145
- causal=causal)
1146
- return attn_out
1147
  else:
1148
- return _flash_attention_forward(query_states,
1149
- key_states,
1150
- value_states,
 
 
1151
  attention_mask,
1152
  q_len,
1153
  position_ids=position_ids,
1154
  dropout=dropout_rate,
1155
  sliding_window=sliding_window,
1156
- is_causal=self.is_causal,
 
1157
  use_top_left_mask=self._flash_attn_uses_top_left_mask)
 
1158
 
1159
  def forward(
1160
  self,
@@ -1169,9 +518,9 @@ class MotifFlashAttention2(MotifAttention):
1169
  ):
1170
  bsz, q_len, _ = hidden_states.size()
1171
 
1172
- query_states = self.q_proj(hidden_states) * self.q_proj_alpha
1173
- key_states = self.k_proj(hidden_states) * self.k_proj_alpha
1174
- value_states = self.v_proj(hidden_states) * self.v_proj_alpha
1175
 
1176
  query_states = query_states.view(bsz, q_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
1177
  key_states = key_states.view(bsz, q_len, 2 * self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -1192,13 +541,12 @@ class MotifFlashAttention2(MotifAttention):
1192
  key_states,
1193
  cos,
1194
  sin,
1195
- fused_rope=True)
1196
 
1197
  if past_key_value is not None:
1198
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
1199
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1200
 
1201
- # repeat k/v heads if n_kv_heads < n_heads
1202
  key_states = repeat_kv(key_states, self.num_key_value_groups)
1203
  value_states = repeat_kv(value_states, self.num_key_value_groups)
1204
  dropout_rate = 0.0 if not self.training else self.attention_dropout
@@ -1207,7 +555,7 @@ class MotifFlashAttention2(MotifAttention):
1207
  # therefore the input hidden states gets silently casted in float32. Hence, we need
1208
  # cast them back in float16 just to be sure everything works as expected.
1209
  input_dtype = query_states.dtype
1210
- if input_dtype == torch.float32 and MorehFlashAttention is None:
1211
  if torch.is_autocast_enabled():
1212
  target_dtype = torch.get_autocast_gpu_dtype()
1213
  # Handle the case where the model is quantized
@@ -1234,7 +582,7 @@ class MotifFlashAttention2(MotifAttention):
1234
  value_states = value_states.transpose(1, 2)
1235
 
1236
  if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None
1237
- and self.layer_idx >= self.config.max_window_layers and MorehFlashAttention is None):
1238
  sliding_window = self.config.sliding_window
1239
  else:
1240
  sliding_window = None
@@ -1254,12 +602,10 @@ class MotifFlashAttention2(MotifAttention):
1254
  k1, k2 = k1.contiguous(), k2.contiguous()
1255
  v1, v2 = v1.contiguous(), v2.contiguous()
1256
 
1257
- is_moreh_attention = MorehFlashAttention is not None
1258
-
1259
- attn11, attn12 = self._compute_attention(q1, k1, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window, is_moreh_attention, self.batch_num), \
1260
- self._compute_attention(q1, k1, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window, is_moreh_attention, self.batch_num)
1261
- attn21, attn22 = self._compute_attention(q2, k2, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window, is_moreh_attention, self.batch_num), \
1262
- self._compute_attention(q2, k2, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window, is_moreh_attention, self.batch_num)
1263
 
1264
  attn1, attn2 = torch.cat([attn11, attn12], dim=-1), torch.cat([attn21, attn22], dim=-1)
1265
 
@@ -1277,16 +623,15 @@ class MotifFlashAttention2(MotifAttention):
1277
  attn_output = attn_output * (1 - self.lambda_init)
1278
 
1279
  if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim * 2):
1280
- raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
1281
  f" {attn_output.size()}")
1282
 
1283
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
1284
- attn_output = self.o_proj(attn_output) * self.o_proj_alpha
1285
 
1286
  return attn_output, None, past_key_value
1287
 
1288
 
1289
- # @log_timing
1290
  class MotifSdpaAttention(MotifAttention):
1291
  """
1292
  Motif attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
@@ -1294,7 +639,6 @@ class MotifSdpaAttention(MotifAttention):
1294
  SDPA API.
1295
  """
1296
 
1297
- # Adapted from MotifAttention.forward
1298
  def forward(
1299
  self,
1300
  hidden_states: torch.Tensor,
@@ -1307,7 +651,6 @@ class MotifSdpaAttention(MotifAttention):
1307
  position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
1308
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1309
  if output_attentions:
1310
- # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
1311
  logger.warning_once(
1312
  "MotifModel is using MotifSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
1313
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
@@ -1343,8 +686,7 @@ class MotifSdpaAttention(MotifAttention):
1343
  query_states, key_states = apply_rotary_pos_emb(query_states,
1344
  key_states,
1345
  cos,
1346
- sin,
1347
- fused_rope=self.config.fused_rope)
1348
 
1349
  if past_key_value is not None:
1350
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
@@ -1380,45 +722,25 @@ class MotifSdpaAttention(MotifAttention):
1380
  MOTIF_ATTENTION_CLASSES = {
1381
  "eager": MotifAttention,
1382
  "flash_attention_2": MotifFlashAttention2,
1383
- "sdpa": MotifSdpaAttention,
1384
  }
1385
 
1386
 
1387
- # @log_timing
1388
  class MotifDecoderLayer(nn.Module):
1389
 
1390
- def __init__(self, config: MotifConfig, moe_layer: bool, layer_idx: int):
1391
  super().__init__()
1392
  self.hidden_size = config.hidden_size
1393
- if config.use_moreh_attention:
1394
- config._attn_implementation = "flash_attention_2"
1395
  if config.sliding_window and config._attn_implementation != "flash_attention_2":
1396
  logger.warning_once(
1397
  f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
1398
  "unexpected results may be encountered.")
1399
- if not config.mix_attn:
1400
- self.self_attn = MOTIF_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
1401
- else:
1402
- self.self_attn = MOTIF_ATTENTION_CLASSES["eager"](config, layer_idx)
1403
  self.mlp = MotifMLP(config)
1404
- ### moe
1405
- self.moe = None
1406
- if moe_layer:
1407
- self.moe = MotifMoE(config)
1408
-
1409
- RMSNorm = MorehRMSNorm if MorehRMSNorm is not None else MotifRMSNorm
1410
- self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1411
- self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1412
 
1413
- if config.wesar_weights and config.use_norm_alpha:
1414
- self.input_layernorm_alpha = nn.Parameter(torch.tensor(1).float())
1415
- else:
1416
- self.input_layernorm_alpha = 1
1417
-
1418
- if config.wesar_weights and config.use_norm_alpha :
1419
- self.post_attention_layernorm_alpha = nn.Parameter(torch.tensor(1).float())
1420
- else:
1421
- self.post_attention_layernorm_alpha = 1
1422
 
1423
  def forward(
1424
  self,
@@ -1456,7 +778,7 @@ class MotifDecoderLayer(nn.Module):
1456
 
1457
  residual = hidden_states
1458
 
1459
- hidden_states = self.input_layernorm(hidden_states) * self.input_layernorm_alpha
1460
 
1461
  # Self Attention
1462
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
@@ -1473,16 +795,8 @@ class MotifDecoderLayer(nn.Module):
1473
 
1474
  # Fully Connected
1475
  residual = hidden_states
1476
- hidden_states = self.post_attention_layernorm(hidden_states) * self.post_attention_layernorm_alpha
1477
-
1478
- if self.moe is not None:
1479
- hidden_states, identity = self.moe(hidden_states)
1480
- ## add output of shared expert and output of small moe experts.
1481
- ## hidden state must be zero tensor (for first forward)
1482
- hidden_states += self.mlp(identity)
1483
- else:
1484
- hidden_states = self.mlp(hidden_states)
1485
-
1486
  hidden_states = residual + hidden_states
1487
 
1488
  outputs = (hidden_states, )
@@ -1532,45 +846,24 @@ class MotifPreTrainedModel(PreTrainedModel):
1532
  def _init_weights(self, module):
1533
  module_std = self.config.initializer_range
1534
  if isinstance(module, nn.Linear):
1535
- if getattr(module, "__do_scale_tager__", False):
1536
- module_std = module_std / self.config.init_scale_o
1537
-
1538
- if getattr(module, "__do_scale_tager_mu_o__", False):
1539
- if self.config.dim_model_base_init is not None:
1540
- module_std = module_std / math.sqrt(2*(self.config.hidden_size / self.config.dim_model_base_init)*self.config.num_hidden_layers)
1541
- else:
1542
- module_std = module_std
1543
- elif getattr(module, "__do_scale_tager_mu_ffn__", False):
1544
- if self.config.dim_model_base_init is not None:
1545
- module_std = module_std = module_std / math.sqrt(2*(self.config.hidden_size / self.config.dim_model_base_init)*self.config.num_hidden_layers)
1546
- else:
1547
- module_std = module_std
1548
- elif getattr(module, "__do_scale_tager_mu_dim_model__", False):
1549
- if self.config.dim_model_base_init is not None:
1550
- module_std = module_std / math.sqrt(self.config.hidden_size / self.config.dim_model_base_init)
1551
- else:
1552
- module_std = module_std
1553
- elif getattr(module, "__do_scale_tager_mu_dim_base_model__", False):
1554
- module_std = module_std / math.sqrt(self.config.dim_model_base_lmh) ### lmhead.. 1
1555
- else:
1556
- module_std = module_std
1557
-
1558
- torch.nn.init.trunc_normal_(module.weight.data, mean=0.0, std=module_std, a=-3*module_std, b=3*module_std)
1559
  if module.bias is not None:
1560
  module.bias.data.zero_()
1561
 
1562
  elif isinstance(module, nn.Embedding):
1563
- torch.nn.init.trunc_normal_(module.weight.data, mean=0.0, std=module_std, a=-3*module_std, b=3*module_std)
 
1564
  if module.padding_idx is not None:
1565
  module.weight.data[module.padding_idx].zero_()
1566
 
1567
 
1568
  @dataclass
1569
  class MotifModelOutputWithPast(ModelOutput):
1570
- """
1571
- This augments `BaseModelOutputWithPast` in `transformers.modeling_outputs` with new optional keys: `causal_mask`, `position_embeddings`.
1572
  The optional keys are currently used in the following ways:
1573
- - pass information to the token-wise last attention layers in multi-token training
1574
  """
1575
  last_hidden_state: torch.FloatTensor = None
1576
  past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
@@ -1655,7 +948,6 @@ MOTIF_INPUTS_DOCSTRING = r"""
1655
  """
1656
 
1657
 
1658
- # @log_timing
1659
  @add_start_docstrings(
1660
  "The bare Motif Model outputting raw hidden-states without any specific head on top.",
1661
  MOTIF_START_DOCSTRING,
@@ -1672,23 +964,11 @@ class MotifModel(MotifPreTrainedModel):
1672
  super().__init__(config)
1673
  self.padding_idx = config.pad_token_id
1674
  self.vocab_size = config.vocab_size
1675
- self.multi_token_heads = config.multi_token_heads
1676
 
1677
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1678
- # NOTE: For multi-token models, the last decoder layers (one for each token index)
1679
- # are implemented as a part of `MotifModelForCausalLM` to enable a custom forward-backward procedure.
1680
-
1681
- num_hidden_layers = config.num_hidden_layers if self.multi_token_heads is None else config.num_hidden_layers - 1
1682
- if config.moe:
1683
- moe_layer = [True for i in range(num_hidden_layers)]
1684
- else:
1685
- moe_layer = [False for i in range(num_hidden_layers)]
1686
- logger.info(f'current_moe layer { moe_layer }')
1687
- self.layers = nn.ModuleList([MotifDecoderLayer(config = config, moe_layer= moe_layer[layer_idx],
1688
- layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)])
1689
- self._attn_implementation = config._attn_implementation
1690
- RMSNorm = MorehRMSNorm if MorehRMSNorm is not None else MotifRMSNorm
1691
- self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1692
  self.hidden_size = config.hidden_size
1693
  self.num_heads = config.num_attention_heads
1694
  self.head_dim = self.hidden_size // self.num_heads
@@ -1701,36 +981,6 @@ class MotifModel(MotifPreTrainedModel):
1701
  self.gradient_checkpointing = False
1702
  self.post_init()
1703
 
1704
- self.use_pipeline = config.use_pipeline
1705
- if self.use_pipeline:
1706
- logger.info('use reinforced pp..')
1707
- if config.num_stages==2:
1708
- ### moe version
1709
- if config.decontam_attn:
1710
- self.split_layers = [15]
1711
- else:
1712
- if num_hidden_layers == 32:
1713
- self.split_layers = [15] # 14: 15,17 # 13: 14:18
1714
- else:
1715
- self.split_layers = [6]
1716
- elif config.num_stages==3:
1717
- self.split_layers = [9,20] ## 11, 11, 10
1718
- elif config.num_stages==4:
1719
- self.split_layers = [7,15,23] #7,9,9,7
1720
- elif config.num_stages==16:
1721
- self.split_layers = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29]
1722
- logger.info(f' check the split layers (moe): {self.split_layers}')
1723
-
1724
- self.scale_emb = 1
1725
-
1726
- # Reparameterization <|_1_|>
1727
- if config.wesar_weights :
1728
- logger.info(f'config.wesar_weights {config.wesar_weights}')
1729
- self.norm_alpha = nn.Parameter(torch.tensor(1).float())
1730
- self.scale_emb = 10
1731
- else:
1732
- self.norm_alpha = 1
1733
-
1734
  def get_input_embeddings(self):
1735
  return self.embed_tokens
1736
 
@@ -1769,7 +1019,6 @@ class MotifModel(MotifPreTrainedModel):
1769
  "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
1770
  use_cache = False
1771
 
1772
- # kept for BC (non `Cache` `past_key_values` inputs)
1773
  return_legacy_cache = False
1774
  if use_cache and not isinstance(past_key_values, Cache):
1775
  return_legacy_cache = True
@@ -1783,26 +1032,23 @@ class MotifModel(MotifPreTrainedModel):
1783
  "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)")
1784
 
1785
  if inputs_embeds is None:
1786
- inputs_embeds = self.embed_tokens(input_ids) * self.scale_emb
1787
 
1788
  if cache_position is None:
1789
  past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1790
  cache_position = torch.arange(past_seen_tokens,
1791
  past_seen_tokens + inputs_embeds.shape[1],
1792
  device=inputs_embeds.device)
1793
- position_ids = None
1794
  if position_ids is None:
1795
  position_ids = cache_position.unsqueeze(0)
1796
-
1797
  causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_key_values,
1798
  output_attentions)
1799
 
1800
  hidden_states = inputs_embeds
1801
  bsz, q_len, _ = hidden_states.size()
1802
- # create position embeddings to be shared across the decoder layers
1803
  position_embeddings = self.rotary_emb(hidden_states, seq_len=q_len)
1804
 
1805
- # decoder layers
1806
  all_hidden_states = () if output_hidden_states else None
1807
  all_self_attns = () if output_attentions else None
1808
  next_decoder_cache = None
@@ -1837,20 +1083,14 @@ class MotifModel(MotifPreTrainedModel):
1837
 
1838
  hidden_states = layer_outputs[0]
1839
 
1840
-
1841
- if self.use_pipeline and idx in self.split_layers:
1842
- hidden_states = torch.moreh.pipeline_assign(hidden_states)
1843
-
1844
  if use_cache:
1845
  next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1846
 
1847
  if output_attentions:
1848
  all_self_attns += (layer_outputs[1], )
1849
 
1850
- # <|_2_|>
1851
- hidden_states = self.norm(hidden_states)* self.norm_alpha
1852
-
1853
- # add hidden states from the last decoder layer
1854
  if output_hidden_states:
1855
  all_hidden_states += (hidden_states, )
1856
 
@@ -1881,8 +1121,6 @@ class MotifModel(MotifPreTrainedModel):
1881
  output_attentions: bool,
1882
  ):
1883
  if self.config._attn_implementation == "flash_attention_2":
1884
- if MorehFlashAttention is not None:
1885
- return attention_mask
1886
  if attention_mask is not None and 0.0 in attention_mask:
1887
  return attention_mask
1888
  return None
@@ -1909,6 +1147,7 @@ class MotifModel(MotifPreTrainedModel):
1909
  dtype, device = input_tensor.dtype, input_tensor.device
1910
  min_dtype = torch.finfo(dtype).min
1911
  sequence_length = input_tensor.shape[1]
 
1912
  # SlidingWindowCache or StaticCache
1913
  if using_sliding_window_cache or using_static_cache:
1914
  target_length = past_key_values.get_max_cache_shape()
@@ -2003,7 +1242,6 @@ class MotifModel(MotifPreTrainedModel):
2003
  return causal_mask
2004
 
2005
 
2006
- # @log_timing
2007
  class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
2008
  _tied_weights_keys = ["lm_head.weight"]
2009
 
@@ -2011,35 +1249,14 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
2011
  super().__init__(config)
2012
  self.model = MotifModel(config)
2013
  self.vocab_size = config.vocab_size
2014
- self.multi_token_heads = config.multi_token_heads
2015
 
2016
- if self.multi_token_heads is None:
2017
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
2018
- else:
2019
- self.tokenwise_last_layers = nn.ModuleList(
2020
- [MotifDecoderLayer(config, config.num_hidden_layers - 1) for _ in range(self.multi_token_heads)])
2021
- self.tokenwise_lm_heads = nn.ModuleList(
2022
- [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(self.multi_token_heads)])
2023
- self.should_skip_separate_backward_pass = self.multi_token_heads is not None
2024
 
2025
  # Initialize weights and apply final processing
2026
  self.post_init()
2027
 
2028
- # <|_3_|>
2029
- if config.muP:
2030
- self.lm_head.__do_scale_tager_mu_dim_base_model__=True
2031
-
2032
- # <|_4_|>
2033
- self.lm_head_alpha = 1
2034
- if config.wesar_weights:
2035
- self.lm_head_alpha = nn.Parameter(torch.tensor(1).float())
2036
-
2037
  if getattr(config, "tie_word_embeddings", True):
2038
- logger.info('tie embeddings')
2039
  self.tie_weights()
2040
- else:
2041
- # <|_5_|>
2042
- self.lm_head.__do_scale_tager_mu_dim_base_model__ = False
2043
 
2044
  def get_input_embeddings(self):
2045
  return self.model.embed_tokens
@@ -2059,101 +1276,7 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
2059
  def get_decoder(self):
2060
  return self.model
2061
 
2062
- def multi_token_forward_backward(self,
2063
- hidden_states: torch.FloatTensor,
2064
- outputs: MotifModelOutputWithPast,
2065
- labels: torch.LongTensor,
2066
- position_ids: Optional[torch.LongTensor],
2067
- output_attentions: Optional[bool],
2068
- use_cache: Optional[bool],
2069
- cache_position: Optional[torch.LongTensor],
2070
- return_dict: Optional[bool],
2071
- num_logits_to_keep: int = 0) -> CausalLMOutputWithPast:
2072
- """
2073
- This implements the main forward-backward procedure for multi-token model training proposed in
2074
- the paper https://arxiv.org/abs/2404.19737.
2075
- Essentially,
2076
- - The multi-token model tries to predict n (instead of 1) tokens at a time.
2077
- - Applying this only during training and using first-token prediction during inference is still helpful.
2078
- - The change in architecture: when using n-token prediction, each token index (between 1 and n) has its own
2079
- (1) last attention layer and (2) lm head.
2080
- - The change in loss: sum of cross-entropy losses corresponding to each token index.
2081
- - Custom forward-backward procedure for memory efficiency: refer to the implementation of `multi_head_forward_backward`.
2082
- """
2083
- if not return_dict:
2084
- raise NotImplementedError("return_dict must be True for multi-token training")
2085
-
2086
- past_key_values = outputs.past_key_values
2087
- causal_mask = outputs.causal_mask
2088
- position_embeddings = outputs.position_embeddings
2089
-
2090
- if labels is not None:
2091
- labels = labels.to(hidden_states.device)
2092
-
2093
- def _tokenwise_forward(hidden_states: torch.Tensor, token_idx):
2094
- ## Model forward
2095
- layer = self.tokenwise_last_layers[token_idx]
2096
- lm_head = self.tokenwise_lm_heads[token_idx]
2097
-
2098
- layer_outputs = layer(
2099
- hidden_states,
2100
- attention_mask=causal_mask,
2101
- position_ids=position_ids,
2102
- past_key_values=past_key_values, # TODO: update past_key_values?
2103
- output_attentions=output_attentions,
2104
- use_cache=use_cache,
2105
- cache_position=cache_position,
2106
- position_embeddings=position_embeddings,
2107
- )
2108
- last_hidden_states = layer_outputs[0]
2109
- if num_logits_to_keep > 0:
2110
- assert labels is None
2111
- last_hidden_states = last_hidden_states[:, -num_logits_to_keep:, :]
2112
- tokenwise_logits = lm_head(last_hidden_states)
2113
-
2114
- if labels is None:
2115
- return {
2116
- "loss": None,
2117
- "logits": tokenwise_logits,
2118
- }
2119
-
2120
- ## Compute loss
2121
- shift_n = token_idx + 1
2122
- shift_logits = tokenwise_logits[..., :-shift_n, :].contiguous()
2123
- shift_labels = labels[..., shift_n:].contiguous()
2124
-
2125
- loss_fct = CrossEntropyLoss()
2126
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
2127
- shift_labels = shift_labels.view(-1)
2128
-
2129
- tokenwise_loss = loss_fct(shift_logits, shift_labels)
2130
-
2131
- return {
2132
- "loss": tokenwise_loss,
2133
- "logits": tokenwise_logits,
2134
- }
2135
-
2136
- head_fns = [
2137
- lambda hidden_states, token_idx=token_idx: _tokenwise_forward(hidden_states, token_idx)
2138
- for token_idx in range(self.multi_token_heads)
2139
- ]
2140
- loss, logits = multi_head_forward_backward(hidden_states,
2141
- head_fns,
2142
- return_keys=("loss", "logits"),
2143
- return_only_first_head=True)
2144
-
2145
- if not return_dict:
2146
- output = (logits, ) + outputs[1:]
2147
- return (loss, ) + output
2148
-
2149
- return CausalLMOutputWithPast(
2150
- loss=loss,
2151
- logits=logits,
2152
- past_key_values=outputs.past_key_values,
2153
- hidden_states=outputs.hidden_states,
2154
- attentions=outputs.attentions,
2155
- )
2156
-
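For reference, a minimal sketch (illustrative only, not the removed implementation) of the n-token objective described in the docstring above: head i predicts the token (i + 1) steps ahead and the per-head cross-entropy losses are summed. The removed code additionally runs a custom per-head forward-backward pass for memory efficiency, which this sketch omits.

import torch
import torch.nn.functional as F

def multi_token_loss_sketch(tokenwise_logits, labels):
    # tokenwise_logits: one (batch, seq, vocab) tensor per prediction head
    # labels: (batch, seq) token ids
    total = 0.0
    for token_idx, logits in enumerate(tokenwise_logits):
        shift_n = token_idx + 1                                   # head i looks (i + 1) tokens ahead
        shift_logits = logits[:, :-shift_n, :].reshape(-1, logits.size(-1))
        shift_labels = labels[:, shift_n:].reshape(-1)
        total = total + F.cross_entropy(shift_logits, shift_labels)
    return total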
2157
  @add_start_docstrings_to_model_forward(MOTIF_INPUTS_DOCSTRING)
2158
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
2159
  def forward(
@@ -2209,8 +1332,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
2209
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2210
 
2211
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
2212
- outputs_include_causal_mask = self.multi_token_heads is not None
2213
- outputs_include_position_embeddings = self.multi_token_heads is not None
2214
  outputs: MotifModelOutputWithPast = self.model(
2215
  input_ids=input_ids,
2216
  attention_mask=attention_mask,
@@ -2222,25 +1343,12 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
2222
  output_hidden_states=output_hidden_states,
2223
  return_dict=return_dict,
2224
  cache_position=cache_position,
2225
- outputs_include_causal_mask=outputs_include_causal_mask,
2226
- outputs_include_position_embeddings=outputs_include_position_embeddings,
2227
  )
2228
 
2229
  hidden_states = outputs[0]
2230
 
2231
- if self.multi_token_heads is not None:
2232
- return self.multi_token_forward_backward(hidden_states,
2233
- outputs,
2234
- labels,
2235
- position_ids,
2236
- output_attentions,
2237
- use_cache,
2238
- cache_position,
2239
- return_dict,
2240
- num_logits_to_keep=num_logits_to_keep)
2241
-
2242
  # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
2243
- hidden_states = hidden_states * self.lm_head_alpha
2244
  logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
2245
  logits = logits.float()
2246
 
@@ -2254,7 +1362,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
2254
  loss_fct = CrossEntropyLoss()
2255
  shift_logits = shift_logits.view(-1, self.config.vocab_size)
2256
  shift_labels = shift_labels.view(-1)
2257
- # Enable model parallelism
2258
  shift_labels = shift_labels.to(shift_logits.device)
2259
  loss = loss_fct(shift_logits, shift_labels)
2260
 
@@ -2268,4 +1375,4 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
2268
  past_key_values=outputs.past_key_values,
2269
  hidden_states=outputs.hidden_states,
2270
  attentions=outputs.attentions,
2271
- )
 
1
  import math
2
+ from dataclasses import dataclass
3
  from typing import List, Optional, Tuple, Union
4
 
5
  import torch
6
+ import torch.nn.functional as F
7
  import torch.utils.checkpoint
8
  from torch import nn
9
  from torch.nn import CrossEntropyLoss
10
+ from transformers.activations import ACT2CLS as _ACT2CLS
11
+ from transformers.activations import ClassInstantier
12
  from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
13
  from transformers.generation import GenerationMixin
14
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
15
  from transformers.modeling_flash_attention_utils import _flash_attention_forward
16
+ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
 
 
 
17
  from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
18
  from transformers.modeling_utils import PreTrainedModel
19
  from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
20
+ from transformers.utils import (add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available,
21
+ is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings)
22
 
23
+ from .configuration_motif import MotifConfig
24
 
25
 
26
  class PolyNorm(torch.nn.Module):
27
+ """
28
  A trainable activation function introduced in https://arxiv.org/html/2411.03884v1.
29
+ The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md
 
30
  """
31
 
32
  def __init__(self, eps=1e-6):
 
43
  x ** 2) + self.weight[2] * self._norm(x) + self.bias
44
 
45
 
46
+ CUSTOM_ACT2CLS = {"poly_norm": PolyNorm}
47
  ACT2CLS = {**_ACT2CLS, **CUSTOM_ACT2CLS}
48
  ACT2FN = ClassInstantier(ACT2CLS)
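Brief usage sketch of the registry extension above (names as registered in this file): once "poly_norm" is merged into ACT2FN, it can be selected by name from the config like any stock activation, and ClassInstantier instantiates it on lookup.

act = ACT2FN["poly_norm"]          # returns a PolyNorm instance
y = act(torch.randn(2, 8, 16))     # shape-preserving activation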
49
 
50
+ logger = logging.get_logger(__name__)
51
+
52
+ if is_flash_attn_2_available():
53
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
54
+
55
+ _CONFIG_FOR_DOC = "MotifConfig"
56
 
57
 
58
  class MotifRMSNorm(nn.Module):
 
76
  return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
77
 
78
 
79
+ ALL_LAYERNORM_LAYERS.append(MotifRMSNorm)
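The MotifRMSNorm body is elided in this hunk. For reference, a generic RMSNorm forward that the class is assumed to follow (root-mean-square normalization in float32, then a learned elementwise scale):

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    hidden = x.to(torch.float32)
    variance = hidden.pow(2).mean(-1, keepdim=True)   # mean of squares over the last dim
    hidden = hidden * torch.rsqrt(variance + eps)
    return (weight * hidden).to(x.dtype)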
80
 
81
 
82
  class MotifRotaryEmbeddingWithCache(nn.Module):
 
108
  inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
109
  self.register_buffer("inv_freq", inv_freq, persistent=False)
110
 
 
111
  self._set_cos_sin_cache(seq_len=max_position_embeddings,
112
  device=self.inv_freq.device,
113
  dtype=torch.get_default_dtype())
 
128
  self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
129
 
130
  return (
131
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
132
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
133
  )
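The body of `_set_cos_sin_cache` is not shown in this hunk; the cached tables are assumed to follow the usual RoPE construction, an outer product of positions with `inv_freq`, duplicated to the full rotary dimension. A sketch:

def build_rope_cache_sketch(dim, seq_len, base=10000.0, dtype=torch.float32):
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    t = torch.arange(seq_len, dtype=inv_freq.dtype)
    freqs = torch.outer(t, inv_freq)                 # (seq_len, dim // 2)
    emb = torch.cat((freqs, freqs), dim=-1)          # (seq_len, dim)
    return emb.cos().to(dtype), emb.sin().to(dtype)  # cos_cached, sin_cached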
134
 
135
 
 
136
  class MotifRotaryEmbedding(nn.Module):
137
 
138
  def __init__(
 
163
  self.max_seq_len_cached = max_position_embeddings
164
  self.original_max_seq_len = max_position_embeddings
165
  else:
 
166
  if config.rope_scaling is not None:
167
  self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
168
  else:
 
224
  def rotate_half(x):
225
  """
226
  Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
227
+
228
  Args:
229
  x (torch.Tensor): The input tensor.
230
+
231
  Returns:
232
  torch.Tensor: A tensor where the latter half of the dimensions are negated
233
  and moved before the first half.
 
239
  return rotated_tensor
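The function body is elided here apart from its return statement; a sketch consistent with the docstring above (negate the latter half of the last dimension, then roll it in front of the first half; done out of place here for clarity):

def rotate_half_sketch(x: torch.Tensor) -> torch.Tensor:
    half = x.shape[-1] // 2
    x = torch.cat((x[..., :half], -x[..., half:]), dim=-1)   # negate the latter half
    return torch.roll(x, shifts=half, dims=-1)               # move it before the first half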
240
 
241
 
242
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
 
243
  """
244
  Applies rotary position embeddings to the input tensors.
245
 
 
248
  k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
249
  cos (torch.Tensor): Cosine values for rotary embedding.
250
  sin (torch.Tensor): Sine values for rotary embedding.
251
+ unsqueeze_dim (int, optional): Dimension along which `cos` and `sin` are unsqueezed.
252
  Defaults to 1.
 
 
 
253
 
254
  Returns:
255
  Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
256
  """
257
  '''
258
+ # (B, NH, S, D_KV) -> (B, S, NH, D_KV)
259
  cos = cos.unsqueeze(unsqueeze_dim)
260
  sin = sin.unsqueeze(unsqueeze_dim)
261
  q_embed = (q * cos) + (rotate_half(q) * sin)
262
  k_embed = (k * cos) + (rotate_half(k) * sin)
263
  '''
264
+ device = q.device
265
+ return map(
266
+ lambda x: (x * cos[position_ids].unsqueeze(unsqueeze_dim).to(device)) +
267
+ (rotate_half(x) * sin[position_ids].unsqueeze(unsqueeze_dim).to(device)), (q, k))
 
270
  class MotifMLP(nn.Module):
271
 
272
  def __init__(self, config):
273
  super().__init__()
274
  self.hidden_size = config.hidden_size
275
  self.intermediate_size = config.intermediate_size
276
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
277
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
278
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
279
  self.act_fn = ACT2FN[config.hidden_act]
280
281
  def forward(self, hidden_state):
282
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
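The block above is the usual gated (SwiGLU-style) MLP: `gate_proj` and `up_proj` expand to `intermediate_size`, the activated gate multiplies the up projection elementwise, and `down_proj` maps back to `hidden_size`. A toy usage sketch with a hypothetical minimal config:

class _TinyCfg:                                   # hypothetical config, for illustration only
    hidden_size, intermediate_size = 64, 256
    use_bias, hidden_act = False, "silu"

mlp = MotifMLP(_TinyCfg())
out = mlp(torch.randn(2, 10, 64))                 # -> (2, 10, 64)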
283
 
284
 
285
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
286
+
287
+
288
  """
289
  This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
290
  num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
291
+
292
  batch, num_key_value_heads, slen, head_dim = hidden_states.shape
293
  if n_rep == 1:
294
  return hidden_states
 
297
  """
298
 
299
  return torch.repeat_interleave(hidden_states, dim=1, repeats=n_rep)
300
+
301
 
 
 
302
  class MotifAttention(nn.Module):
303
  """
304
  Differential Attention (DiffAttention) module.
305
 
306
+ Implements the Differential Attention from
307
  "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
308
 
309
  Overview
310
  Standard transformers often over-allocate attention to irrelevant context.
311
+ DiffAttention addresses this by computing attention as the difference between
312
+ two separate softmax attention maps, effectively canceling noise and promoting
313
  sparse, structured attention patterns.
314
 
315
  Reference Implementation
316
  https://github.com/microsoft/unilm/tree/master/Diff-Transformer
317
 
318
  Args
319
+ The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
320
  λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
321
  - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
322
  - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
323
  - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
324
+
325
  """
326
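A numerical sketch of the λ reparameterization described above (toy values): λ is a per-layer scalar built from four learnable `head_dim`-sized vectors plus the depth-dependent `lambda_init`, and the final map is the difference of the two softmax maps weighted by it.

head_dim, layer_idx = 64, 3                       # assumed toy values
vecs = {name: torch.zeros(head_dim).normal_(0.0, 0.1)
        for name in ("lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2")}
lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
lambda_full = (torch.exp((vecs["lambda_q1"] * vecs["lambda_k1"]).sum())
               - torch.exp((vecs["lambda_q2"] * vecs["lambda_k2"]).sum())
               + lambda_init)
# attention is then softmax_map_1 - lambda_full * softmax_map_2 (see the forward pass below)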
 
327
  def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
 
344
  self.rope_theta = config.rope_theta
345
  self.is_causal = True
346
  self.attention_dropout = config.attention_dropout
347
+
348
  if (self.head_dim * self.num_heads) != self.hidden_size:
349
  raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
350
  f" and `num_heads`: {self.num_heads}).")
 
353
  self.num_key_value_heads //= 2
354
  self.n_rep = self.num_heads // self.num_key_value_heads
355
 
356
  self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
357
  self.k_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
358
  self.v_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
359
  self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
360
 
 
361
  for name in ["lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"]:
362
  setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
363
  getattr(self, name).data.normal_(mean=0.0, std=0.1)
364
 
 
365
  self.subln = MotifRMSNorm(2 * self.head_dim, eps=1e-5)
366
  self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
367
 
368
+ self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
369
  max_position_embeddings=self.max_position_embeddings,
370
  base=self.rope_theta)
371
372
  def forward(
373
  self,
374
  hidden_states: torch.Tensor,
 
378
  output_attentions: bool = False,
379
  use_cache: bool = False,
380
  cache_position: Optional[torch.LongTensor] = None,
381
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
382
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
383
  bsz, q_len, _ = hidden_states.size()
384
 
385
+ query_states = self.q_proj(hidden_states)
386
+ key_states = self.k_proj(hidden_states)
387
+ value_states = self.v_proj(hidden_states)
 
 
388
 
389
  query_states = query_states.view(bsz, q_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
390
  key_states = key_states.view(bsz, q_len, 2 * self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
406
  key_states,
407
  cos,
408
  sin,
409
+ position_ids=position_ids)
410
 
411
  if past_key_value is not None:
412
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
413
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
414
 
 
415
  key_states = repeat_kv(key_states, self.num_key_value_groups)
416
  value_states = repeat_kv(value_states, self.num_key_value_groups)
417
 
 
418
  attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
419
 
420
  kv_seq_len = key_states.shape[-2]
 
423
  attention_mask = torch.triu(
424
  torch.full((q_len, kv_seq_len), float("-inf"), dtype=attn_weights.dtype, device=attn_weights.device),
425
  1 + offset)
426
+
427
  attn_weights = attn_weights + attention_mask
428
 
 
429
  attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
430
  attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
431
 
 
432
  lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(attn_weights)
433
  lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(attn_weights)
434
  lambda_full = lambda_1 - lambda_2 + self.lambda_init
435
  attn_weights = attn_weights.view(bsz, self.num_heads, 2, q_len, -1)
436
  attn_weights = attn_weights[:, :, 0] - lambda_full * attn_weights[:, :, 1]
437
 
 
438
  attn_output = torch.matmul(attn_weights, value_states)
439
 
 
440
  attn_output = self.subln(attn_output)
441
  attn_output = attn_output * (1 - self.lambda_init)
442
 
443
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim * 2):
444
  raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim * 2)}, but is"
445
  f" {attn_output.size()}")
446
+
 
 
447
  attn_output = attn_output.transpose(1, 2).contiguous()
448
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
449
 
450
+ attn_output = self.o_proj(attn_output)
451
 
452
  if not output_attentions:
453
  attn_weights = None
 
455
  return attn_output, attn_weights, past_key_value
456
 
457
 
 
458
  class MotifFlashAttention2(MotifAttention):
459
  """
460
  Motif flash attention module, following Motif attention module. This module inherits from `MotifAttention`
 
464
  config.max_window_layers layers.
465
  """
466
 
 
467
  def __init__(self, *args, **kwargs):
468
  super().__init__(*args, **kwargs)
 
 
 
469
  # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
470
  # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
471
  # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
472
 
473
  self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
474
 
475
+ logger.info(f'Flash Attention 2 enabled (bottom-right aligned causal mask: {not self._flash_attn_uses_top_left_mask})')
476
+
477
  def _reshape_heads(self, tensor, batch_size, seq_len):
478
  """2-way head split tensor reshape"""
479
  return tensor.reshape(batch_size, seq_len, self.num_heads, 2, self.head_dim)
 
483
  return tensor.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
484
 
485
  def _compute_attention(self, query_states, key_states, value_states, attention_mask, q_len, position_ids,
486
+ dropout_rate, sliding_window):
487
  """Flash Attention 2 implementation."""
488
+ _input_type = query_states.dtype
489
+ scale_factor = 1.0 / math.sqrt(self.head_dim)
490
+ if not self._flash_attn_uses_top_left_mask:
491
+ causal = self.is_causal
492
  else:
493
+ causal = self.is_causal and q_len != 1
494
+
495
+ attn_out = _flash_attention_forward(query_states.bfloat16(),
496
+ key_states.bfloat16(),
497
+ value_states.bfloat16(),
498
  attention_mask,
499
  q_len,
500
  position_ids=position_ids,
501
  dropout=dropout_rate,
502
  sliding_window=sliding_window,
503
+ is_causal=True,
504
+ softmax_scale=scale_factor,
505
  use_top_left_mask=self._flash_attn_uses_top_left_mask)
506
+ return attn_out.to(_input_type)
507
 
508
  def forward(
509
  self,
 
518
  ):
519
  bsz, q_len, _ = hidden_states.size()
520
 
521
+ query_states = self.q_proj(hidden_states)
522
+ key_states = self.k_proj(hidden_states)
523
+ value_states = self.v_proj(hidden_states)
524
 
525
  query_states = query_states.view(bsz, q_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
526
  key_states = key_states.view(bsz, q_len, 2 * self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
541
  key_states,
542
  cos,
543
  sin,
544
+ position_ids=position_ids)
545
 
546
  if past_key_value is not None:
547
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
548
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
549
 
 
550
  key_states = repeat_kv(key_states, self.num_key_value_groups)
551
  value_states = repeat_kv(value_states, self.num_key_value_groups)
552
  dropout_rate = 0.0 if not self.training else self.attention_dropout
 
555
  # therefore the input hidden states gets silently casted in float32. Hence, we need
556
  # cast them back in float16 just to be sure everything works as expected.
557
  input_dtype = query_states.dtype
558
+ if input_dtype == torch.float32:
559
  if torch.is_autocast_enabled():
560
  target_dtype = torch.get_autocast_gpu_dtype()
561
  # Handle the case where the model is quantized
 
582
  value_states = value_states.transpose(1, 2)
583
 
584
  if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None
585
+ and self.layer_idx >= self.config.max_window_layers):
586
  sliding_window = self.config.sliding_window
587
  else:
588
  sliding_window = None
 
602
  k1, k2 = k1.contiguous(), k2.contiguous()
603
  v1, v2 = v1.contiguous(), v2.contiguous()
604
 
605
+ attn11, attn12 = self._compute_attention(q1, k1, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window), \
606
+ self._compute_attention(q1, k1, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window)
607
+ attn21, attn22 = self._compute_attention(q2, k2, v1, attention_mask, q_len, position_ids, dropout_rate, sliding_window), \
608
+ self._compute_attention(q2, k2, v2, attention_mask, q_len, position_ids, dropout_rate, sliding_window)
 
 
609
 
610
  attn1, attn2 = torch.cat([attn11, attn12], dim=-1), torch.cat([attn21, attn22], dim=-1)
611
 
 
623
  attn_output = attn_output * (1 - self.lambda_init)
624
 
625
  if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim * 2):
626
+ raise ValueError(f"`attn_output` should be of size {(bsz, q_len, self.num_heads, 2*self.head_dim)}, but is"
627
  f" {attn_output.size()}")
628
 
629
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
630
+ attn_output = self.o_proj(attn_output)
631
 
632
  return attn_output, None, past_key_value
633
 
634
 
 
635
  class MotifSdpaAttention(MotifAttention):
636
  """
637
  Motif attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
 
639
  SDPA API.
640
  """
641
 
 
642
  def forward(
643
  self,
644
  hidden_states: torch.Tensor,
 
651
  position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
652
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
653
  if output_attentions:
 
654
  logger.warning_once(
655
  "MotifModel is using MotifSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
656
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
 
686
  query_states, key_states = apply_rotary_pos_emb(query_states,
687
  key_states,
688
  cos,
689
+ sin)
 
690
 
691
  if past_key_value is not None:
692
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
 
722
  MOTIF_ATTENTION_CLASSES = {
723
  "eager": MotifAttention,
724
  "flash_attention_2": MotifFlashAttention2,
725
+ "sdpa": MotifAttention,
726
  }
727
 
728
 
 
729
  class MotifDecoderLayer(nn.Module):
730
 
731
+ def __init__(self, config: MotifConfig, layer_idx: int):
732
  super().__init__()
733
  self.hidden_size = config.hidden_size
 
 
734
  if config.sliding_window and config._attn_implementation != "flash_attention_2":
735
  logger.warning_once(
736
  f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
737
  "unexpected results may be encountered.")
738
+ self.self_attn = MOTIF_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
 
 
 
739
  self.mlp = MotifMLP(config)
740
+
741
+ self.input_layernorm = MotifRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
742
+ self.post_attention_layernorm = MotifRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
744
 
745
  def forward(
746
  self,
 
778
 
779
  residual = hidden_states
780
 
781
+ hidden_states = self.input_layernorm(hidden_states)
782
 
783
  # Self Attention
784
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
 
795
 
796
  # Fully Connected
797
  residual = hidden_states
798
+ hidden_states = self.post_attention_layernorm(hidden_states)
799
+ hidden_states = self.mlp(hidden_states)
800
  hidden_states = residual + hidden_states
801
 
802
  outputs = (hidden_states, )
 
846
  def _init_weights(self, module):
847
  module_std = self.config.initializer_range
848
  if isinstance(module, nn.Linear):
849
+ module.weight.data.normal_(mean=0.0, std=module_std)
850
+ module.weight.data = torch.where(abs(module.weight.data) > module_std*3, 0, module.weight.data)
851
  if module.bias is not None:
852
  module.bias.data.zero_()
853
 
854
  elif isinstance(module, nn.Embedding):
855
+ module.weight.data.normal_(mean=0.0, std=module_std)
856
+ module.weight.data = torch.where(abs(module.weight.data) > module_std*3, 0, module.weight.data)
857
  if module.padding_idx is not None:
858
  module.weight.data[module.padding_idx].zero_()
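A sketch of what the initializer above does with a toy tensor: weights are drawn from N(0, std) and any entry beyond 3·std is set to zero (zeroed rather than re-sampled, unlike `nn.init.trunc_normal_`).

std = 0.02                                        # assumed initializer_range
w = torch.empty(256, 256).normal_(mean=0.0, std=std)
w = torch.where(w.abs() > 3 * std, torch.zeros_like(w), w)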
859
 
860
 
861
  @dataclass
862
  class MotifModelOutputWithPast(ModelOutput):
863
+ """
864
+ This augments `BaseModelOutputWithPast` in `transformers.modeling_outputs` with new optional keys: `causal_mask`, `position_embeddings`.
865
  The optional keys are currently used in the following ways:
866
+ - pass information to the token-wise last attention layers in multi-token training
867
  """
868
  last_hidden_state: torch.FloatTensor = None
869
  past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
 
948
  """
949
 
950
 
 
951
  @add_start_docstrings(
952
  "The bare Motif Model outputting raw hidden-states without any specific head on top.",
953
  MOTIF_START_DOCSTRING,
 
964
  super().__init__(config)
965
  self.padding_idx = config.pad_token_id
966
  self.vocab_size = config.vocab_size
 
967
 
968
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
969
+ num_hidden_layers = config.num_hidden_layers
970
+ self.layers = nn.ModuleList([MotifDecoderLayer(config = config, layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)])
971
+ self.norm = MotifRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
972
  self.hidden_size = config.hidden_size
973
  self.num_heads = config.num_attention_heads
974
  self.head_dim = self.hidden_size // self.num_heads
 
981
  self.gradient_checkpointing = False
982
  self.post_init()
983
984
  def get_input_embeddings(self):
985
  return self.embed_tokens
986
 
 
1019
  "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
1020
  use_cache = False
1021
 
 
1022
  return_legacy_cache = False
1023
  if use_cache and not isinstance(past_key_values, Cache):
1024
  return_legacy_cache = True
 
1032
  "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)")
1033
 
1034
  if inputs_embeds is None:
1035
+ inputs_embeds = self.embed_tokens(input_ids)
1036
 
1037
  if cache_position is None:
1038
  past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1039
  cache_position = torch.arange(past_seen_tokens,
1040
  past_seen_tokens + inputs_embeds.shape[1],
1041
  device=inputs_embeds.device)
 
1042
  if position_ids is None:
1043
  position_ids = cache_position.unsqueeze(0)
1044
+
1045
  causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_key_values,
1046
  output_attentions)
1047
 
1048
  hidden_states = inputs_embeds
1049
  bsz, q_len, _ = hidden_states.size()
 
1050
  position_embeddings = self.rotary_emb(hidden_states, seq_len=q_len)
1051
 
 
1052
  all_hidden_states = () if output_hidden_states else None
1053
  all_self_attns = () if output_attentions else None
1054
  next_decoder_cache = None
 
1083
 
1084
  hidden_states = layer_outputs[0]
1085
1086
  if use_cache:
1087
  next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1088
 
1089
  if output_attentions:
1090
  all_self_attns += (layer_outputs[1], )
1091
 
1092
+ hidden_states = self.norm(hidden_states)
1093
+
 
 
1094
  if output_hidden_states:
1095
  all_hidden_states += (hidden_states, )
1096
 
 
1121
  output_attentions: bool,
1122
  ):
1123
  if self.config._attn_implementation == "flash_attention_2":
 
 
1124
  if attention_mask is not None and 0.0 in attention_mask:
1125
  return attention_mask
1126
  return None
 
1147
  dtype, device = input_tensor.dtype, input_tensor.device
1148
  min_dtype = torch.finfo(dtype).min
1149
  sequence_length = input_tensor.shape[1]
1150
+
1151
  # SlidingWindowCache or StaticCache
1152
  if using_sliding_window_cache or using_static_cache:
1153
  target_length = past_key_values.get_max_cache_shape()
 
1242
  return causal_mask
1243
 
1244
 
 
1245
  class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
1246
  _tied_weights_keys = ["lm_head.weight"]
1247
 
 
1249
  super().__init__(config)
1250
  self.model = MotifModel(config)
1251
  self.vocab_size = config.vocab_size
 
1252
 
1253
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1254
 
1255
  # Initialize weights and apply final processing
1256
  self.post_init()
1257
 
1258
  if getattr(config, "tie_word_embeddings", True):
 
1259
  self.tie_weights()
 
 
 
1260
 
1261
  def get_input_embeddings(self):
1262
  return self.model.embed_tokens
 
1276
  def get_decoder(self):
1277
  return self.model
1278
 
1279
+
1280
  @add_start_docstrings_to_model_forward(MOTIF_INPUTS_DOCSTRING)
1281
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1282
  def forward(
 
1332
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1333
 
1334
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
 
 
1335
  outputs: MotifModelOutputWithPast = self.model(
1336
  input_ids=input_ids,
1337
  attention_mask=attention_mask,
 
1343
  output_hidden_states=output_hidden_states,
1344
  return_dict=return_dict,
1345
  cache_position=cache_position,
 
 
1346
  )
1347
 
1348
  hidden_states = outputs[0]
1349
 
1350
  # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1351
+ hidden_states = hidden_states
1352
  logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
1353
  logits = logits.float()
1354
 
 
1362
  loss_fct = CrossEntropyLoss()
1363
  shift_logits = shift_logits.view(-1, self.config.vocab_size)
1364
  shift_labels = shift_labels.view(-1)
 
1365
  shift_labels = shift_labels.to(shift_logits.device)
1366
  loss = loss_fct(shift_logits, shift_labels)
1367
 
 
1375
  past_key_values=outputs.past_key_values,
1376
  hidden_states=outputs.hidden_states,
1377
  attentions=outputs.attentions,
1378
+ )