Update modeling_motif.py

modeling_motif.py  CHANGED  (+5 −17)
@@ -284,14 +284,12 @@ class MotifMLP(nn.Module):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
         self.act_fn = ACT2FN[config.hidden_act]

     def forward(self, hidden_state):
-        hidden_state = hidden_state
-        #hidden_state = self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))*
         return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
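For context on this first hunk: the MLP keeps the usual gated (SwiGLU-style) formulation and now routes the projection bias through config.use_bias instead of a hard-coded value. A minimal, self-contained sketch of that pattern (the config fields below are toy stand-ins for whatever MotifConfig actually defines):

from types import SimpleNamespace
import torch
import torch.nn as nn
from transformers.activations import ACT2FN

# Toy stand-in for the model config; the real MotifConfig supplies these fields.
cfg = SimpleNamespace(hidden_size=16, intermediate_size=64, hidden_act="silu", use_bias=False)

gate_proj = nn.Linear(cfg.hidden_size, cfg.intermediate_size, bias=cfg.use_bias)
up_proj = nn.Linear(cfg.hidden_size, cfg.intermediate_size, bias=cfg.use_bias)
down_proj = nn.Linear(cfg.intermediate_size, cfg.hidden_size, bias=cfg.use_bias)
act_fn = ACT2FN[cfg.hidden_act]

x = torch.randn(2, 5, cfg.hidden_size)                # (batch, seq, hidden)
y = down_proj(act_fn(gate_proj(x)) * up_proj(x))      # same expression as the kept forward()
assert y.shape == x.shape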
@@ -394,7 +392,7 @@ class MotifAttention(nn.Module):
         output_attentions: bool = False,
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
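The position_embeddings parameter follows the common Transformers convention of handing each attention layer a precomputed (cos, sin) pair from the model's rotary embedding. A rough sketch of how such a tuple is usually produced; this is an assumed standard RoPE recipe, not code taken from modeling_motif.py:

import torch

def rotary_cos_sin(position_ids, head_dim, base=10000.0, dtype=torch.float32):
    # Standard RoPE tables: one (cos, sin) pair per position, later applied to q and k.
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
    freqs = position_ids[..., None].float() * inv_freq      # (batch, seq, head_dim // 2)
    emb = torch.cat((freqs, freqs), dim=-1)                  # (batch, seq, head_dim)
    return emb.cos().to(dtype), emb.sin().to(dtype)

position_ids = torch.arange(8).unsqueeze(0)                  # (1, seq)
cos, sin = rotary_cos_sin(position_ids, head_dim=64)
position_embeddings = (cos, sin)                             # the Tuple[torch.Tensor, torch.Tensor] above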
@@ -493,8 +491,6 @@ class MotifFlashAttention2(MotifAttention):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-
-
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
         # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
@@ -516,7 +512,6 @@ class MotifFlashAttention2(MotifAttention):
         """Flash Attention 2 implements"""

         scale_factor = 1.0 / math.sqrt(self.head_dim)
-        # Copied from _flash_attention_forward
         if not self._flash_attn_uses_top_left_mask:
             causal = self.is_causal
         else:
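For reference, the scale factor and the causal flag computed in this hunk are the two knobs that matter once the call reaches FlashAttention. A hedged sketch using the public flash_attn_func API (assumes the flash-attn package >= 2.1 and a supported GPU; not the exact wrapper this file goes through):

import math
import torch
from flash_attn import flash_attn_func

bsz, seqlen, num_heads, head_dim = 1, 128, 8, 64
q = torch.randn(bsz, seqlen, num_heads, head_dim, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

scale_factor = 1.0 / math.sqrt(head_dim)   # same scaling as in the hunk above
causal = True                              # plays the role of self.is_causal
out = flash_attn_func(q, k, v, softmax_scale=scale_factor, causal=causal)
print(out.shape)                           # (bsz, seqlen, num_heads, head_dim)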
@@ -881,7 +876,6 @@ class MotifPreTrainedModel(PreTrainedModel):
         if isinstance(module, nn.Linear):
             module.weight.data.normal_(mean=0.0, std=module_std)
             module.weight.data = torch.where(abs(module.weight.data) > module_std*3, 0, module.weight.data)
-            #torch.nn.init.trunc_normal_(module.weight.data, mean=0.0, std=module_std, a=-3*module_std, b=3*module_std)
             if module.bias is not None:
                 module.bias.data.zero_()
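Note that the initializer kept here is not equivalent to the commented-out trunc_normal_ it drops: the kept code zeroes any weight whose magnitude exceeds 3*std, whereas a truncated normal draws those tail values inside the interval instead of zeroing them. A small sketch contrasting the two, with module_std as a placeholder value:

import torch
import torch.nn as nn

module_std = 0.02                                  # placeholder; the real value comes from the config
linear = nn.Linear(512, 512)

# Kept approach: sample N(0, std), then zero out anything beyond 3 standard deviations.
linear.weight.data.normal_(mean=0.0, std=module_std)
linear.weight.data = torch.where(linear.weight.data.abs() > 3 * module_std,
                                 torch.zeros_like(linear.weight.data),
                                 linear.weight.data)

# Removed alternative: truncated normal, which keeps tail draws inside [-3*std, 3*std].
reference = torch.empty(512, 512)
torch.nn.init.trunc_normal_(reference, mean=0.0, std=module_std,
                            a=-3 * module_std, b=3 * module_std)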
@@ -1001,12 +995,8 @@ class MotifModel(MotifPreTrainedModel):
         self.vocab_size = config.vocab_size

         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        # NOTE: For multi-token models, the last decoder layers (one for each token index)
-        # are implemented as a part of `MotifModelForCausalLM` to enable a custom forward-backward procedure.
-
         num_hidden_layers = config.num_hidden_layers
         self.layers = nn.ModuleList([MotifDecoderLayer(config = config, layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)])
-        self._attn_implementation = config._attn_implementation
         self.norm = MotifRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -1079,7 +1069,6 @@ class MotifModel(MotifPreTrainedModel):
             cache_position = torch.arange(past_seen_tokens,
                                           past_seen_tokens + inputs_embeds.shape[1],
                                           device=inputs_embeds.device)
-        #position_ids = None
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
@@ -1132,7 +1121,6 @@ class MotifModel(MotifPreTrainedModel):
             if output_attentions:
                 all_self_attns += (layer_outputs[1], )

-        # <|_2_|>
         hidden_states = self.norm(hidden_states)

         # add hidden states from the last decoder layer
@@ -1192,6 +1180,7 @@ class MotifModel(MotifPreTrainedModel):
         dtype, device = input_tensor.dtype, input_tensor.device
         min_dtype = torch.finfo(dtype).min
         sequence_length = input_tensor.shape[1]
+
         # SlidingWindowCache or StaticCache
         if using_sliding_window_cache or using_static_cache:
             target_length = past_key_values.get_max_cache_shape()
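This region is choosing target_length, the key-axis size of the causal mask: with a SlidingWindowCache or StaticCache it comes from the cache's preallocated size, otherwise from the running sequence length. A generic sketch of the bottom-right-aligned mask such code typically builds (illustrative only, not copied from this file):

import torch

dtype = torch.float16
min_dtype = torch.finfo(dtype).min
sequence_length, target_length = 4, 8    # new query tokens vs. total key slots (e.g. static cache size)

# Bottom-right alignment: the i-th new query may attend to all cached keys plus itself.
mask = torch.full((sequence_length, target_length), min_dtype, dtype=dtype)
mask = torch.triu(mask, diagonal=target_length - sequence_length + 1)
print(mask)   # 0 where attention is allowed, min_dtype where it is masked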
@@ -1407,7 +1396,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
             loss_fct = CrossEntropyLoss()
             shift_logits = shift_logits.view(-1, self.config.vocab_size)
             shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
             shift_labels = shift_labels.to(shift_logits.device)
             loss = loss_fct(shift_logits, shift_labels)
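The loss hunk only drops a comment; the computation is the usual causal-LM cross-entropy over shifted logits and labels. A self-contained sketch of that pattern (the shifting itself happens just above the lines shown; vocab_size here is a toy value):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 32                                   # toy value; the model uses self.config.vocab_size
logits = torch.randn(2, 6, vocab_size)            # (batch, seq, vocab)
labels = torch.randint(0, vocab_size, (2, 6))     # (batch, seq)

# Predict token t+1 from position t.
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, vocab_size)
shift_labels = shift_labels.view(-1).to(shift_logits.device)   # keep labels on the logits' device
loss = loss_fct(shift_logits, shift_labels)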