Update modeling_motif.py
remove apply_rotary_pos_emb fused rope

modeling_motif.py  CHANGED  (+6 -33)
@@ -263,7 +263,7 @@ def rotate_half(x):
     return rotated_tensor
 
 
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=False):
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     """
     Applies rotary position embeddings to the input tensors.
 
@@ -274,9 +274,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=False):
         sin (torch.Tensor): Sine values for rotary embedding.
         unsqueeze_dim (int, optional): Dimension along which `cos` and `sin` are unsqueezed.
             Defaults to 1.
-        fused_rope (bool, optional): If True, applies fused rotary embeddings using
-            `moreh_ops.apply_rotary_emb`. If False, computes rotary embeddings manually.
-            Defaults to False.
 
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
@@ -288,31 +285,10 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=False):
         q_embed = (q * cos) + (rotate_half(q) * sin)
         k_embed = (k * cos) + (rotate_half(k) * sin)
     '''
-    if not fused_rope:
-        device = q.device
-        return map(
+    device = q.device
+    return map(
         lambda x: (x * cos[position_ids].unsqueeze(unsqueeze_dim).to(device)) +
         (rotate_half(x) * sin[position_ids].unsqueeze(unsqueeze_dim).to(device)), (q, k))
-    else:
-        # (B, NH, S, D_KV) -> (B, S, NH, D_KV)
-        cos = cos[position_ids]
-        sin = sin[position_ids]
-
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-
-        # Expand 'batch' dim
-        cos = cos.expand(q.shape[0], *cos.shape[1:])
-        sin = sin.expand(q.shape[0], *sin.shape[1:])
-
-        q_embed = moreh_ops.apply_rotary_emb(q, cos, sin, opcode=1)
-        k_embed = moreh_ops.apply_rotary_emb(k, cos, sin, opcode=1)
-
-        # (B, S, NH, D_KV) -> (B, NH, S, D_KV)
-        q_embed = q_embed.transpose(1, 2)
-        k_embed = k_embed.transpose(1, 2)
-
-        return q_embed, k_embed
 
 
 class MotifMLP(nn.Module):
@@ -461,8 +437,7 @@ class MotifAttention(nn.Module):
                                                         key_states,
                                                         cos,
                                                         sin,
-                                                        position_ids=position_ids,
-                                                        fused_rope=self.config.fused_rope)
+                                                        position_ids=position_ids)
 
         if past_key_value is not None:
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
@@ -609,8 +584,7 @@ class MotifFlashAttention2(MotifAttention):
                                                         key_states,
                                                         cos,
                                                         sin,
-                                                        position_ids=position_ids,
-                                                        fused_rope=False)
+                                                        position_ids=position_ids)
 
         if past_key_value is not None:
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
@@ -758,8 +732,7 @@ class MotifSdpaAttention(MotifAttention):
         query_states, key_states = apply_rotary_pos_emb(query_states,
                                                         key_states,
                                                         cos,
-                                                        sin,
-                                                        fused_rope=self.config.fused_rope)
+                                                        sin)
 
         if past_key_value is not None:
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
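For context, here is a minimal runnable sketch of the code path this commit keeps: the manual (non-fused) RoPE application. Only the post-commit body of apply_rotary_pos_emb comes from the diff; the rotate_half body, the (seq_len, head_dim) cos/sin cache layout, and all shapes below are assumptions in the common Llama-style convention this file appears to follow.

import torch

def rotate_half(x):
    # Assumed Llama-style body; the diff only shows this function's name.
    # Split the head dim in half and swap the halves with a sign flip.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    # Post-commit body from the diff: index the cached cos/sin tables by
    # position_ids, insert a broadcast dim for the heads, and rotate q and k.
    device = q.device
    return map(
        lambda x: (x * cos[position_ids].unsqueeze(unsqueeze_dim).to(device)) +
        (rotate_half(x) * sin[position_ids].unsqueeze(unsqueeze_dim).to(device)), (q, k))

# Hypothetical shapes: batch=2, heads=4, seq=8, head_dim=16.
B, NH, S, D = 2, 4, 8, 16
q = torch.randn(B, NH, S, D)
k = torch.randn(B, NH, S, D)

# Assumed cache construction with standard RoPE frequencies, giving
# cos/sin tables of shape (seq_len, head_dim).
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, D, 2).float() / D))
freqs = torch.outer(torch.arange(S).float(), inv_freq)  # (S, D/2)
emb = torch.cat((freqs, freqs), dim=-1)                 # (S, D)
cos, sin = emb.cos(), emb.sin()
position_ids = torch.arange(S).unsqueeze(0).expand(B, S)

# map() returns a lazy iterator; the attention call sites immediately
# unpack it into the two rotated tensors.
q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin, position_ids=position_ids)
print(q_embed.shape, k_embed.shape)  # torch.Size([2, 4, 8, 16]) twice

Note that cos[position_ids] turns the (S, D) table into a (B, S, D) tensor, and unsqueeze_dim=1 inserts the head axis so it broadcasts against the (B, NH, S, D) query and key states without any transposes, which is what made the removed fused branch's layout shuffling unnecessary here.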