eunhwanpark-motiftech committed on
Commit 8bdf2ec · verified · 1 Parent(s): c8cabde

Update modeling_motif.py

Files changed (1):
  modeling_motif.py  +1 -27
modeling_motif.py CHANGED
@@ -34,8 +34,7 @@ from transformers.activations import ClassInstantier
 class PolyNorm(torch.nn.Module):
     """
     A trainable activation function introduced in https://arxiv.org/html/2411.03884v1.
-    The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md,
-    with the change `* torch.rsqrt` => `/ torch.sqrt` for potential MAF incompatibility.
+    The code is copied from https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md
     """
 
     def __init__(self, eps=1e-6):
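
For readers following the docstring change above: a minimal sketch of what PolyNorm computes, based on the PolyCom reference repository the docstring links to (not Motif's in-tree copy; parameter names and the exact normalization are assumptions). The `_norm` line is where the dropped `* torch.rsqrt` => `/ torch.sqrt` note applied.

```python
import torch


class PolyNormSketch(torch.nn.Module):
    """Sketch of PolyNorm (arXiv:2411.03884): a learned mix of normalized powers of x."""

    def __init__(self, eps=1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(3) / 3)
        self.bias = torch.nn.Parameter(torch.zeros(1))
        self.eps = eps

    def _norm(self, x):
        # The removed docstring note referred to this step:
        # `x * torch.rsqrt(...)` in the reference repo vs. `x / torch.sqrt(...)` here.
        return x / torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)

    def forward(self, x):
        return (self.weight[0] * self._norm(x ** 3)
                + self.weight[1] * self._norm(x ** 2)
                + self.weight[2] * self._norm(x)
                + self.bias)
```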
@@ -117,7 +116,6 @@ class MotifRotaryEmbeddingWithCache(nn.Module):
         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
-        # Build here to make `torch.jit.trace` work.
         self._set_cos_sin_cache(seq_len=max_position_embeddings,
                                 device=self.inv_freq.device,
                                 dtype=torch.get_default_dtype())
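
`_set_cos_sin_cache` itself is outside this diff; as context, a rotary-embedding cache builder of this kind typically looks like the sketch below (the common Hugging Face pattern, assumed rather than taken from Motif's code).

```python
import torch

def _set_cos_sin_cache(self, seq_len, device, dtype):
    # Precompute cos/sin tables for positions [0, seq_len) from the `inv_freq` buffer.
    self.max_seq_len_cached = seq_len
    t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
    freqs = torch.outer(t, self.inv_freq)    # (seq_len, dim // 2)
    emb = torch.cat((freqs, freqs), dim=-1)  # (seq_len, dim)
    self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
    self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
```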
@@ -173,7 +171,6 @@ class MotifRotaryEmbedding(nn.Module):
             self.max_seq_len_cached = max_position_embeddings
             self.original_max_seq_len = max_position_embeddings
         else:
-            # BC: "rope_type" was originally "type"
             if config.rope_scaling is not None:
                 self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
             else:
@@ -364,18 +361,15 @@ class MotifAttention(nn.Module):
         self.num_key_value_heads //= 2
         self.n_rep = self.num_heads // self.num_key_value_heads
 
-        # re-init projections
         self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
         self.k_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
         self.v_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
         self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
 
-        # init lambdas
         for name in ["lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"]:
             setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
             getattr(self, name).data.normal_(mean=0.0, std=0.1)
 
-        # Uses same norm as motif norm, without elementwise_affine option
         self.subln = MotifRMSNorm(2 * self.head_dim, eps=1e-5)
         self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
 
@@ -400,8 +394,6 @@ class MotifAttention(nn.Module):
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
 
-        ## bsz, seq, n_heads, head_dim
-
         query_states = query_states.view(bsz, q_len, 2 * self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, 2 * self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, 2 * self.head_dim).transpose(1, 2)
@@ -428,11 +420,9 @@ class MotifAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        # repeat k/v heads if n_kv_heads < n_heads
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-        ## bsz, #haead, q_len, head_dim -> bsz, head, q_len, q_len
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
         kv_seq_len = key_states.shape[-2]
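
`repeat_kv` (called in the hunk above) is the usual grouped-query-attention helper that tiles the K/V heads to match the query heads; a sketch of the standard Llama-style implementation, assumed to match what this file uses:

```python
import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (batch, num_kv_heads, seq, head_dim) -> (batch, num_kv_heads * n_rep, seq, head_dim)
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)
```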
@@ -442,24 +432,19 @@ class MotifAttention(nn.Module):
                 torch.full((q_len, kv_seq_len), float("-inf"), dtype=attn_weights.dtype, device=attn_weights.device),
                 1 + offset)
 
-            ###add attn
             attn_weights = attn_weights + attention_mask
 
-        # upcast attention to fp32
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
 
-        # differential transformer lambdas
         lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(attn_weights)
         lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(attn_weights)
         lambda_full = lambda_1 - lambda_2 + self.lambda_init
         attn_weights = attn_weights.view(bsz, self.num_heads, 2, q_len, -1)
         attn_weights = attn_weights[:, :, 0] - lambda_full * attn_weights[:, :, 1]
 
-        ##shape : bsz, #heads, seq, head_dim
         attn_output = torch.matmul(attn_weights, value_states)
 
-
         attn_output = self.subln(attn_output)
         attn_output = attn_output * (1 - self.lambda_init)
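
The lambda arithmetic in this hunk is the differential-attention recombination: each head carries two softmax maps, and the second is subtracted with a learned, layer-dependent weight. A self-contained sketch with illustrative shapes (random values, hypothetical sizes):

```python
import math
import torch

bsz, num_heads, q_len, kv_len, head_dim = 1, 4, 8, 8, 16
layer_idx = 1

# Two stacked attention maps per head (hence 2 * num_heads), already softmaxed.
attn_weights = torch.softmax(torch.randn(bsz, 2 * num_heads, q_len, kv_len), dim=-1)

# Learned lambda vectors (initialized with std=0.1 in the diff) and the layer-dependent init.
lambda_q1, lambda_k1, lambda_q2, lambda_k2 = (torch.randn(head_dim) * 0.1 for _ in range(4))
lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
lambda_full = (torch.exp((lambda_q1 * lambda_k1).sum())
               - torch.exp((lambda_q2 * lambda_k2).sum())
               + lambda_init)

# Split the doubled head axis and subtract the second map, scaled by lambda_full.
attn_weights = attn_weights.view(bsz, num_heads, 2, q_len, kv_len)
diff_attn = attn_weights[:, :, 0] - lambda_full * attn_weights[:, :, 1]
print(diff_attn.shape)  # torch.Size([1, 4, 8, 8])
```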
 
@@ -487,10 +472,8 @@ class MotifFlashAttention2(MotifAttention):
     config.max_window_layers layers.
     """
 
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
         # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
@@ -572,7 +555,6 @@ class MotifFlashAttention2(MotifAttention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        # repeat k/v heads if n_kv_heads < n_heads
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
         dropout_rate = 0.0 if not self.training else self.attention_dropout
@@ -665,7 +647,6 @@ class MotifSdpaAttention(MotifAttention):
     SDPA API.
     """
 
-    # Adapted from MotifAttention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -678,7 +659,6 @@ class MotifSdpaAttention(MotifAttention):
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if output_attentions:
-            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
                 "MotifModel is using MotifSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
@@ -882,7 +862,6 @@ class MotifPreTrainedModel(PreTrainedModel):
         elif isinstance(module, nn.Embedding):
             module.weight.data.normal_(mean=0.0, std=module_std)
             module.weight.data = torch.where(abs(module.weight.data) > module_std*3, 0, module.weight.data)
-            #torch.nn.init.trunc_normal_(module.weight.data, mean=0.0, std=module_std, a=-3*module_std, b=3*module_std)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
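
The commented-out call being deleted here was the library alternative to the clamp kept above: the kept code draws normally and zeroes weights beyond three standard deviations, while `trunc_normal_` resamples inside the bounds. For comparison (illustrative shapes and std, not the model's real values):

```python
import torch

module_std = 0.02                 # illustrative; the real value comes from the config
weight = torch.empty(1000, 64)

# Resampling-based truncated normal, as in the deleted comment.
torch.nn.init.trunc_normal_(weight, mean=0.0, std=module_std,
                            a=-3 * module_std, b=3 * module_std)
```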
 
@@ -1048,7 +1027,6 @@ class MotifModel(MotifPreTrainedModel):
                 "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
             use_cache = False
 
-        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
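
The `return_legacy_cache` branch above handles tuple-style `past_key_values`; in current Transformers that conversion is normally done with `DynamicCache`. A small sketch using the standard `transformers` API (tensor shapes are hypothetical):

```python
import torch
from transformers import DynamicCache

# One layer's (key, value) pair with shape (batch, kv_heads, seq, head_dim).
legacy_past = ((torch.zeros(1, 2, 4, 8), torch.zeros(1, 2, 4, 8)),)

cache = DynamicCache.from_legacy_cache(legacy_past)  # wrap for the forward pass
roundtrip = cache.to_legacy_cache()                  # convert back before returning
assert roundtrip[0][0].shape == legacy_past[0][0].shape
```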
@@ -1077,10 +1055,8 @@ class MotifModel(MotifPreTrainedModel):
 
         hidden_states = inputs_embeds
         bsz, q_len, _ = hidden_states.size()
-        # create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, seq_len=q_len)
 
-        # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
@@ -1123,7 +1099,6 @@ class MotifModel(MotifPreTrainedModel):
 
         hidden_states = self.norm(hidden_states)
 
-        # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states, )
 
@@ -1289,7 +1264,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
         self.post_init()
 
         if getattr(config, "tie_word_embeddings", True):
-            logger.info('tie embeddings')
             self.tie_weights()
 
     def get_input_embeddings(self):
 