Update modeling_motif.py
modeling_motif.py (+3 -12)
@@ -558,7 +558,7 @@ class MotifAttention(nn.Module):
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

-        attn_output = self.o_proj(attn_output)
+        attn_output = self.o_proj(attn_output)

         if not output_attentions:
             attn_weights = None
@@ -1285,7 +1285,7 @@ class MotifModel(MotifPreTrainedModel):
                 all_self_attns += (layer_outputs[1], )

         # <|_2_|>
-        hidden_states = self.norm(hidden_states)
+        hidden_states = self.norm(hidden_states)

         # add hidden states from the last decoder layer
         if output_hidden_states:
@@ -1461,15 +1461,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
         # Initialize weights and apply final processing
         self.post_init()

-        # <|_3_|>
-        if config.muP:
-            self.lm_head.__do_scale_tager_mu_dim_base_model__=True
-
-        # <|_4_|>
-        self.lm_head_alpha = 1
-        if config.wesar_weights:
-            self.lm_head_alpha = nn.Parameter(torch.tensor(1).float())
-
         if getattr(config, "tie_word_embeddings", True):
             logger.info('tie embeddings')
             self.tie_weights()
@@ -1676,7 +1667,7 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
             num_logits_to_keep=num_logits_to_keep)

         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        hidden_states = hidden_states
+        hidden_states = hidden_states
         logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
         logits = logits.float()
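The block dropped from MotifForCausalLM.__init__ set a muP scaling flag on lm_head and, under config.wesar_weights, created lm_head_alpha as a trainable scalar initialized to 1.0. Below is a minimal, self-contained sketch of that learnable-scalar pattern; the ScaledLMHead module and the way the scalar is applied in forward are assumptions for illustration, not the model's actual code.

import torch
import torch.nn as nn

class ScaledLMHead(nn.Module):
    """Illustrative sketch only (hypothetical module, not MotifForCausalLM):
    an output head with an optional trainable scalar, mirroring the
    `lm_head_alpha = nn.Parameter(torch.tensor(1).float())` line removed here."""

    def __init__(self, hidden_size: int, vocab_size: int, learnable_alpha: bool = False):
        super().__init__()
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        # Trainable scalar initialized to 1.0 when enabled, plain constant otherwise.
        self.lm_head_alpha = nn.Parameter(torch.tensor(1.0)) if learnable_alpha else 1.0

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Assumed usage: scale the final hidden states before projecting to logits.
        return self.lm_head(hidden_states * self.lm_head_alpha)

head = ScaledLMHead(hidden_size=16, vocab_size=32, learnable_alpha=True)
print(head(torch.randn(2, 5, 16)).shape)  # torch.Size([2, 5, 32])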
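For reference, the unchanged line just below the edited one, logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]), computes logits only for the last num_logits_to_keep positions. A small standalone sketch of that slicing follows; the shapes and names are chosen here for illustration.

import torch
import torch.nn as nn

hidden_size, vocab_size = 16, 32
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

hidden_states = torch.randn(2, 7, hidden_size)  # (batch, seq_len, hidden)
num_logits_to_keep = 1  # during generation only the last position is needed

# Slice before the projection so logits are computed just for the kept positions.
logits = lm_head(hidden_states[:, -num_logits_to_keep:, :])
print(logits.shape)  # torch.Size([2, 1, 32])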