Update modeling_motif.py
modeling_motif.py  CHANGED  (+2 -13)
@@ -839,7 +839,7 @@ MOTIF_ATTENTION_CLASSES = {
 
 class MotifDecoderLayer(nn.Module):
 
-    def __init__(self, config: MotifConfig,
+    def __init__(self, config: MotifConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         if config.use_moreh_attention:
@@ -853,10 +853,6 @@ class MotifDecoderLayer(nn.Module):
         else:
             self.self_attn = MOTIF_ATTENTION_CLASSES["eager"](config, layer_idx)
         self.mlp = MotifMLP(config)
-        ### moe
-        self.moe = None
-        if moe_layer:
-            self.moe = MotifMoE(config)
 
         RMSNorm = MorehRMSNorm if MorehRMSNorm is not None else MotifRMSNorm
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
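Taken together, the first two hunks drop the per-layer MoE wiring from MotifDecoderLayer.__init__: the constructor no longer takes the moe_layer flag referenced by the removed "if moe_layer:" check, and it no longer creates the optional self.moe = MotifMoE(config) member. Below is a minimal runnable sketch of the resulting constructor shape; ToyAttention, ToyMLP and nn.LayerNorm are illustrative stand-ins for the Motif attention classes, MotifMLP and the RMSNorm variants, not the real implementations.

import torch.nn as nn

class ToyAttention(nn.Module):
    """Stand-in for the Motif attention classes; a single linear map here."""
    def __init__(self, hidden_size: int, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        return self.proj(x)

class ToyMLP(nn.Module):
    """Stand-in for MotifMLP; a plain two-layer feed-forward block."""
    def __init__(self, hidden_size: int):
        super().__init__()
        self.up = nn.Linear(hidden_size, 4 * hidden_size)
        self.down = nn.Linear(4 * hidden_size, hidden_size)

    def forward(self, x):
        return self.down(nn.functional.silu(self.up(x)))

class ToyDecoderLayer(nn.Module):
    # After this commit: no moe_layer argument and no self.moe attribute;
    # each layer owns one attention block, one dense MLP, and its norms.
    def __init__(self, hidden_size: int, layer_idx: int, rms_norm_eps: float = 1e-6):
        super().__init__()
        self.hidden_size = hidden_size
        self.self_attn = ToyAttention(hidden_size, layer_idx)
        self.mlp = ToyMLP(hidden_size)
        self.input_layernorm = nn.LayerNorm(hidden_size, eps=rms_norm_eps)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size, eps=rms_norm_eps)

layer = ToyDecoderLayer(hidden_size=16, layer_idx=0)
print(layer)  # every sublayer is dense; no MoE module appears in the tree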
@@ -927,13 +923,7 @@ class MotifDecoderLayer(nn.Module):
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states) * self.post_attention_layernorm_alpha
 
-
-            hidden_states, identity = self.moe(hidden_states)
-            ## add output of shared expert and output of small moe experts.
-            ## hidden state must be zero tensor (for first forward)
-            hidden_states += self.mlp(identity)
-        else:
-            hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states)
 
         hidden_states = residual + hidden_states
 
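The third hunk replaces the MoE branch in the post-attention block with the plain dense path. Per the removed lines, MoE layers ran hidden_states through self.moe, which returned the routed experts' output together with an identity tensor, and then added self.mlp(identity) on top (apparently the shared-expert output, going by the removed comments), while non-MoE layers fell through to self.mlp alone; after this change every layer takes the dense path. A runnable sketch of the simplified residual block follows, with nn.LayerNorm and a toy MLP standing in for the model's RMSNorm variant and MotifMLP, and with hidden_size, the tensor shape and post_attention_layernorm_alpha reduced to toy values rather than anything read from the real MotifConfig.

import torch
import torch.nn as nn

hidden_size = 16

class ToyMLP(nn.Module):
    """Stand-in for MotifMLP; a plain two-layer feed-forward block."""
    def __init__(self, hidden_size: int):
        super().__init__()
        self.up = nn.Linear(hidden_size, 4 * hidden_size)
        self.down = nn.Linear(4 * hidden_size, hidden_size)

    def forward(self, x):
        return self.down(nn.functional.silu(self.up(x)))

mlp = ToyMLP(hidden_size)
post_attention_layernorm = nn.LayerNorm(hidden_size)
post_attention_layernorm_alpha = 1.0

hidden_states = torch.randn(2, 5, hidden_size)  # (batch, seq_len, hidden)

# Dense path only: norm, scale, MLP, residual add. No routing and no shared-expert sum.
residual = hidden_states
hidden_states = post_attention_layernorm(hidden_states) * post_attention_layernorm_alpha
hidden_states = mlp(hidden_states)
hidden_states = residual + hidden_states
print(hidden_states.shape)  # torch.Size([2, 5, 16])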
@@ -1114,7 +1104,6 @@ class MotifModel(MotifPreTrainedModel):
 
         num_hidden_layers = config.num_hidden_layers if self.multi_token_heads is None else config.num_hidden_layers - 1
 
-        logger.info(f'current_moe layer { moe_layer }')
         self.layers = nn.ModuleList([
             MotifDecoderLayer(config = config, layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)
         ])
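The final hunk removes a leftover debug log in MotifModel that printed a moe_layer value while the layer stack was being built; the stack construction itself is unchanged, and every layer is now created with the same (config, layer_idx) call. A toy illustration of that uniform construction, with ToyConfig and ToyLayer as stand-ins for MotifConfig and MotifDecoderLayer:

import torch.nn as nn

class ToyConfig:
    # Illustrative stand-in for MotifConfig with only the fields this sketch needs.
    hidden_size = 16
    num_hidden_layers = 4

class ToyLayer(nn.Module):
    def __init__(self, config, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)

config = ToyConfig()
# Same construction pattern as the diff: one call shape for every layer, no per-layer flag.
layers = nn.ModuleList([ToyLayer(config=config, layer_idx=layer_idx)
                        for layer_idx in range(config.num_hidden_layers)])
print(len(layers))  # 4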