leejunhyeok committed · Commit 7d2479a · verified · 1 Parent(s): 7ec81c6

Update configuration_motif.py

Files changed (1)
  1. configuration_motif.py +1 -75
configuration_motif.py CHANGED
@@ -134,7 +134,6 @@ class MotifConfig(PretrainedConfig):
         sliding_window=4096,
         max_window_layers=28,
         attention_dropout=0.0,
-        multi_token_heads: Optional[int] = None,
         **kwargs,
     ):
         """
@@ -165,87 +164,14 @@ class MotifConfig(PretrainedConfig):
         self.rope_scaling = rope_scaling
         self.attention_dropout = attention_dropout
 
-        ###kwargs
-
-        # some scale factors
-
-        self.scale_emb = getattr(kwargs, "scale_emb", 1)
-        self.init_scale_o = getattr(kwargs, "init_scale_o", 1)
-
-        # muparam
-        self.hidden_states_shrink = 1 / math.sqrt(num_hidden_layers)
-        self.dim_model_base = hidden_size
-        self.dim_model_base_attn = (hidden_size // num_attention_heads)
-        self.dim_model_base_init = hidden_size
-        self.dim_model_base_lr = getattr(kwargs, "dim_model_base_lr", hidden_size//8)
-        self.dim_model_base_lmh = 1
-        self.dim_model_base_logits = hidden_size
-
-        self.muP = getattr(kwargs, "muP", False)
-        # proxy hidden size ( following YuLan-Mini )
-        # reparameterization(wesar_weights)
-        logger.info(kwargs)
-        self.wesar_weights = getattr(kwargs, "wesar_weights", False)
-        logger.info(f'initial wesar reparameterization : {self.wesar_weights}')
-
-        # alpha (scale factor)
-        self.embed_tokens_alpha = getattr(kwargs, "embed_tokens_alpha", None)
-        self.q_proj_alpha = getattr(kwargs, "q_proj_alpha", None)
-        self.k_proj_alpha = getattr(kwargs, "k_proj_alpha", None)
-        self.v_proj_alpha = getattr(kwargs, "v_proj_alpha", None)
-        self.o_proj_alpha = getattr(kwargs, "o_proj_alpha", None)
-        self.down_proj_alpha = getattr(kwargs, "down_proj_alpha", None)
-        self.gate_up_proj_alpha = getattr(kwargs, "gate_up_proj_alpha", None)
-        self.input_layernorm_alpha = getattr(kwargs, "input_layernorm_alpha", None)
-        self.post_attention_layernorm_alpha = getattr(kwargs, "post_attention_layernorm_alpha", None)
-        self.norm_alpha = getattr(kwargs, "norm_alpha", None)
-        self.lm_head_alpha = getattr(kwargs, "lm_head_alpha", None)
-        self.use_norm_alpha = getattr(kwargs, "use_norm_alpha", False)
-        self.use_emb_alpha = getattr(kwargs, "use_emb_alpha", False)
-
         # Validate the correctness of rotary position embeddings parameters
         # BC: if there is a 'type' field, move it to 'rope_type'.
         if self.rope_scaling is not None and "type" in self.rope_scaling:
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         rope_config_validation(self)
-
-        self.multi_token_heads = multi_token_heads
-        self.multi_token_config_validation()
-
-
-
-        # moe
-        self.topk_method = getattr(kwargs, "topk_method", None)
-        self.scoring_func = getattr(kwargs, "scoring_func", None)
-        self.routed_scaling_factor = getattr(kwargs, "routed_scaling_factor", None)
-        self.norm_topk_prob = getattr(kwargs, "norm_topk_prob", None)
-        self.seq_aux = getattr(kwargs, "seq_aux", None)
-        self.hidden_act_moe = getattr(kwargs, "hidden_act_moe", None)
-
-
-        self.n_group = getattr(kwargs, "n_group", None)
-        self.n_routed_experts = getattr(kwargs, "n_routed_experts", None)
-        self.moe_intermediate_size = getattr(kwargs, "moe_intermediate_size", None)
-        self.topk_group = getattr(kwargs, "topk_group", None)
-
-
-        self.use_fused_mlp = getattr(kwargs, "use_fused_mlp", None)
-        self.use_moreh_moe = getattr(kwargs, "use_moreh_moe", False)
-        self.continual_training = getattr(kwargs, "continual_training", False)
-
-        # external
-        self.first_expansion = getattr(kwargs, "first_expansion", False)
-        self.moe_layer = getattr(kwargs, "moe_layer", False)
-
-
-
+
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
            **kwargs,
         )
         logger.info(f' kwargs : {kwargs}')
-        logger.info(f'after wesar reparameterization : {self.wesar_weights}')
-
-    def multi_token_config_validation(self):
-        if self.multi_token_heads is not None:
-            assert isinstance(self.multi_token_heads, int) and self.multi_token_heads >= 1
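
Note that the removed block read its extra hyperparameters with getattr(kwargs, "...", default). Since **kwargs arrives as a plain dict and dict keys are not attributes, getattr never finds those keys and always returns the default, whereas kwargs.get(...) performs the intended lookup. A minimal sketch (illustrative only, not part of the repository):

    # getattr() looks for object attributes, not dict keys, so it falls back to
    # the default; dict.get() actually reads the key.
    kwargs = {"wesar_weights": True, "scale_emb": 12}

    print(getattr(kwargs, "wesar_weights", False))  # False -- key is ignored
    print(kwargs.get("wesar_weights", False))       # True  -- dict lookup finds the key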
 
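
After this commit, multi_token_heads is no longer a named argument of MotifConfig.__init__, and the muP/wesar/MoE attributes are no longer set there; any such keys simply fall into **kwargs and are forwarded to PretrainedConfig. A hypothetical usage sketch, assuming configuration_motif.py is importable from the working directory and using the default values shown in the diff context:

    from configuration_motif import MotifConfig

    # Construct the slimmed-down config; multi_token_heads is no longer an
    # explicit parameter, so only the remaining arguments are passed.
    config = MotifConfig(
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
    )
    print(config)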