Update configuration_motif.py
configuration_motif.py  CHANGED  (+1, -75)
@@ -134,7 +134,6 @@ class MotifConfig(PretrainedConfig):
         sliding_window=4096,
         max_window_layers=28,
         attention_dropout=0.0,
-        multi_token_heads: Optional[int] = None,
         **kwargs,
     ):
         """
@@ -165,87 +164,14 @@
         self.rope_scaling = rope_scaling
         self.attention_dropout = attention_dropout

-        ###kwargs
-
-        # some scale factors
-
-        self.scale_emb = getattr(kwargs, "scale_emb", 1)
-        self.init_scale_o = getattr(kwargs, "init_scale_o", 1)
-
-        # muparam
-        self.hidden_states_shrink = 1 / math.sqrt(num_hidden_layers)
-        self.dim_model_base = hidden_size
-        self.dim_model_base_attn = (hidden_size // num_attention_heads)
-        self.dim_model_base_init = hidden_size
-        self.dim_model_base_lr = getattr(kwargs, "dim_model_base_lr", hidden_size//8)
-        self.dim_model_base_lmh = 1
-        self.dim_model_base_logits = hidden_size
-
-        self.muP = getattr(kwargs, "muP", False)
-        # proxy hidden size ( following YuLan-Mini )
-        # reparameterization(wesar_weights)
-        logger.info(kwargs)
-        self.wesar_weights = getattr(kwargs, "wesar_weights", False)
-        logger.info(f'initial wesar reparameterization : {self.wesar_weights}')
-
-        # alpha (scale factor)
-        self.embed_tokens_alpha = getattr(kwargs, "embed_tokens_alpha", None)
-        self.q_proj_alpha = getattr(kwargs, "q_proj_alpha", None)
-        self.k_proj_alpha = getattr(kwargs, "k_proj_alpha", None)
-        self.v_proj_alpha = getattr(kwargs, "v_proj_alpha", None)
-        self.o_proj_alpha = getattr(kwargs, "o_proj_alpha", None)
-        self.down_proj_alpha = getattr(kwargs, "down_proj_alpha", None)
-        self.gate_up_proj_alpha = getattr(kwargs, "gate_up_proj_alpha", None)
-        self.input_layernorm_alpha = getattr(kwargs, "input_layernorm_alpha", None)
-        self.post_attention_layernorm_alpha = getattr(kwargs, "post_attention_layernorm_alpha", None)
-        self.norm_alpha = getattr(kwargs, "norm_alpha", None)
-        self.lm_head_alpha = getattr(kwargs, "lm_head_alpha", None)
-        self.use_norm_alpha = getattr(kwargs, "use_norm_alpha", False)
-        self.use_emb_alpha = getattr(kwargs, "use_emb_alpha", False)
-
         # Validate the correctness of rotary position embeddings parameters
         # BC: if there is a 'type' field, move it to 'rope_type'.
         if self.rope_scaling is not None and "type" in self.rope_scaling:
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         rope_config_validation(self)
-
-        self.multi_token_heads = multi_token_heads
-        self.multi_token_config_validation()
-
-
-
-        # moe
-        self.topk_method = getattr(kwargs, "topk_method", None)
-        self.scoring_func = getattr(kwargs, "scoring_func", None)
-        self.routed_scaling_factor = getattr(kwargs, "routed_scaling_factor", None)
-        self.norm_topk_prob = getattr(kwargs, "norm_topk_prob", None)
-        self.seq_aux = getattr(kwargs, "seq_aux", None)
-        self.hidden_act_moe = getattr(kwargs, "hidden_act_moe", None)
-
-
-        self.n_group = getattr(kwargs, "n_group", None)
-        self.n_routed_experts = getattr(kwargs, "n_routed_experts", None)
-        self.moe_intermediate_size = getattr(kwargs, "moe_intermediate_size", None)
-        self.topk_group = getattr(kwargs, "topk_group", None)
-
-
-        self.use_fused_mlp = getattr(kwargs, "use_fused_mlp", None)
-        self.use_moreh_moe = getattr(kwargs, "use_moreh_moe", False)
-        self.continual_training = getattr(kwargs, "continual_training", False)
-
-        # external
-        self.first_expansion = getattr(kwargs, "first_expansion", False)
-        self.moe_layer = getattr(kwargs, "moe_layer", False)
-
-
-
+
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
         logger.info(f' kwargs : {kwargs}')
-        logger.info(f'after wesar reparameterization : {self.wesar_weights}')
-
-    def multi_token_config_validation(self):
-        if self.multi_token_heads is not None:
-            assert isinstance(self.multi_token_heads, int) and self.multi_token_heads >= 1
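A side note on the removed `getattr(kwargs, ...)` lines: `getattr` looks up an attribute on an object, not a key in a dict, so `getattr(kwargs, "scale_emb", 1)` always returns the default `1` no matter what the caller passed in `**kwargs`; `kwargs.get("scale_emb", 1)` is the dict-based lookup. A minimal standalone illustration (not taken from the repository, only mirroring the pattern seen in the removed lines):

```python
# Minimal illustration: getattr vs. dict.get on **kwargs.
def init(**kwargs):
    # kwargs is a plain dict with no attribute named "scale_emb",
    # so getattr always falls back to the default value.
    via_getattr = getattr(kwargs, "scale_emb", 1)
    # Dict-style lookup actually sees the value passed by the caller.
    via_get = kwargs.get("scale_emb", 1)
    return via_getattr, via_get

print(init(scale_emb=12))  # (1, 12)
```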
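For reference, a hedged sketch of constructing the config after this commit; the local module path and the assumption that the remaining parameters keep the defaults shown in the diff are mine, not stated anywhere in the change. With `multi_token_heads` dropped from the signature and the extra `self.*` assignments removed, `__init__` no longer sets those names itself; anything still passed via `**kwargs` simply flows into `super().__init__(**kwargs)` as shown in the diff.

```python
# Hedged sketch, assuming configuration_motif.py is importable from the
# working directory and its remaining defaults match the diff above.
from configuration_motif import MotifConfig

cfg = MotifConfig(
    sliding_window=4096,
    max_window_layers=28,
    attention_dropout=0.0,
)

# After this change __init__ no longer defines these attributes itself:
print(hasattr(cfg, "multi_token_heads"))  # expected: False
print(hasattr(cfg, "wesar_weights"))      # expected: False

# The rope_scaling 'type' -> 'rope_type' back-compat rename and
# rope_config_validation(self) are unchanged by this commit.
print(cfg.rope_scaling)
```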