[hotfix] update ffn dim
- configuration_grok1.py (+2 -2)
- modeling_grok1.py (+3 -4)
configuration_grok1.py CHANGED

@@ -9,7 +9,7 @@ class Grok1Config(PretrainedConfig):
         self,
         vocab_size=32000,
         hidden_size=4096,
-        widening_factor=...,
+        intermediate_size=32768,
         num_hidden_layers=32,
         num_attention_heads=32,
         num_key_value_heads=32,
@@ -37,7 +37,7 @@ class Grok1Config(PretrainedConfig):
         self.embedding_multiplier_scale = embedding_multiplier_scale
         self.output_multiplier_scale = output_multiplier_scale
         self.hidden_size = hidden_size
-        self.widening_factor = widening_factor
+        self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
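As a quick illustration (not part of the commit): the FFN width is now stored directly on the config as intermediate_size instead of being recovered later from a widening factor. A minimal sketch, assuming configuration_grok1.py and its transformers dependency are importable; the values are the defaults shown in the diff.

# Minimal sketch; assumes configuration_grok1.py (and `transformers`) is on the path.
from configuration_grok1 import Grok1Config

# The FFN width is now an explicit config field rather than something
# derived from hidden_size at model-build time.
config = Grok1Config(hidden_size=4096, intermediate_size=32768)
print(config.hidden_size, config.intermediate_size)  # 4096 32768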
modeling_grok1.py CHANGED

@@ -395,11 +395,11 @@ class DecoderLayer(nn.Module):
     def __init__(
         self,
         hidden_size: int,
+        intermediate_size: int,
         num_heads: int,
         num_key_value_heads: int,
         num_experts: int,
         top_k: int,
-        widening_factor: float = 4.0,
         max_position_embeddings: int = 2048,
         attn_output_multiplier: float = 1.0,
         max_attn_val: float = 30.0,
@@ -414,8 +414,7 @@ class DecoderLayer(nn.Module):
             attn_output_multiplier=attn_output_multiplier,
             max_attn_val=max_attn_val,
         )
-
-        self.moe_block = MoeBlock(hidden_size, ffn_dim, num_experts, top_k)
+        self.moe_block = MoeBlock(hidden_size, intermediate_size, num_experts, top_k)
         self.pre_attn_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.post_attn_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.pre_moe_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
@@ -543,11 +542,11 @@ class Grok1Model(Grok1PretrainedModel):
             [
                 DecoderLayer(
                     hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
                     num_heads=config.num_attention_heads,
                     num_key_value_heads=config.num_key_value_heads,
                     num_experts=config.num_experts,
                     top_k=config.num_experts_per_tok,
-                    widening_factor=config.widening_factor,
                     max_position_embeddings=config.max_position_embeddings,
                     attn_output_multiplier=config.attn_output_multiplier,
                     max_attn_val=config.max_attn_value,
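Taken together, the two files now thread the FFN width through explicitly: Grok1Model reads config.intermediate_size and passes it via DecoderLayer into MoeBlock, whereas the old code passed MoeBlock an ffn_dim value that does not appear in the lines shown and dropped the widening_factor argument unused. A small standalone sketch of the dimension bookkeeping follows; the widening-factor arithmetic is an assumption for comparison, and only the 32768 value comes from this commit.

# Standalone sketch of the before/after FFN sizing; pure Python, no repo needed.
hidden_size = 4096         # default from configuration_grok1.py
intermediate_size = 32768  # new explicit default introduced by this commit

# Old route (assumed): DecoderLayer took widening_factor and the expert FFN
# width was meant to be derived from it, e.g. hidden_size * widening_factor.
widening_factor = 4.0      # old DecoderLayer default shown in the diff
derived_ffn_dim = int(hidden_size * widening_factor)  # 16384

# New route: the width is passed straight through, ending up in
#   MoeBlock(hidden_size, intermediate_size, num_experts, top_k)
print(derived_ffn_dim, intermediate_size)  # 16384 32768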