Deepseek V3

#93
by cybercyb - opened
Files changed (3)
  1. config.json +3 -0
  2. configuration_deepseek.py +11 -0
  3. modeling_deepseek.py +2 -1
config.json CHANGED
@@ -9,6 +9,7 @@
     "AutoModel": "modeling_deepseek.DeepseekV3Model",
     "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
   },
+  "aux_loss_alpha": 0.001,
   "bos_token_id": 0,
   "eos_token_id": 1,
   "ep_size": 1,
@@ -31,6 +32,7 @@
   "num_hidden_layers": 61,
   "num_key_value_heads": 128,
   "num_nextn_predict_layers": 1,
+  "pretraining_tp": 1,
   "q_lora_rank": 1536,
   "qk_nope_head_dim": 128,
   "qk_rope_head_dim": 64,
@@ -56,6 +58,7 @@
   "rope_theta": 10000,
   "routed_scaling_factor": 2.5,
   "scoring_func": "sigmoid",
+  "seq_aux": true,
   "tie_word_embeddings": false,
   "topk_group": 4,
   "topk_method": "noaux_tc",
configuration_deepseek.py CHANGED
@@ -82,6 +82,11 @@ class DeepseekV3Config(PretrainedConfig):
             Beginning of stream token id.
         eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 10000.0):
@@ -136,6 +141,8 @@ class DeepseekV3Config(PretrainedConfig):
         first_k_dense_replace = 3,
         norm_topk_prob = True,
         scoring_func = 'sigmoid',
+        aux_loss_alpha = 0.001,
+        seq_aux = True,
         hidden_act="silu",
         max_position_embeddings=4096,
         initializer_range=0.02,
@@ -144,6 +151,7 @@ class DeepseekV3Config(PretrainedConfig):
         pad_token_id=None,
         bos_token_id=0,
         eos_token_id=1,
+        pretraining_tp=1,
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
@@ -176,6 +184,8 @@ class DeepseekV3Config(PretrainedConfig):
         self.first_k_dense_replace = first_k_dense_replace
         self.norm_topk_prob = norm_topk_prob
         self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
         # for backward compatibility
         if num_key_value_heads is None:
             num_key_value_heads = num_attention_heads
@@ -184,6 +194,7 @@ class DeepseekV3Config(PretrainedConfig):
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
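
The same three settings are plumbed through `DeepseekV3Config.__init__` by the hunks above. A minimal usage sketch, assuming the patched `configuration_deepseek.py` is importable from the working directory and `transformers` is installed (the class subclasses `PretrainedConfig`); the inline comments describe the fields as they are commonly used in DeepSeek MoE code, not anything asserted by this diff:

```python
from configuration_deepseek import DeepseekV3Config

# Instantiate with the defaults introduced in this PR and read them back.
config = DeepseekV3Config(
    aux_loss_alpha=0.001,  # weight of the auxiliary load-balancing loss
    seq_aux=True,          # compute that auxiliary loss per sequence
    pretraining_tp=1,      # tensor-parallel degree used during pretraining
)
print(config.aux_loss_alpha, config.seq_aux, config.pretraining_tp)
# expected: 0.001 True 1
```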
modeling_deepseek.py CHANGED
@@ -398,6 +398,7 @@ class MoEGate(nn.Module):
         self.n_routed_experts = config.n_routed_experts
         self.routed_scaling_factor = config.routed_scaling_factor
         self.scoring_func = config.scoring_func
+        self.seq_aux = config.seq_aux
         self.topk_method = config.topk_method
         self.n_group = config.n_group
         self.topk_group = config.topk_group
@@ -454,7 +455,7 @@
                 )
                 .reshape(bsz * seq_len, -1)
             ) # [n, e]
-            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e]
+            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) # [n, e]
             _, topk_idx = torch.topk(
                 tmp_scores, k=self.top_k, dim=-1, sorted=False
             )
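
The modeling_deepseek.py change is effectively a one-line swap of the fill value used before the expert top-k in the `noaux_tc` branch, plus reading `config.seq_aux` in `__init__`. A standalone sketch with toy shapes (the tensors below are stand-ins for the `scores_for_choice` and `score_mask` built earlier in `MoEGate.forward`) shows what the masking plus top-k step does under each fill value:

```python
import torch

# Toy stand-ins: 2 tokens, 8 routed experts, sigmoid-style scores in (0, 1).
scores_for_choice = torch.rand(2, 8)
# Group mask expanded to expert granularity: only experts whose group
# survived the group-level top-k remain selectable.
score_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0],
                           [0, 0, 0, 0, 1, 1, 1, 1]], dtype=torch.float)

top_k = 2
for fill in (float("-inf"), 0.0):  # old vs. new fill value in this PR
    tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), fill)
    _, topk_idx = torch.topk(tmp_scores, k=top_k, dim=-1, sorted=False)
    print(f"fill={fill}: selected experts {topk_idx.tolist()}")
```

With strictly positive scores both variants select the same experts; they can only diverge if some unmasked scores are non-positive or if `top_k` exceeds the number of unmasked experts.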