Qubitium committed on
Commit
be8d061
·
verified ·
1 Parent(s): 0ed44b2

sync modeling_deepseek.py with upstream

Browse files
Files changed (1) hide show
  1. modeling_deepseek.py +2 -3
modeling_deepseek.py CHANGED
@@ -398,7 +398,6 @@ class MoEGate(nn.Module):
398
  self.n_routed_experts = config.n_routed_experts
399
  self.routed_scaling_factor = config.routed_scaling_factor
400
  self.scoring_func = config.scoring_func
401
- self.seq_aux = config.seq_aux
402
  self.topk_method = config.topk_method
403
  self.n_group = config.n_group
404
  self.topk_group = config.topk_group
@@ -455,7 +454,7 @@ class MoEGate(nn.Module):
455
  )
456
  .reshape(bsz * seq_len, -1)
457
  ) # [n, e]
458
- tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) # [n, e]
459
  _, topk_idx = torch.topk(
460
  tmp_scores, k=self.top_k, dim=-1, sorted=False
461
  )
@@ -1846,4 +1845,4 @@ class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
1846
  past_key_values=transformer_outputs.past_key_values,
1847
  hidden_states=transformer_outputs.hidden_states,
1848
  attentions=transformer_outputs.attentions,
1849
- )
 
398
  self.n_routed_experts = config.n_routed_experts
399
  self.routed_scaling_factor = config.routed_scaling_factor
400
  self.scoring_func = config.scoring_func
 
401
  self.topk_method = config.topk_method
402
  self.n_group = config.n_group
403
  self.topk_group = config.topk_group
 
454
  )
455
  .reshape(bsz * seq_len, -1)
456
  ) # [n, e]
457
+ tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e]
458
  _, topk_idx = torch.topk(
459
  tmp_scores, k=self.top_k, dim=-1, sorted=False
460
  )
 
1845
  past_key_values=transformer_outputs.past_key_values,
1846
  hidden_states=transformer_outputs.hidden_states,
1847
  attentions=transformer_outputs.attentions,
1848
+ )