SmallDoge
/

Doge-20M-MoE-Instruct-SFT

Question Answering

text-generation

Model card Files and versions

JingzeShi committed on Apr 17

Commit

ac7a5a6

·

verified ·

1 Parent(s): ebae748

Upload DogeForCausalLM

Files changed (3) hide show

config.json +1 -1
model.safetensors +1 -1
modeling_doge.py +3 -2

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "./data/Doge-20M-MoE-Instruct-SFT/checkpoint-7258",
   "architectures": [
     "DogeForCausalLM"
   ],

 {
+  "_name_or_path": "./data/Doge-20M-MoE-Instruct-SFT",
   "architectures": [
     "DogeForCausalLM"
   ],

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1bc235190b0ddce6cb8b0e6c1e86db0a6c7d5c4e4b23656024e6f6cfdb52221d
 size 69786512

 version https://git-lfs.github.com/spec/v1
+oid sha256:388975546a77884213c70f44574a087115a55456b28c53251d72954ff0214245
 size 69786512

modeling_doge.py CHANGED Viewed

@@ -502,12 +502,13 @@ class DogeCDMoE(DogeMLP):
         routing_weights = self.router_gate(hidden_states).view(2, bsz * seq_len, -1)
         # get experts with the highest routing weights
-        (scores_x, scores_y), (indices_x, indices_y) = [w.topk(self.num_keys, dim=-1) for w in routing_weights]
         all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
         all_indices = indices_x.unsqueeze(-1) * self.num_keys + indices_y.unsqueeze(-2)
         all_scores = all_scores.view(*all_scores.shape[:-2], -1)
         all_indices = all_indices.view(*all_indices.shape[:-2], -1)
-        scores, indices = all_scores.topk(self.top_k, dim=-1)
         down_embed = self.down_embed(indices)
         up_embed = self.up_embed(indices)

         routing_weights = self.router_gate(hidden_states).view(2, bsz * seq_len, -1)
         # get experts with the highest routing weights
+        (scores_x, scores_y), (indices_x, indices_y) = routing_weights.topk(self.num_keys, dim=-1)
         all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
         all_indices = indices_x.unsqueeze(-1) * self.num_keys + indices_y.unsqueeze(-2)
         all_scores = all_scores.view(*all_scores.shape[:-2], -1)
         all_indices = all_indices.view(*all_indices.shape[:-2], -1)
+        scores, position_indices = all_scores.topk(self.top_k, dim=-1)
+        indices = all_indices.gather(-1, position_indices)
         down_embed = self.down_embed(indices)
         up_embed = self.up_embed(indices)