jinliuxi committed (verified)
Commit bd95ca4 · 1 Parent(s): 596eb94

Upload DeepseekV3ForCausalLM

Files changed (3):
  1. config.json +29 -20
  2. generation_config.json +4 -4
  3. model.safetensors +2 -2
config.json CHANGED
@@ -1,49 +1,58 @@
 {
-  "_name_or_path": "deepseekmini_pretrain/checkpoint-611314",
+  "_name_or_path": "deepseekmini_pretrain/checkpoint-107000",
   "architectures": [
-    "DeepseekV2ForCausalLM"
+    "DeepseekV3ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "aux_loss_alpha": 0.001,
-  "bos_token_id": 1,
-  "eos_token_id": 2,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
   "ep_size": 1,
-  "first_k_dense_replace": 1,
+  "first_k_dense_replace": 3,
   "hidden_act": "silu",
   "hidden_size": 768,
   "initializer_range": 0.02,
   "intermediate_size": 2048,
-  "kv_lora_rank": 16,
-  "max_position_embeddings": 512,
-  "model_type": "deepseek_v2",
+  "kv_lora_rank": 128,
+  "max_position_embeddings": 4096,
+  "model_type": "deepseek_v3",
   "moe_intermediate_size": 512,
   "moe_layer_freq": 1,
-  "n_group": 2,
+  "n_group": 4,
   "n_routed_experts": 8,
-  "n_shared_experts": 2,
+  "n_shared_experts": 1,
   "norm_topk_prob": true,
   "num_attention_heads": 16,
-  "num_experts_per_tok": 2,
+  "num_experts_per_tok": 3,
   "num_hidden_layers": 8,
   "num_key_value_heads": 16,
-  "pad_token_id": 2,
+  "num_nextn_predict_layers": 1,
+  "pad_token_id": 1,
   "pretraining_tp": 1,
-  "q_lora_rank": 48,
+  "q_lora_rank": 256,
   "qk_nope_head_dim": 64,
   "qk_rope_head_dim": 32,
   "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
+  "rope_scaling": {
+    "beta_fast": 32,
+    "beta_slow": 1,
+    "factor": 40,
+    "mscale": 1.0,
+    "mscale_all_dim": 1.0,
+    "original_max_position_embeddings": 2048,
+    "type": "yarn"
+  },
   "rope_theta": 10000.0,
-  "routed_scaling_factor": 1.0,
-  "scoring_func": "softmax",
+  "routed_scaling_factor": 2.5,
+  "scoring_func": "sigmoid",
   "seq_aux": true,
   "tie_word_embeddings": false,
-  "topk_group": 1,
-  "topk_method": "gready",
+  "topk_group": 2,
+  "topk_method": "noaux_tc",
   "torch_dtype": "float32",
-  "transformers_version": "4.48.2",
+  "transformers_version": "4.48.3",
   "use_cache": true,
   "v_head_dim": 64,
-  "vocab_size": 8192
+  "vocab_size": 16384
 }
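
For orientation, a minimal sketch of loading the updated checkpoint with transformers. The repository path is a placeholder (not the real repo id), and trust_remote_code is assumed in case DeepseekV3ForCausalLM ships as custom modeling code rather than being built into transformers 4.48.3:

# Minimal loading sketch; "<repo-or-local-path>" is a placeholder for this
# repository's id or a local clone. trust_remote_code=True is an assumption,
# needed only if the architecture is provided as custom code in the repo.
from transformers import AutoConfig, AutoModelForCausalLM

path = "<repo-or-local-path>"  # placeholder
config = AutoConfig.from_pretrained(path, trust_remote_code=True)

# Sanity-check a few of the fields changed in this commit.
assert config.model_type == "deepseek_v3"
assert config.vocab_size == 16384
assert config.rope_scaling["type"] == "yarn"

model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)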
generation_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_from_model_config": true,
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "pad_token_id": 2,
-  "transformers_version": "4.48.2"
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 1,
+  "transformers_version": "4.48.3"
 }
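
The id changes above move bos/eos/pad to 0/1/1, which affects how generation stops and pads. A brief hedged check, reusing the same placeholder path (tokenizer files are assumed to be present in the repo):

# Hypothetical check that the new special-token ids are picked up from
# generation_config.json; path and tokenizer availability are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "<repo-or-local-path>"  # placeholder
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

assert model.generation_config.eos_token_id == 1
assert model.generation_config.pad_token_id == 1

inputs = tokenizer("Hello", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))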
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0f19643421a66b64d59b6ac5a6ad8c60c2d6a955c1340864247e2bb5ed9effb5
-size 430698776
+oid sha256:4931e079fdbaa22b0c91336079c7b281fc8ae648b4ad20d4664a157aec43ee0d
+size 426198272
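
Since the weights are stored via Git LFS, the new pointer only records the blob's sha256 and size. A small optional sketch for verifying a downloaded copy against the oid recorded above (the local file name model.safetensors is an assumption):

# Integrity check: the sha256 of the downloaded file should match the LFS oid
# in the new pointer above. File location is assumed to be the working directory.
import hashlib

expected = "4931e079fdbaa22b0c91336079c7b281fc8ae648b4ad20d4664a157aec43ee0d"

h = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print(h.hexdigest() == expected)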