danielhanchen commited on
Commit
f4457dd
·
verified ·
1 Parent(s): 6b6110b

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. LICENSE.DeepSeek +21 -0
  3. README.md +134 -0
  4. chat_template.jinja +1 -0
  5. config.json +69 -0
  6. configuration_deepseek.py +199 -0
  7. intelligence_score_vs_output_tokens.png +3 -0
  8. model-00001-of-000163.safetensors +3 -0
  9. model-00002-of-000163.safetensors +3 -0
  10. model-00003-of-000163.safetensors +3 -0
  11. model-00004-of-000163.safetensors +3 -0
  12. model-00005-of-000163.safetensors +3 -0
  13. model-00006-of-000163.safetensors +3 -0
  14. model-00007-of-000163.safetensors +3 -0
  15. model-00008-of-000163.safetensors +3 -0
  16. model-00009-of-000163.safetensors +3 -0
  17. model-00010-of-000163.safetensors +3 -0
  18. model-00011-of-000163.safetensors +3 -0
  19. model-00012-of-000163.safetensors +3 -0
  20. model-00013-of-000163.safetensors +3 -0
  21. model-00014-of-000163.safetensors +3 -0
  22. model-00015-of-000163.safetensors +3 -0
  23. model-00016-of-000163.safetensors +3 -0
  24. model-00017-of-000163.safetensors +3 -0
  25. model-00018-of-000163.safetensors +3 -0
  26. model-00019-of-000163.safetensors +3 -0
  27. model-00020-of-000163.safetensors +3 -0
  28. model-00021-of-000163.safetensors +3 -0
  29. model-00022-of-000163.safetensors +3 -0
  30. model-00023-of-000163.safetensors +3 -0
  31. model-00024-of-000163.safetensors +3 -0
  32. model-00025-of-000163.safetensors +3 -0
  33. model-00026-of-000163.safetensors +3 -0
  34. model-00027-of-000163.safetensors +3 -0
  35. model-00028-of-000163.safetensors +3 -0
  36. model-00029-of-000163.safetensors +3 -0
  37. model-00030-of-000163.safetensors +3 -0
  38. model-00031-of-000163.safetensors +3 -0
  39. model-00032-of-000163.safetensors +3 -0
  40. model-00033-of-000163.safetensors +3 -0
  41. model-00034-of-000163.safetensors +3 -0
  42. model-00035-of-000163.safetensors +3 -0
  43. model-00036-of-000163.safetensors +3 -0
  44. model-00037-of-000163.safetensors +3 -0
  45. model-00038-of-000163.safetensors +3 -0
  46. model-00039-of-000163.safetensors +3 -0
  47. model-00040-of-000163.safetensors +3 -0
  48. model-00041-of-000163.safetensors +3 -0
  49. model-00042-of-000163.safetensors +3 -0
  50. model-00043-of-000163.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ intelligence_score_vs_output_tokens.png filter=lfs diff=lfs merge=lfs -text
LICENSE.DeepSeek ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 DeepSeek
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - unsloth
4
+ - unsloth
5
+ license: mit
6
+ library_name: transformers
7
+ base_model:
8
+ - tngtech/DeepSeek-TNG-R1T2-Chimera-BF16
9
+ - deepseek-ai/DeepSeek-R1
10
+ - deepseek-ai/DeepSeek-R1-0528
11
+ pipeline_tag: text-generation
12
+ ---
13
+ <div>
14
+ <p style="margin-top: 0;margin-bottom: 0;">
15
+ <em><a href="https://docs.unsloth.ai/basics/unsloth-dynamic-v2.0-gguf">Unsloth Dynamic 2.0</a> achieves superior accuracy & outperforms other leading quants.</em>
16
+ </p>
17
+ <div style="display: flex; gap: 5px; align-items: center; ">
18
+ <a href="https://github.com/unslothai/unsloth/">
19
+ <img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="133">
20
+ </a>
21
+ <a href="https://discord.gg/unsloth">
22
+ <img src="https://github.com/unslothai/unsloth/raw/main/images/Discord%20button.png" width="173">
23
+ </a>
24
+ <a href="https://docs.unsloth.ai/">
25
+ <img src="https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation%20green%20button.png" width="143">
26
+ </a>
27
+ </div>
28
+ </div>
29
+
30
+ <div>
31
+ <p style="margin-top: 0;margin-bottom: 0;">
32
+ <em><a href="https://docs.unsloth.ai/basics/unsloth-dynamic-v2.0-gguf">Unsloth Dynamic 2.0</a> achieves superior accuracy & outperforms other leading quants.</em>
33
+ </p>
34
+ <div style="display: flex; gap: 5px; align-items: center; ">
35
+ <a href="https://github.com/unslothai/unsloth/">
36
+ <img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="133">
37
+ </a>
38
+ <a href="https://discord.gg/unsloth">
39
+ <img src="https://github.com/unslothai/unsloth/raw/main/images/Discord%20button.png" width="173">
40
+ </a>
41
+ <a href="https://docs.unsloth.ai/">
42
+ <img src="https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation%20green%20button.png" width="143">
43
+ </a>
44
+ </div>
45
+ </div>
46
+
47
+ # DeepSeek-TNG-R1T2-Chimera
48
+
49
+ <div align="center">
50
+ <img src="https://354918363417-runtime-assets.s3.eu-central-1.amazonaws.com/company_logo_light.svg"
51
+ alt="TNG Logo"
52
+ width="400"
53
+ style="display: inline-block; vertical-align: middle;"/>
54
+ </div>
55
+ <br>
56
+ <div align="center">
57
+ <a href="https://huggingface.co/tngtech/DeepSeek-TNG-R1T2-Chimera/blob/main/LICENSE.DeepSeek" style="margin: 2px;">
58
+ <img alt="License" src="https://img.shields.io/badge/License-MIT-f5de53?&color=f5de53" style="display: inline-block; vertical-align: middle;"/>
59
+ </a>
60
+ </div>
61
+ <br>
62
+ <div align="center">
63
+ <img alt="Intelligence Score" src="intelligence_score_vs_output_tokens.png" style="display: inline-block; vertical-align: middle;" width="750"/>
64
+ </div>
65
+
66
+ **Assembly of Experts Chimera model constructed with the DeepSeek [R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528), [R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) and [V3-0324](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324) parent models**
67
+
68
+ We present our new **DeepSeek-TNG R1T2 Chimera** 671B model, the first successor to our original [*DeepSeek R1T Chimera*](https://huggingface.co/tngtech/DeepSeek-R1T-Chimera), which was released on April 26th. Unlike the original Chimera, which was based on the *two parent models* V3-0324 and R1, the new Chimera is a **Tri-Mind** *with three parents*, adding R1-0528 as the third. It is constructed using the Assembly of Experts method with relatively fine-grained direct brain edits. This more refined assembly made it possible, among other improvements, to fix the &lt;think&gt; token consistency issue, which was a weakness of R1T and is now solved for R1T2.
69
+
70
+ **Sweet spot**
71
+
72
+ R1T2 operates at a new sweet spot in intelligence vs. output token length. It appears to be...
73
+
74
+ - about **20% faster than** the regular **R1**, and more than **twice as fast as R1-0528**
75
+ - significantly **more intelligent than** the regular **R1** in benchmarks such as **GPQA** and **AIME-24**
76
+ - much **more intelligent** and also **think-token consistent** compared to the first **R1T Chimera** 0426
77
+ - and generally well-behaved and a **nice persona** to talk to, even without any system prompt.
78
+
79
+ **Recommendations for your model decision**
80
+
81
+ *R1T2* compared...
82
+ - *vs R1:* We hope that R1T2 is a very desirable, almost universally **better, drop-in replacement for R1**
83
+ - *vs R1-0528:* R1T2 is a much **cheaper alternative to full R1-0528**, if the fullest 0528-level intelligence is not required
84
+ - *vs R1T:* R1T2 is usually **recommended over R1T**, unless the specific personality of R1T was optimal, the think-token issue is not important, or R1T's higher speed is crucial
85
+ - *vs V3-0324:* V3 is so much faster that if you can live with the **lower intelligence, take V3**; however, if you **need reasoning, R1T2** is the go-to model
86
+
87
+ **Limitations**
88
+
89
+ - **R1-0528** thinks much longer, but also achieves **better hard benchmark results** than R1T2
90
+ - As measured by SpeechMap.ai (courtesy of xlr8harder), **R1T2** is significantly **more reserved** than R1T, but not as much as R1-0528
91
+ - Due to the influence of its R1 parent, which does not support function calling, **R1T2 is not yet recommended for function-calling** intensive applications at this stage (this may be fixed at a later stage)
92
+ - When switching from R1T to R1T2 development, we changed from AIME24 and MT-Bench to AIME24, AIME25 and GPQA-Diamond for the intelligence score. With the new benchmark set, there is a larger score difference between R1 and the original R1T Chimera than published earlier.
93
+
94
+ **Technological background**
95
+
96
+ For details on the AoE construction process, you can read our [paper on arXiv](https://arxiv.org/abs/2506.14794).
97
+
98
+
99
+ ## Model Details
100
+
101
+ - **Architecture**: DeepSeek-MoE transformer-based language model
102
+ - **Combination Method**: Assembly of Experts from the three DeepSeek parent models R1-0528, R1 and V3-0324
103
+ - **Release Date**: 2025-07-02
104
+ - **Design Team**: Robert Dahlke, Henrik Klagges, Benjamin Merkel, Fabian Klemm and David Reiss, Munich, Germany
105
+ - **Extra Thanks**: Big thanks to DeepSeek for their great models and open-source generosity, and to the other researchers that have published on model merging methodologies.
106
+
107
+
108
+ ## Use, Out-of-scope Use, Other Limitations, Risks, Recommendations et al.
109
+ Regarding the R1T/R1T2-Chimeras, we ask you to follow the careful guidelines that Microsoft has created for their "MAI-DS-R1" DeepSeek-based model.
110
+ These professional guidelines are available [here on Hugging Face](https://huggingface.co/microsoft/MAI-DS-R1).
111
+
112
+ ## EU AI Act
113
+
114
+ Due to the strict new guidelines of the EU AI Act that take effect on August 2nd, 2025, we recommend that each R1T/R1T2 user in the EU either familiarizes themselves with these requirements and assesses their compliance, or ceases using the model in the EU after August 1st, 2025.
115
+
116
+ ## Contact, especially for your user feedback
117
+
118
+ Please give us your feedback, especially if you find deficiencies in the model:
119
+ - Email: [email protected]
120
+ - X.com: @tngtech
121
+
122
+ ## Citation
123
+
124
+ ```
125
+ @misc{tng_technology_consulting_gmbh_2025_07_0x,
126
+ author = { TNG Technology Consulting GmbH },
127
+ title = { DeepSeek-TNG-R1T2-Chimera },
128
+ year = 2025,
129
+ month = { July },
130
+ url = { https://huggingface.co/tngtech/DeepSeek-TNG-R1T2-Chimera },
131
+ doi = { xxx },
132
+ publisher = { Hugging Face }
133
+ }
134
+ ```
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = 
(content.split('</think>')|last) %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}
config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeepseekV3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_deepseek.DeepseekV3Config",
9
+ "AutoModel": "modeling_deepseek.DeepseekV3Model",
10
+ "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
11
+ },
12
+ "bos_token_id": 0,
13
+ "eos_token_id": 1,
14
+ "ep_size": 1,
15
+ "first_k_dense_replace": 3,
16
+ "hidden_act": "silu",
17
+ "hidden_size": 7168,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 18432,
20
+ "kv_lora_rank": 512,
21
+ "max_position_embeddings": 163840,
22
+ "model_type": "deepseek_v3",
23
+ "moe_intermediate_size": 2048,
24
+ "moe_layer_freq": 1,
25
+ "n_group": 8,
26
+ "n_routed_experts": 256,
27
+ "n_shared_experts": 1,
28
+ "norm_topk_prob": true,
29
+ "num_attention_heads": 128,
30
+ "num_experts_per_tok": 8,
31
+ "num_hidden_layers": 61,
32
+ "num_key_value_heads": 128,
33
+ "num_nextn_predict_layers": 1,
34
+ "pad_token_id": 2,
35
+ "q_lora_rank": 1536,
36
+ "qk_nope_head_dim": 128,
37
+ "qk_rope_head_dim": 64,
38
+ "quantization_config": {
39
+ "activation_scheme": "dynamic",
40
+ "fmt": "e4m3",
41
+ "quant_method": "fp8",
42
+ "weight_block_size": [
43
+ 128,
44
+ 128
45
+ ]
46
+ },
47
+ "rms_norm_eps": 1e-06,
48
+ "rope_scaling": {
49
+ "beta_fast": 32,
50
+ "beta_slow": 1,
51
+ "factor": 40,
52
+ "mscale": 1.0,
53
+ "mscale_all_dim": 1.0,
54
+ "original_max_position_embeddings": 4096,
55
+ "type": "yarn"
56
+ },
57
+ "rope_theta": 10000,
58
+ "routed_scaling_factor": 2.5,
59
+ "scoring_func": "sigmoid",
60
+ "tie_word_embeddings": false,
61
+ "topk_group": 4,
62
+ "topk_method": "noaux_tc",
63
+ "torch_dtype": "bfloat16",
64
+ "transformers_version": "4.52.4",
65
+ "unsloth_fixed": true,
66
+ "use_cache": true,
67
+ "v_head_dim": 128,
68
+ "vocab_size": 129280
69
+ }
configuration_deepseek.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.utils import logging
3
+
4
+ logger = logging.get_logger(__name__)
5
+
6
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
7
+ class DeepseekV3Config(PretrainedConfig):
8
+ r"""
9
+ This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
10
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
11
+ defaults will yield a similar configuration to that of the DeepSeek-V3.
12
+
13
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
14
+ documentation from [`PretrainedConfig`] for more information.
15
+
16
+
17
+ Args:
18
+ vocab_size (`int`, *optional*, defaults to 129280):
19
+ Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
20
+ `input_ids` passed when calling [`DeepseekV3Model`]
21
+ hidden_size (`int`, *optional*, defaults to 7168):
22
+ Dimension of the hidden representations.
23
+ intermediate_size (`int`, *optional*, defaults to 18432):
24
+ Dimension of the MLP representations.
25
+ moe_intermediate_size (`int`, *optional*, defaults to 2048):
26
+ Dimension of the MoE representations.
27
+ num_hidden_layers (`int`, *optional*, defaults to 61):
28
+ Number of hidden layers in the Transformer decoder.
29
+ num_nextn_predict_layers (`int`, *optional*, defaults to 1):
30
+ Number of nextn predict layers in the DeepSeekV3 Model.
31
+ num_attention_heads (`int`, *optional*, defaults to 128):
32
+ Number of attention heads for each attention layer in the Transformer decoder.
33
+ n_shared_experts (`int`, *optional*, defaults to 1):
34
+ Number of shared experts, None means dense model.
35
+ n_routed_experts (`int`, *optional*, defaults to 256):
36
+ Number of routed experts, None means dense model.
37
+ routed_scaling_factor (`float`, *optional*, defaults to 2.5):
38
+ Scaling factor of routed experts.
39
+ topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
40
+ Topk method used in routed gate.
41
+ n_group (`int`, *optional*, defaults to 8):
42
+ Number of groups for routed experts.
43
+ topk_group (`int`, *optional*, defaults to 4):
44
+ Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
45
+ num_experts_per_tok (`int`, *optional*, defaults to 8):
46
+ Number of selected experts, None means dense model.
47
+ moe_layer_freq (`int`, *optional*, defaults to 1):
48
+ The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
49
+ first_k_dense_replace (`int`, *optional*, defaults to 3):
50
+ Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
51
+ \--k dense layers--/
52
+ norm_topk_prob (`bool`, *optional*, defaults to True):
53
+ Whether to normalize the weights of the routed experts.
54
+ scoring_func (`str`, *optional*, defaults to 'sigmoid'):
55
+ Method of computing expert weights.
56
+ aux_loss_alpha (`float`, *optional*, defaults to 0.001):
57
+ Auxiliary loss weight coefficient.
58
+ seq_aux (`bool`, *optional*, defaults to True):
59
+ Whether to compute the auxiliary loss for each individual sample.
60
+ num_key_value_heads (`int`, *optional*):
61
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
62
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
63
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
64
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
65
+ by meanpooling all the original heads within that group. For more details check out [this
66
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
67
+ `num_attention_heads`.
68
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
69
+ The non-linear activation function (function or string) in the decoder.
70
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
71
+ The maximum sequence length that this model might ever be used with.
72
+ initializer_range (`float`, *optional*, defaults to 0.02):
73
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
74
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
75
+ The epsilon used by the rms normalization layers.
76
+ use_cache (`bool`, *optional*, defaults to `True`):
77
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
78
+ relevant if `config.is_decoder=True`.
79
+ pad_token_id (`int`, *optional*):
80
+ Padding token id.
81
+ bos_token_id (`int`, *optional*, defaults to 0):
82
+ Beginning of stream token id.
83
+ eos_token_id (`int`, *optional*, defaults to 1):
84
+ End of stream token id.
85
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
86
+ Whether to tie weight embeddings
87
+ rope_theta (`float`, *optional*, defaults to 10000.0):
88
+ The base period of the RoPE embeddings.
89
+ rope_scaling (`Dict`, *optional*):
90
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
91
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
92
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
93
+ `max_position_embeddings` to the expected new maximum.
94
+ attention_bias (`bool`, *optional*, defaults to `False`):
95
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
96
+ attention_dropout (`float`, *optional*, defaults to 0.0):
97
+ The dropout ratio for the attention probabilities.
98
+
99
+ ```python
100
+ >>> from transformers import DeepseekV3Model, DeepseekV3Config
101
+
102
+ >>> # Initializing a Deepseek-V3 style configuration
103
+ >>> configuration = DeepseekV3Config()
104
+
105
+ >>> # Accessing the model configuration
106
+ >>> configuration = model.config
107
+ ```"""
108
+
109
+ model_type = "deepseek_v3"
110
+ keys_to_ignore_at_inference = ["past_key_values"]
111
+
112
+ def __init__(
113
+ self,
114
+ vocab_size=129280,
115
+ hidden_size=7168,
116
+ intermediate_size=18432,
117
+ moe_intermediate_size = 2048,
118
+ num_hidden_layers=61,
119
+ num_nextn_predict_layers=1,
120
+ num_attention_heads=128,
121
+ num_key_value_heads=128,
122
+ n_shared_experts = 1,
123
+ n_routed_experts = 256,
124
+ ep_size = 1,
125
+ routed_scaling_factor = 2.5,
126
+ kv_lora_rank = 512,
127
+ q_lora_rank = 1536,
128
+ qk_rope_head_dim = 64,
129
+ v_head_dim = 128,
130
+ qk_nope_head_dim = 128,
131
+ topk_method = 'noaux_tc',
132
+ n_group = 8,
133
+ topk_group = 4,
134
+ num_experts_per_tok = 8,
135
+ moe_layer_freq = 1,
136
+ first_k_dense_replace = 3,
137
+ norm_topk_prob = True,
138
+ scoring_func = 'sigmoid',
139
+ hidden_act="silu",
140
+ max_position_embeddings=4096,
141
+ initializer_range=0.02,
142
+ rms_norm_eps=1e-6,
143
+ use_cache=True,
144
+ pad_token_id=None,
145
+ bos_token_id=0,
146
+ eos_token_id=1,
147
+ tie_word_embeddings=False,
148
+ rope_theta=10000.0,
149
+ rope_scaling=None,
150
+ attention_bias=False,
151
+ attention_dropout=0.0,
152
+ **kwargs,
153
+ ):
154
+ self.vocab_size = vocab_size
155
+ self.max_position_embeddings = max_position_embeddings
156
+ self.hidden_size = hidden_size
157
+ self.intermediate_size = intermediate_size
158
+ self.moe_intermediate_size = moe_intermediate_size
159
+ self.num_hidden_layers = num_hidden_layers
160
+ self.num_nextn_predict_layers = num_nextn_predict_layers
161
+ self.num_attention_heads = num_attention_heads
162
+ self.n_shared_experts = n_shared_experts
163
+ self.n_routed_experts = n_routed_experts
164
+ self.ep_size = ep_size
165
+ self.routed_scaling_factor = routed_scaling_factor
166
+ self.kv_lora_rank = kv_lora_rank
167
+ self.q_lora_rank = q_lora_rank
168
+ self.qk_rope_head_dim = qk_rope_head_dim
169
+ self.v_head_dim = v_head_dim
170
+ self.qk_nope_head_dim = qk_nope_head_dim
171
+ self.topk_method = topk_method
172
+ self.n_group = n_group
173
+ self.topk_group = topk_group
174
+ self.num_experts_per_tok = num_experts_per_tok
175
+ self.moe_layer_freq = moe_layer_freq
176
+ self.first_k_dense_replace = first_k_dense_replace
177
+ self.norm_topk_prob = norm_topk_prob
178
+ self.scoring_func = scoring_func
179
+ # for backward compatibility
180
+ if num_key_value_heads is None:
181
+ num_key_value_heads = num_attention_heads
182
+
183
+ self.num_key_value_heads = num_key_value_heads
184
+ self.hidden_act = hidden_act
185
+ self.initializer_range = initializer_range
186
+ self.rms_norm_eps = rms_norm_eps
187
+ self.use_cache = use_cache
188
+ self.rope_theta = rope_theta
189
+ self.rope_scaling = rope_scaling
190
+ self.attention_bias = attention_bias
191
+ self.attention_dropout = attention_dropout
192
+
193
+ super().__init__(
194
+ pad_token_id=pad_token_id,
195
+ bos_token_id=bos_token_id,
196
+ eos_token_id=eos_token_id,
197
+ tie_word_embeddings=tie_word_embeddings,
198
+ **kwargs,
199
+ )
intelligence_score_vs_output_tokens.png ADDED

Git LFS Details

  • SHA256: ace1e8df27abccaf153f01b719117cbc024839c02cab6e2a300aa401ba196af7
  • Pointer size: 131 Bytes
  • Size of remote file: 197 kB
model-00001-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a53dcdd0939fc01ddbb6927ba46c09497f81efd42e9c3e62245776e66731890
3
+ size 5234138288
model-00002-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de179d950ad25cc4596e26792a1a06a80a46540acef3a9d004cc9940358f3e58
3
+ size 4302381728
model-00003-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac592467a2391b507296d568a87c6f02d41a1a3473f3bc6705fdafe91a7e84d9
3
+ size 4302382136
model-00004-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e2c15bd1a0cc9b4da6e5faa5dd82a6e8322fdeef89ea76fa6556e086dac9201
3
+ size 4302347768
model-00005-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4cd762a7f2575c8a8fb04a68556e2311350567c9cb015fd94af7d37e5ef5c7d
3
+ size 4302381912
model-00006-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d267c2a6d020ef9e6333f886a52aeba11518fc0e6a2a00261d5ccdff71670fe1
3
+ size 4372071352
model-00007-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f82b553017ac5b1962ffb68604935780a8699bd929d6a3170134f6dc2736aa0
3
+ size 4306077848
model-00008-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8659a4124c5001e4f594fc53586c148c3210c8c080efcd0ebf420df1d18a3d9
3
+ size 4302382112
model-00009-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ff5721c6750889be83e580b0e299ec09519e3fb6adcbd6f3ce490058554f780
3
+ size 4302347960
model-00010-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1533ea7da047d76918ca06e71805c79d5e4ae15b3d46d2df486cbfab1072e8a2
3
+ size 4302381720
model-00011-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f5477531751d112e6284b08f399f2f848c5fac203eb77f5dbe6825ff99d42d8
3
+ size 4302382136
model-00012-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f985054e530622211c5606d87ae1b6d28dded975facb701dc0a025d4525bc90
3
+ size 1321583272
model-00013-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeba020edcb35a96ddba8d92ed4b1316f2026dad577668ea6772225f00260fe5
3
+ size 4302314992
model-00014-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f4d9f12a0aee0bf6f94e10d18666c6a6ad94b3ee1bf33f37ecb679ed1c031cc
3
+ size 4302382088
model-00015-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ded3c735c606e9f1bc067f7782b63d701577e59a338b08e3fc49c8fb603be3f8
3
+ size 4302347992
model-00016-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65126dec37ddcaad6a361e7256d7c5d2fb390e11f891f967e32508320f288f40
3
+ size 4302381688
model-00017-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0919e72baebf3b268e24a76343e7c8c0cc9ce7485e0b10f1ab0cc19877bee049
3
+ size 4302382136
model-00018-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab55b759fa61688e5cdbfc0e76426e29eee60cb7dc4098a505268bd8508105c0
3
+ size 4302347800
model-00019-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad04d451a23ba6f6f46aaf549ab2c94ae536db3cc451293d2bf86d932d075c42
3
+ size 4302381880
model-00020-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:136c023f610ce0f40752907c29137d0ac1c55228cc721e33112d72e1a86b2a64
3
+ size 4302382136
model-00021-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdadc13724af4b6fdce0d887a5dabaa9ac2e35f4716a95124f4c9b0161557517
3
+ size 4302348176
model-00022-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e013066a698d160a033b085d2ab350cc487c9526568c23513af655debe2f353f
3
+ size 4302382656
model-00023-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0175c89315c5a7420784da8b46ad83d896b6f1b019ede2e8e9137fc43c5ddff
3
+ size 4302348584
model-00024-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aea2f0b0054e0729a4932bb2ef2f4e4d2a84575c874611bbcacebe641da78b2
3
+ size 4302382264
model-00025-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c4f8ea8cfd59a26c06a4118e98f5519d25f9471c7f2dc09534ec0352cb4efcf
3
+ size 4302382720
model-00026-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74c70c83c69c8035cce7f760b179b7ca279236ab439d672d298b8d27a92113ae
3
+ size 4302348392
model-00027-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9d5999b2943768dcc67cad1f8411d8f1e1aeb55c429e513d7789a5cad3152be
3
+ size 4302382448
model-00028-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d08de67222c0b2e1d91e69dc2afaf719dcd43516a286ac9be33d3e95261892
3
+ size 4302382720
model-00029-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7984feb281a1ce60f706eeb95e5190ca228a0217e5bd693addeb85d308e53d30
3
+ size 4302348200
model-00030-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ddf8f8cc2330c91beddec149d5f256bd54c1b501a0b6d205ab2afcba76de0da
3
+ size 4302382640
model-00031-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72eb06264d09a343bba922c11ad90ccba018f52f4678a2d951923672ecb246a6
3
+ size 4302348600
model-00032-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82bf8f9a875f472a590aa80bbaa4da17d778a41bc35b042363c7f2b4fcab80a
3
+ size 4302382248
model-00033-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c95969b504e6cec53e981bf224c8051b3808b12666f5fe1b8ebce720f57cd62f
3
+ size 4302382720
model-00034-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d549912fa539c15eb2d31b0c81197b09fb288458aabf8d6b1f240b5fcaf0872c
3
+ size 1747416576
model-00035-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:110b10ce1ac78868a426f5960d2037398995d3e23722a9d5cff2d85ac4b4463a
3
+ size 4302315568
model-00036-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80dfc11f732d16b60d7e150ec6bffe84cf6373f9963383fc1da6ec1491a04b72
3
+ size 4302382672
model-00037-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7475424d577cdbd86225777b4ceaab9cafa74511ba55f493f7f487b9706fe88d
3
+ size 4302348568
model-00038-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a71a848794efa1c7f97e9e7a6e6f55a9cce95add7656de0000134e5cfc704a
3
+ size 4302382280
model-00039-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:271ec648451c8387511173eb0d83f98b514f730076ee9cad632b923e70c304be
3
+ size 4302382720
model-00040-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e547cf5d0ac39457bfcce969740b2e31b4a473279c5142a875cd3c56152d3d9e
3
+ size 4302348376
model-00041-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb81456fccb3593215a1417ff4bd83e6a18a5156f43266fe40968bf5680cf968
3
+ size 4302382472
model-00042-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d37f968c691f7ca717155328596eab59ffcd356d48dca5d4451f9e2e262a6056
3
+ size 4302382720
model-00043-of-000163.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c31b7cb0d0e6761fec2ce693b99059802b055a51cac5d7e0485d214517af4cdf
3
+ size 4302348184