christian-lms committed
Commit 635fdbc · verified · 1 Parent(s): 0da7954

Delete configuration_ernie4_5_moe.py

Files changed (1)
configuration_ernie4_5_moe.py +0 -194
configuration_ernie4_5_moe.py DELETED
@@ -1,194 +0,0 @@
- # Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- from transformers import PretrainedConfig
-
-
-
- class Ernie4_5_MoeConfig(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`Ernie4_5_Model`].
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-
-     Args:
-         vocab_size (int): Size of the vocabulary (number of unique tokens)
-         hidden_size (int): Dimensionality of the encoder layers and the pooler layer
-         intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer
-         max_position_embeddings (int): Maximum sequence length the model can handle
-         num_hidden_layers (int): Number of hidden layers in the Transformer encoder
-         num_attention_heads (int): Number of attention heads for each attention layer
-         rms_norm_eps (float): The epsilon used by the RMS normalization layers
-         use_cache (bool): Whether to use caching for faster generation (decoding)
-         use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
-         pad_token_id (int): Token ID used for padding sequences
-         bos_token_id (int): Token ID used for beginning-of-sequence
-         eos_token_id (int): Token ID used for end-of-sequence
-         use_bias (bool): Whether to use bias terms in linear layers
-         rope_theta (float): The base period of the RoPE embeddings
-         weight_share_add_bias (bool): Whether to share bias weights in certain layers
-         ignored_index (int): Target value that is ignored during loss computation
-         attention_probs_dropout_prob (float): Dropout probability for attention weights
-         hidden_dropout_prob (float): Dropout probability for hidden layers
-         num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention)
-         max_sequence_length (int): Maximum sequence length for positional embeddings
-         moe_num_experts: Number of experts in MoE layers
-         moe_capacity: Capacity configuration for MoE layers
-         moe_layer_interval: Interval between MoE layers
-         moe_layer_start_index: Starting layer index for MoE
-         moe_layer_end_index: Ending layer index for MoE (-1 means last layer)
-         sinkhorn_2gate: Whether to use sinkhorn 2-gate routing
-         sinkhorn_temp: Temperature for sinkhorn routing
-         moe_dropout_prob: Dropout probability for MoE layers
-         moe_gate: Type of gating mechanism ('top2', etc.)
-         moe_intermediate_size: Intermediate size for MoE layers
-         moe_gate_act: Activation function for gating
-         moe_k: Number of experts to route to
-         num_nextn_predict_layers: Number of mtp predict layers, if use mtp, set `num_nextn_predict_layers > 0`
-         multi_token_pred_lambda: The weight of multi token prediction loss
-         **kwargs: Additional base model configuration parameters
-     """
-
-     model_type = "ernie4_5_moe"
-     use_keep_in_fp32_modules = True
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     attribute_map = {
-         "n_positions": "max_position_embeddings",
-         "n_embd": "hidden_size",
-         "n_layer": "num_hidden_layers",
-         "n_head": "num_attention_heads",
-         "n_inner": "intermediate_size",
-         "activation_function": "hidden_act",
-     }
-
-     # Default tensor parallel plan for base model `ernie_4_5_moe`
-     base_model_tp_plan = {
-         "model.layers.*.self_attn.q_proj": "colwise_rep",
-         "model.layers.*.self_attn.k_proj": "colwise_rep",
-         "model.layers.*.self_attn.v_proj": "colwise_rep",
-         "model.layers.*.self_attn.o_proj": "rowwise_rep",
-         "model.layers.*.mlp.experts.*.gate_proj": "colwise",
-         "model.layers.*.mlp.experts.*.up_proj": "colwise",
-         "model.layers.*.mlp.experts.*.down_proj": "rowwise",
-         "model.layers.*.mlp.gate_proj": "colwise",
-         "model.layers.*.mlp.up_proj": "colwise",
-         "model.layers.*.mlp.down_proj": "rowwise",
-     }
-     base_model_pp_plan = {
-         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-         "norm": (["hidden_states"], ["hidden_states"]),
-     }
-
-     def __init__(
-         self,
-         vocab_size=32000,
-         hidden_size=768,
-         intermediate_size=11008,
-         num_hidden_layers=2,
-         num_attention_heads=2,
-         num_key_value_heads=None,
-         max_position_embeddings=32768,
-         rms_norm_eps=1e-6,
-         use_cache=False,
-         pad_token_id=0,
-         bos_token_id=1,
-         eos_token_id=2,
-         attention_probs_dropout_prob=0.0,
-         hidden_dropout_prob=0.0,
-         rope_theta=10000.0,
-         use_flash_attention=False,
-         use_rmsnorm=True,
-         use_bias=False,
-         weight_share_add_bias=True,
-         max_sequence_length=None,
-         ignored_index=-100,
-         use_moe=True,
-         moe_num_experts=64,
-         moe_capacity=(64, 64, 64),
-         moe_layer_interval=2,
-         moe_layer_start_index=0,
-         moe_layer_end_index=-1,
-         sinkhorn_2gate=True,
-         sinkhorn_temp=3e-2,
-         moe_dropout_prob=0.0,
-         moe_gate="top2",
-         moe_intermediate_size=3584,
-         moe_k=2,
-         moe_gate_act: str = "softmax",
-         moe_use_aux_free=False,
-         num_nextn_predict_layers=0,
-         multi_token_pred_lambda=1.0,
-         **kwargs,
-     ):
-         self.vocab_size = vocab_size
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-
-         if num_key_value_heads is None:
-             num_key_value_heads = num_attention_heads
-
-         self.num_key_value_heads = num_key_value_heads
-         self.use_rmsnorm = use_rmsnorm
-         self.rms_norm_eps = rms_norm_eps
-         self.rope_theta = rope_theta
-         self.max_sequence_length = max_sequence_length
-         self.pad_token_id = pad_token_id
-         self.bos_token_id = bos_token_id
-         self.eos_token_id = eos_token_id
-         self.ignored_index = ignored_index
-         self.use_cache = use_cache
-         self.use_bias = use_bias
-         self.weight_share_add_bias = weight_share_add_bias
-         self.use_flash_attention = use_flash_attention
-         self.attention_probs_dropout_prob = attention_probs_dropout_prob
-         self.hidden_dropout_prob = hidden_dropout_prob
-
-         self.use_moe = moe_num_experts > 0 and use_moe
-         self.moe_num_experts = moe_num_experts
-         self.moe_capacity = moe_capacity
-         self.sinkhorn_2gate = sinkhorn_2gate
-         self.sinkhorn_temp = sinkhorn_temp
-         self.moe_layer_interval = moe_layer_interval
-         self.moe_dropout_prob = moe_dropout_prob
-         self.moe_gate = moe_gate
-         self.moe_intermediate_size = moe_intermediate_size
-         self.moe_k = moe_k
-         self.moe_layer_start_index = moe_layer_start_index
-         self.moe_layer_end_index = (
-             self.num_hidden_layers - 1
-             if moe_layer_end_index == -1
-             else moe_layer_end_index
-         )
-         self.moe_gate_act = moe_gate_act
-         self.moe_use_aux_free = moe_use_aux_free
-         self.num_nextn_predict_layers = num_nextn_predict_layers
-         self.multi_token_pred_lambda = multi_token_pred_lambda
-
-         # Set default for tied embeddings if not specified.
-         if "tie_word_embeddings" not in kwargs:
-             kwargs["tie_word_embeddings"] = False
-
-         super().__init__(
-             pad_token_id=pad_token_id,
-             bos_token_id=bos_token_id,
-             eos_token_id=eos_token_id,
-             **kwargs,
-         )
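
For reference, a minimal usage sketch of the class this commit removes. It assumes the configuration_ernie4_5_moe.py from the parent commit (0da7954) is still importable, e.g. from a local checkout; every value it checks comes straight from the deleted source above.

from configuration_ernie4_5_moe import Ernie4_5_MoeConfig  # file as of parent commit 0da7954

# Override a few arguments; everything else falls back to the defaults in __init__.
config = Ernie4_5_MoeConfig(
    hidden_size=1024,
    num_hidden_layers=4,
    moe_num_experts=8,
    moe_layer_end_index=-1,  # __init__ resolves -1 to num_hidden_layers - 1
)

print(config.model_type)           # "ernie4_5_moe"
print(config.moe_layer_end_index)  # 3, i.e. num_hidden_layers - 1
print(config.use_moe)              # True: moe_num_experts > 0 and use_moe defaults to True
print(config.tie_word_embeddings)  # False, the default injected into kwargs by __init__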
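
The attribute_map declared in the deleted class also lets legacy GPT-2-style names resolve to the Ernie field names through PretrainedConfig's attribute aliasing in transformers. A quick hedged check, under the same import assumption as above (exact behaviour may vary with the installed transformers version):

# Aliases declared in attribute_map resolve to the canonical attributes.
assert config.n_layer == config.num_hidden_layers
assert config.n_embd == config.hidden_size
assert config.n_head == config.num_attention_heads
assert config.n_positions == config.max_position_embeddings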