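# Convert a Bailing MoE v2 checkpoint (one safetensors tensor per expert) into the
# BailingSharedMoeV2 layout, in which the per-expert gate/up/down projection weights
# of every MoE layer are fused into stacked expert tensors.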
import glob
import re
import shutil
import sys

import accelerate
import torch
from safetensors import safe_open

from configuration_bailing_moe_v2 import BailingMoeV2Config
from configuration_bailing_shared_moe_v2 import BailingSharedMoeV2Config
from modeling_bailing_shared_moe_v2 import BailingSharedMoeV2ForCausalLM
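# Command-line arguments: the source checkpoint directory and the directory
# the converted model is written to.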
input_model = sys.argv[1]
output_model_path = sys.argv[2]
auto_map = {
"AutoConfig": "configuration_bailing_shared_moe_v2.BailingSharedMoeV2Config",
"AutoModel": "modeling_bailing_shared_moe_v2.BailingSharedMoeV2Model",
"AutoModelForCausalLM": "modeling_bailing_shared_moe_v2.BailingSharedMoeV2ForCausalLM"
}
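# Build the shared-MoE config by copying the relevant fields from the source
# config and switching model_type/auto_map to the BailingSharedMoeV2 classes.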
cfg_standard_moe = BailingMoeV2Config.from_pretrained(input_model)
cfg_shared_moe = BailingSharedMoeV2Config(
    auto_map=auto_map,
    model_type="bailing_shared_moe_v2",
    vocab_size=cfg_standard_moe.vocab_size,
    hidden_size=cfg_standard_moe.hidden_size,
    intermediate_size=cfg_standard_moe.intermediate_size,
    num_hidden_layers=cfg_standard_moe.num_hidden_layers,
    num_attention_heads=cfg_standard_moe.num_attention_heads,
    num_key_value_heads=cfg_standard_moe.num_key_value_heads,
    hidden_act=cfg_standard_moe.hidden_act,
    max_position_embeddings=cfg_standard_moe.max_position_embeddings,
    initializer_range=cfg_standard_moe.initializer_range,
    rms_norm_eps=cfg_standard_moe.rms_norm_eps,
    use_cache=cfg_standard_moe.use_cache,
    tie_word_embeddings=cfg_standard_moe.tie_word_embeddings,
    rope_theta=cfg_standard_moe.rope_theta,
    rope_scaling=cfg_standard_moe.rope_scaling,
    max_window_layers=cfg_standard_moe.max_window_layers,
    attention_dropout=cfg_standard_moe.attention_dropout,
    moe_intermediate_size=cfg_standard_moe.moe_intermediate_size,
    num_experts_per_tok=cfg_standard_moe.num_experts_per_tok,
    num_experts=cfg_standard_moe.num_experts,
    num_shared_experts=cfg_standard_moe.num_shared_experts,
    norm_topk_prob=cfg_standard_moe.norm_topk_prob,
    output_router_logits=cfg_standard_moe.output_router_logits,
    shared_expert_intermediate_size=None,
    head_dim=cfg_standard_moe.head_dim,
    embedding_dropout=cfg_standard_moe.embedding_dropout,
    eos_token_id=cfg_standard_moe.eos_token_id,
    first_k_dense_replace=cfg_standard_moe.first_k_dense_replace,
    output_dropout=cfg_standard_moe.output_dropout,
    pad_token_id=cfg_standard_moe.pad_token_id,
    torch_dtype=cfg_standard_moe.torch_dtype,
    use_bias=cfg_standard_moe.use_bias,
    use_qkv_bias=cfg_standard_moe.use_qkv_bias,
    moe_router_enable_expert_bias=cfg_standard_moe.moe_router_enable_expert_bias,
    routed_scaling_factor=cfg_standard_moe.routed_scaling_factor,
    n_group=cfg_standard_moe.n_group,
    topk_group=cfg_standard_moe.topk_group,
    use_qk_norm=cfg_standard_moe.use_qk_norm,
    moe_shared_expert_intermediate_size=cfg_standard_moe.moe_shared_expert_intermediate_size,
    num_nextn_predict_layers=cfg_standard_moe.num_nextn_predict_layers,
    score_function=cfg_standard_moe.score_function,
    router_dtype=cfg_standard_moe.router_dtype,
    use_rmsnorm=cfg_standard_moe.use_rmsnorm,
    partial_rotary_factor=cfg_standard_moe.partial_rotary_factor,
)
num_experts = cfg_standard_moe.num_experts
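# Instantiate the target architecture on the meta device so no real memory is
# allocated; actual tensors are attached later via load_state_dict(..., assign=True).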
with accelerate.init_empty_weights():
    model_shared_moe = BailingSharedMoeV2ForCausalLM(cfg_shared_moe)
    model_shared_moe = model_shared_moe.to(torch.bfloat16)
new_state_dict = {}
pattern = f"{input_model}/model-*-of-*.safetensors"
files = sorted(glob.glob(pattern))
if len(files) == 0:
    raise FileNotFoundError(f"no safetensors shards found matching {pattern}")
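# Load every tensor from all safetensors shards into host memory.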
tensors = {}
for file_path in files:
    print(f"processing {file_path}")
    with safe_open(file_path, framework="pt", device="cpu") as f:
        for key in f.keys():
            tensor = f.get_tensor(key)
            tensors[key] = tensor
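# Remap the state dict. Non-expert tensors (and shared-expert tensors) are copied
# unchanged. For each MoE layer, the per-expert down_proj weights are stacked into
# moe_mlp.output_experts.weight, and the per-expert up_proj and gate_proj weights are
# concatenated and stacked into moe_mlp.experts.weight. A layer's expert-0 keys
# trigger the fusion for that whole layer.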
for key in tensors:
if "experts" not in key or "shared_experts" in key:
new_state_dict[key] = tensors[key]
elif "experts.0" in key:
layer_num = int(re.search(r"\d+", key).group())
new_state_dict[
f"model.layers.{layer_num}.mlp.moe_mlp.output_experts.weight"
] = torch.stack(
[
tensors[f"model.layers.{layer_num}.mlp.experts.{i}.down_proj.weight"]
for i in range(num_experts)
]
)
new_state_dict[f"model.layers.{layer_num}.mlp.moe_mlp.experts.weight"] = (
torch.stack(
[
torch.cat(
[
tensors[
f"model.layers.{layer_num}.mlp.experts.{i}.up_proj.weight"
],
tensors[
f"model.layers.{layer_num}.mlp.experts.{i}.gate_proj.weight"
],
],
dim=0,
)
for i in range(num_experts)
]
)
)
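# Materialize the converted weights into the meta-initialized model and write the
# model and its config to the output directory.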
model_shared_moe.load_state_dict(new_state_dict, strict=True, assign=True)
model_shared_moe.save_pretrained(output_model_path)
cfg_shared_moe.save_pretrained(output_model_path)
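# Ship the custom modeling/configuration code alongside the weights so the converted
# model can be loaded with the Auto classes and trust_remote_code=True.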
shutil.copy(
"modeling_bailing_shared_moe_v2.py",
output_model_path + "/" + "modeling_bailing_shared_moe_v2.py",
)
shutil.copy(
"configuration_bailing_shared_moe_v2.py",
output_model_path + "/" + "configuration_bailing_shared_moe_v2.py",
)
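# Copy the tokenizer files over unchanged from the source checkpoint.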
for i in ["special_tokens_map.json", "tokenizer_config.json", "tokenizer.json"]:
    shutil.copy(input_model + "/" + i, output_model_path + "/" + i)