# ==============================================================================
# Smol-MoE 8x135M - "Chat with Your Creation"
# (Final Interactive Inference Script)
# ==============================================================================

# --- Core Library Imports ---
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.models.llama.modeling_llama import LlamaMLP

# --- 1. CRITICAL: Re-define Your Custom Architecture ---
# When loading a model with a custom architecture, Hugging Face needs the
# definitions of the custom classes. Defining them here lets the loading code
# below rebuild our unique MoE model correctly.

# --- Model Configuration ---
MODEL_PATH = "./SmolMoE-8x135M-Instruct-v1-Trained"

# Load our custom MoE parameters from the saved config file.
config = AutoConfig.from_pretrained(MODEL_PATH)
NUM_EXPERTS = config.moe_num_experts
TOP_K = config.moe_top_k


class MoERouter(nn.Module):
    """The Router module. Its job is to score every expert for each token."""

    def __init__(self, hidden_size: int, num_experts: int):
        super().__init__()
        self.layer = nn.Linear(hidden_size, num_experts, bias=False)

    def forward(self, hidden_states):
        return self.layer(hidden_states)


class MoEModule(nn.Module):
    """The custom Mixture-of-Experts module that replaces the standard FFN."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.top_k = TOP_K
        self.num_experts = NUM_EXPERTS
        self.router = MoERouter(self.hidden_size, self.num_experts)
        self.experts = nn.ModuleList([LlamaMLP(config) for _ in range(self.num_experts)])

    def forward(self, hidden_states):
        original_shape = hidden_states.shape
        flat_hidden_states = hidden_states.view(-1, self.hidden_size)

        # Score every expert for every token, keep the top-k per token, and
        # re-normalize the kept weights so they sum to 1.
        router_logits = self.router(flat_hidden_states)
        routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        routing_weights = routing_weights.to(hidden_states.dtype)

        # Accumulate each selected expert's output, weighted by its routing weight.
        final_hidden_states = torch.zeros_like(flat_hidden_states)
        for k in range(self.top_k):
            expert_indices_k = selected_experts[:, k]
            routing_weights_k = routing_weights[:, k]
            for i in range(self.num_experts):
                mask = expert_indices_k == i
                if mask.any():
                    expert_output = self.experts[i](flat_hidden_states[mask])
                    final_hidden_states.index_add_(
                        0,
                        torch.where(mask)[0],
                        expert_output * routing_weights_k[mask].unsqueeze(1),
                    )
        return final_hidden_states.view(*original_shape)

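
# --- Optional sanity check (illustrative sketch, not part of the original script) ---
# A quick, weight-free demonstration of the routing math used in `MoEModule.forward`:
# score a few random token vectors, keep the top-k experts per token, and
# re-normalize the kept weights. The helper name `demo_router_topk` is our own
# addition; nothing here touches the trained checkpoint.
def demo_router_topk(num_tokens: int = 4):
    router = MoERouter(config.hidden_size, NUM_EXPERTS)
    dummy_tokens = torch.randn(num_tokens, config.hidden_size)
    logits = router(dummy_tokens)                        # (num_tokens, NUM_EXPERTS)
    weights = F.softmax(logits, dim=-1)
    top_w, top_idx = torch.topk(weights, TOP_K, dim=-1)  # (num_tokens, TOP_K) each
    top_w = top_w / top_w.sum(dim=-1, keepdim=True)      # kept weights sum to 1 per token
    print("Selected experts per token:", top_idx.tolist())
    print("Normalized routing weights:", top_w.tolist())
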

# --- 2. Main Program: Load Model and Start Conversation ---
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Model Loading ---
    print(f"Loading tokenizer from '{MODEL_PATH}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    print("Manually rebuilding MoE model structure...")
    # First, create an "empty shell" of the model with the correct architecture
    # but random weights. `from_config` builds the structure without loading any weights.
    moe_model = AutoModelForCausalLM.from_config(config)

    # Then, perform the "architectural surgery" again, replacing each standard MLP
    # with our MoEModule.
    for layer in moe_model.model.layers:
        layer.mlp = MoEModule(config)

    print("Loading your trained MoE weights into the correct structure...")
    from safetensors.torch import load_file

    state_dict = load_file(os.path.join(MODEL_PATH, "model.safetensors"), device="cpu")

    # Use `strict=False` for flexible loading, then tie the weights manually.
    # This handles the `lm_head.weight` key that is missing because of weight tying.
    moe_model.load_state_dict(state_dict, strict=False)
    moe_model.tie_weights()

    # Move the finalized model to the GPU and switch to evaluation mode.
    moe_model.to(device, dtype=torch.bfloat16)
    moe_model.eval()

    print("--- MoE Model is ready for conversation! ---")
    print("Type 'exit' or 'quit' to end the chat.\n")

    # --- Interactive Conversation Loop ---
    messages = []
    while True:
        try:
            user_input = input("You: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Goodbye!")
                break

            # Step 1: Add the user's input to the conversation history.
            messages.append({"role": "user", "content": user_input})

            # Step 2: Format the entire conversation history with the chat template.
            # `add_generation_prompt=True` appends the opening tokens of the assistant's turn.
            prompt_text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Step 3: Encode the prompt and move it to the GPU.
            inputs = tokenizer(prompt_text, return_tensors="pt").to(device)

            # Step 4: Generate a response.
            with torch.no_grad():
                outputs = moe_model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                )

            # Step 5: Decode and clean the output.
            # `outputs[0]` contains the full conversation (prompt + completion),
            # so we extract only the newly generated part.
            full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Locate the start of the assistant's actual response.
            assistant_prompt_start = "<|assistant|>\n"
            assistant_response_start_index = full_response.rfind(assistant_prompt_start)
            if assistant_response_start_index != -1:
                model_response = full_response[
                    assistant_response_start_index + len(assistant_prompt_start):
                ].strip()
            else:
                # Fallback for simpler cases: strip the original prompt from the full response.
                model_response = full_response.replace(prompt_text, "").strip()

            print(f"MoE Model: {model_response}")

            # Step 6: Append the model's response to the history for multi-turn conversation.
            messages.append({"role": "assistant", "content": model_response})

        except KeyboardInterrupt:
            print("\nGoodbye!")
            break


# Script entry point
if __name__ == "__main__":
    main()
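
# ------------------------------------------------------------------------------
# Optional note (illustrative sketch, not part of the original script):
# `load_state_dict(..., strict=False)` returns a NamedTuple listing the keys it
# could not match. To confirm that weight tying is the only reason for the
# relaxed loading (i.e. only `lm_head.weight` is reported missing), the call in
# `main()` could be captured like this:
#
#     load_result = moe_model.load_state_dict(state_dict, strict=False)
#     print("Missing keys:", load_result.missing_keys)        # expected: ['lm_head.weight']
#     print("Unexpected keys:", load_result.unexpected_keys)  # expected: []
# ------------------------------------------------------------------------------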