# ==============================================================================
# Smol-MoE 8x135M - "The Mind-Reader" Test Script
# (Final Version with Correct Loading Logic)
# ==============================================================================

# --- Core Library Imports ---
import os  # Needed for path operations when locating the weights file

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
# We need LlamaMLP as the building block for each expert.
from transformers.models.llama.modeling_llama import LlamaMLP

# --- 1. CRITICAL: Re-define Your Custom Architecture ---
# When loading a model with a custom architecture, the custom classes must be
# defined in scope. Defining them here lets us rebuild the exact MoE structure
# before loading the trained weights into it.

# --- Model Configuration ---
MODEL_PATH = "./SmolMoE-8x135M-Instruct-v1-Trained"
config = AutoConfig.from_pretrained(MODEL_PATH)

# Load our custom MoE hyperparameters from the saved config file.
NUM_EXPERTS = config.moe_num_experts
TOP_K = config.moe_top_k

# Expert names for clear visualization later. The order must match the
# training script.
EXPERT_NAMES = [
    "Actor", "Analyst", "Coder", "Encyclopedia",
    "Guardian", "Summarizer", "Thinker", "Writer",
]


class MoERouter(nn.Module):
    """The router module. Its job is to score every expert for each token."""

    def __init__(self, hidden_size: int, num_experts: int):
        super().__init__()
        self.layer = nn.Linear(hidden_size, num_experts, bias=False)

    def forward(self, hidden_states):
        return self.layer(hidden_states)


class MoEModule(nn.Module):
    """The custom Mixture-of-Experts module that replaces the standard FFN."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.top_k = TOP_K
        self.num_experts = NUM_EXPERTS
        self.router = MoERouter(self.hidden_size, self.num_experts)
        self.experts = nn.ModuleList([LlamaMLP(config) for _ in range(self.num_experts)])

    def forward(self, hidden_states):
        original_shape = hidden_states.shape
        flat_hidden_states = hidden_states.view(-1, self.hidden_size)
        # Score all experts, keep only the top-k per token, and renormalize
        # the kept weights so they sum to 1.
        router_logits = self.router(flat_hidden_states)
        routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        routing_weights = routing_weights.to(hidden_states.dtype)
        # Dispatch each token to its selected experts and accumulate the
        # weighted expert outputs.
        final_hidden_states = torch.zeros_like(flat_hidden_states)
        for k in range(self.top_k):
            expert_indices_k = selected_experts[:, k]
            routing_weights_k = routing_weights[:, k]
            for i in range(self.num_experts):
                mask = expert_indices_k == i
                if mask.any():
                    expert_output = self.experts[i](flat_hidden_states[mask])
                    final_hidden_states.index_add_(
                        0,
                        torch.where(mask)[0],
                        expert_output * routing_weights_k[mask].unsqueeze(1),
                    )
        return final_hidden_states.view(*original_shape)
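
# A minimal shape sanity check for MoEModule (an illustrative addition, not
# part of the original test flow; the function name and dummy sizes are
# arbitrary). An MoE block must be a drop-in FFN replacement, so the output
# shape must match the input shape exactly. Call this manually if you want to
# verify the dispatch logic before loading real weights.
def _sanity_check_moe_shapes():
    module = MoEModule(config)
    dummy = torch.randn(2, 5, config.hidden_size)  # (batch, seq_len, hidden)
    out = module(dummy)
    assert out.shape == dummy.shape, f"unexpected output shape: {out.shape}"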

# --- 2. Core Diagnostic Tools ---
# This dictionary stores the router decisions captured by our hooks,
# keyed by layer index.
captured_router_weights = {}


def get_router_weights_hook(layer_idx):
    """A factory function that creates a forward hook for one MoE layer."""

    def hook(module, input, output):
        # During `generate`, this hook fires once per decoding step. Only the
        # first pass (the prompt prefill) contains every prompt token, so we
        # record that one and ignore the later single-token decode steps.
        if layer_idx in captured_router_weights:
            return
        # `input[0]` is the hidden_states tensor passed to the MoE module;
        # we re-run the router on it to recover the logits.
        router_logits = module.router(input[0])
        # Average the routing probabilities over all tokens in the sequence.
        avg_probs = F.softmax(router_logits, dim=-1).mean(dim=[0, 1])
        # FINAL FIX: Convert from BFloat16 to Float32 before converting to
        # NumPy; NumPy does not support the bfloat16 dtype.
        captured_router_weights[layer_idx] = avg_probs.detach().cpu().to(torch.float32).numpy()

    return hook


def visualize_router_decisions(prompt):
    """Print the captured router decisions as a per-layer table."""
    print("\n" + "=" * 80)
    print(f"ROUTER DECISION ANALYSIS for Prompt: '{prompt[:50]}...'")
    print("=" * 80)
    print(f"{'Layer':<7} | {'Dominant Expert(s)':<45} | {'Confidence'}")
    print("-" * 80)
    for layer_idx, weights in sorted(captured_router_weights.items()):
        top2_indices = np.argsort(weights)[-2:][::-1]
        dominant_experts_str = f"1. {EXPERT_NAMES[top2_indices[0]]} | 2. {EXPERT_NAMES[top2_indices[1]]}"
        confidence_str = f"({weights[top2_indices[0]]:.1%} | {weights[top2_indices[1]]:.1%})"
        print(f"Layer {layer_idx:<4} | {dominant_experts_str:<45} | {confidence_str}")
    print("=" * 80 + "\n")


# --- 3. Main Testing Workflow ---
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"Loading tokenizer from '{MODEL_PATH}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    print("Manually rebuilding MoE model structure...")
    # First, create an "empty shell" of the model: the correct base
    # architecture with random weights. `from_config` builds the structure
    # without loading any weights.
    moe_model = AutoModelForCausalLM.from_config(config)
    # Then repeat the "architectural surgery", replacing each standard MLP
    # with our MoEModule.
    for layer in moe_model.model.layers:
        layer.mlp = MoEModule(config)

    print("Loading your trained MoE weights into the correct structure...")
    # Load the weights from the safetensors file.
    from safetensors.torch import load_file
    state_dict = load_file(os.path.join(MODEL_PATH, "model.safetensors"), device="cpu")
    # FINAL FIX #1: Use `strict=False` for flexible loading. We know
    # `lm_head.weight` is absent from the checkpoint because of
    # `tie_word_embeddings`, so we allow this "inexact" loading.
    moe_model.load_state_dict(state_dict, strict=False)
    # FINAL FIX #2: Manually tie the weights. This reads the
    # `tie_word_embeddings` setting from the config and links the lm_head
    # to the token embeddings.
    moe_model.tie_weights()

    # Move the finalized model to the GPU and set it to evaluation mode.
    moe_model.to(device, dtype=torch.bfloat16)
    moe_model.eval()
    print("--- Custom MoE Model Successfully Loaded and Finalized! ---")
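
    # Optional sanity check (an illustrative addition, not in the original
    # script): report the total parameter count so you can confirm that all
    # NUM_EXPERTS expert MLPs were actually built in every layer.
    total_params = sum(p.numel() for p in moe_model.parameters())
    print(f"Total parameters (all experts included): {total_params:,}")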
---") # Install our "listening devices" (hooks) on each MoE layer for diagnostics. # 为诊断测试,在每个MoE层上都安装我们的“窃听器”(钩子)。 hooks = [] for i, layer in enumerate(moe_model.model.layers): if isinstance(layer.mlp, MoEModule): hook = layer.mlp.register_forward_hook(get_router_weights_hook(i)) hooks.append(hook) # Design a series of "exam questions" to test different experts. # 设计一系列“考题”来测试不同的专家。 test_prompts = { "Coder": "Write a Python function that takes a list of numbers and returns a new list with only the even numbers.", "Writer": "In a world where shadows have a life of their own, a young lamplighter discovers a terrible secret. Write the opening paragraph.", "Thinker": "If all bloops are gloops, and some gloops are zloops, is it certain that some bloops are zloops? Explain your reasoning.", "Encyclopedia": "What were the primary economic and political causes of the French Revolution?", "Multi-Expert": "In the style of a Shakespearean tragedy, write a short monologue for a software developer lamenting a bug in their code. Include a comment line from the code." } # The main testing loop. # 主测试循环。 for expert_name, prompt in test_prompts.items(): captured_router_weights.clear() # Clear data from the previous run / 清空上一次的捕获数据 print(f"\n--- Testing for: {expert_name} Expert ---") print(f"Prompt: {prompt}") inputs = tokenizer(prompt, return_tensors="pt").to(device) # 1. Functional Test: Generate text / 1. 功能测试:生成文本 with torch.no_grad(): outputs = moe_model.generate(**inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) print("\n--- Generated Text ---") print(generated_text) print("--- End of Generated Text ---") # 2. Diagnostic Test: Visualize router decisions / 2. 诊断测试:可视化路由决策 visualize_router_decisions(prompt) # Clean up by removing all hooks to prevent memory leaks. # 清理工作:移除所有钩子以防止内存泄漏。 for hook in hooks: hook.remove() print("All tests complete and hooks have been removed.") # Script entry point / 脚本入口 if __name__ == "__main__": main()