# ==============================================================================
#                 Smol-MoE 8x135M - "The Mind-Reader" Test Script
#                     (Final Version with Correct Loading Logic)
#
#                 Smol-MoE 8x135M - “读心器”测试脚本
#                     (包含正确加载逻辑的最终版本)
# ==============================================================================

# --- Core Library Imports / 核心库导入 ---
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.models.llama.modeling_llama import LlamaMLP, LlamaForCausalLM # We need LlamaMLP for the expert definition / 我们需要 LlamaMLP 来定义专家
import numpy as np
import os # We need this for path operations / 我们需要 os 库来处理文件路径

# --- 1. CRITICAL: Re-define Your Custom Architecture ---
# --- 1. 关键：重新定义你的所有自定义模块 ---
# When loading a model with a custom architecture, Hugging Face needs to know the definition of the custom classes.
# By defining them here, we allow `from_pretrained` to correctly reconstruct our unique MoE model.
# 在加载一个拥有自定义架构的模型时，Hugging Face 需要知道这些自定义类的定义。
# 在这里定义它们，我们才能让 `from_pretrained` 函数成功地重建我们独一无二的MoE模型。

# --- Model Configuration / 模型配置 ---
MODEL_PATH = "./SmolMoE-8x135M-Instruct-v1-Trained"
config = AutoConfig.from_pretrained(MODEL_PATH)
# Load our custom MoE parameters from the saved config file.
# 从我们保存的配置文件中，加载自定义的MoE参数。
NUM_EXPERTS = config.moe_num_experts
TOP_K = config.moe_top_k

# A list of expert names for clear visualization later. The order must match the training script.
# 用于后续清晰可视化的专家名称列表。顺序必须和训练脚本中的保持一致。
EXPERT_NAMES = [
    "Actor", "Analyst", "Coder", "Encyclopedia",
    "Guardian", "Summarizer", "Thinker", "Writer"
]

class MoERouter(nn.Module):
    """The Router module. Its job is to score experts for each token."""
    """路由器模块。它的工作是为每个token给所有专家打分。"""
    def __init__(self, hidden_size: int, num_experts: int):
        super().__init__()
        self.layer = nn.Linear(hidden_size, num_experts, bias=False)
    def forward(self, hidden_states):
        return self.layer(hidden_states)

class MoEModule(nn.Module):
    """The custom Mixture-of-Experts module that replaces the standard FFN."""
    """我们自定义的混合专家模块，它替换了标准的FFN。"""
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.top_k = TOP_K
        self.num_experts = NUM_EXPERTS
        self.router = MoERouter(self.hidden_size, self.num_experts)
        self.experts = nn.ModuleList([LlamaMLP(config) for _ in range(self.num_experts)])

    def forward(self, hidden_states):
        original_shape = hidden_states.shape
        flat_hidden_states = hidden_states.view(-1, self.hidden_size)
        router_logits = self.router(flat_hidden_states)
        routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        routing_weights = routing_weights.to(hidden_states.dtype)
        final_hidden_states = torch.zeros_like(flat_hidden_states)
        for k in range(self.top_k):
            expert_indices_k = selected_experts[:, k]
            routing_weights_k = routing_weights[:, k]
            for i in range(self.num_experts):
                mask = expert_indices_k == i
                if mask.any():
                    expert_output = self.experts[i](flat_hidden_states[mask])
                    final_hidden_states.index_add_(0, torch.where(mask)[0], expert_output * routing_weights_k[mask].unsqueeze(1))
        return final_hidden_states.view(*original_shape)

# --- 2. Core Diagnostic Tools / 诊断测试的核心工具 ---
# This dictionary will store the router decisions captured by our hooks.
# 这个字典将用于存储我们的钩子捕获到的路由器决策数据。
captured_router_weights = {}

def get_router_weights_hook(layer_idx):
    """This is a factory function that creates our hook."""
    """这是一个创建钩子函数的工厂函数。"""
    def hook(module, input, output):
        # `input[0]` is the hidden_states tensor passed to the MoE module.
        # `input[0]` 是传入MoE模块的hidden_states张量。
        router_logits = module.router(input[0])
        # We calculate the average routing probability for all tokens in the sequence.
        # 我们计算序列中所有token的平均路由概率。
        avg_probs = F.softmax(router_logits, dim=-1).mean(dim=[0, 1])

        # *** FINAL FIX: Convert from BFloat16 to Float32 before converting to NumPy. ***
        # NumPy does not support the bfloat16 dtype, so we must convert it first.
        # *** 最终修复：在转换为numpy数组前，先将数据格式从BFloat16转换为Float32。***
        # NumPy库不支持bfloat16这种数据类型，所以我们必须先进行转换。
        captured_router_weights[layer_idx] = avg_probs.detach().cpu().to(torch.float32).numpy()
    return hook

def visualize_router_decisions(prompt):
    """A helper function to print the captured router decisions in a nice table."""
    """一个辅助函数，用于将捕获到的路由器决策以漂亮的表格形式打印出来。"""
    print("\n" + "="*80)
    print(f"ROUTER DECISION ANALYSIS for Prompt: '{prompt[:50]}...'")
    print("="*80)
    print(f"{'Layer':<7} | {'Dominant Expert(s)':<45} | {'Confidence'}")
    print("-"*80)
    for layer_idx, weights in captured_router_weights.items():
        top2_indices = np.argsort(weights)[-2:][::-1]
        dominant_experts_str = f"1. {EXPERT_NAMES[top2_indices[0]]} | 2. {EXPERT_NAMES[top2_indices[1]]}"
        confidence_str = f"({weights[top2_indices[0]]:.1%} | {weights[top2_indices[1]]:.1%})"
        print(f"Layer {layer_idx:<4} | {dominant_experts_str:<45} | {confidence_str}")
    print("="*80 + "\n")

# --- 3. Main Testing Workflow / 主测试流程 ---
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    print(f"Loading tokenizer from '{MODEL_PATH}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    
    print(f"Manually rebuilding MoE model structure...")
    # First, we create an "empty shell" of our model with the correct architecture but random weights.
    # `from_config` builds the structure without loading any weights.
    # 首先，我们用`from_config`创建一个拥有正确架构但权重是随机的“空壳”模型。
    # 这一步只搭建骨架，不加载任何权重。
    moe_model = AutoModelForCausalLM.from_config(config)
    
    # Then, we perform the "architectural surgery" again, replacing standard MLPs with our MoEModules.
    # 然后，我们再次手动进行“架构手术”，把标准的MLP替换成我们的MoE模块。
    for i, layer in enumerate(moe_model.model.layers):
        layer.mlp = MoEModule(config)
        
    print(f"Loading your trained MoE weights into the correct structure...")
    # Load the weights from the safetensors file.
    # 从safetensors文件加载权重。
    from safetensors.torch import load_file
    state_dict = load_file(os.path.join(MODEL_PATH, "model.safetensors"), device="cpu")
    
    # *** FINAL FIX #1: Use `strict=False` for flexible loading. ***
    # We know `lm_head.weight` is missing because of `tie_word_embeddings`, so we allow this "inexact" loading.
    # *** 最终修复 #1：使用`strict=False`进行灵活加载。***
    # 我们知道因为`tie_word_embeddings`的设置，`lm_head.weight`是缺失的，所以我们允许这种“不严格”的加载。
    moe_model.load_state_dict(state_dict, strict=False)
    
    # *** FINAL FIX #2: Manually tie the weights. ***
    # This function reads the `tie_word_embeddings` setting from the config and correctly links the lm_head to the token embeddings.
    # *** 最终修复 #2：手动执行权重绑定。***
    # 这个函数会根据config中的`tie_word_embeddings`设置，将lm_head和词嵌入层正确地绑定在一起。
    moe_model.tie_weights()
    
    # Move the finalized model to the GPU and set it to evaluation mode.
    # 将最终完成的模型移动到GPU，并设置为评估模式。
    moe_model.to(device, dtype=torch.bfloat16)
    moe_model.eval()
    print("--- Custom MoE Model Successfully Loaded and Finalized! ---")

    # Install our "listening devices" (hooks) on each MoE layer for diagnostics.
    # 为诊断测试，在每个MoE层上都安装我们的“窃听器”（钩子）。
    hooks = []
    for i, layer in enumerate(moe_model.model.layers):
        if isinstance(layer.mlp, MoEModule):
            hook = layer.mlp.register_forward_hook(get_router_weights_hook(i))
            hooks.append(hook)

    # Design a series of "exam questions" to test different experts.
    # 设计一系列“考题”来测试不同的专家。
    test_prompts = {
        "Coder": "Write a Python function that takes a list of numbers and returns a new list with only the even numbers.",
        "Writer": "In a world where shadows have a life of their own, a young lamplighter discovers a terrible secret. Write the opening paragraph.",
        "Thinker": "If all bloops are gloops, and some gloops are zloops, is it certain that some bloops are zloops? Explain your reasoning.",
        "Encyclopedia": "What were the primary economic and political causes of the French Revolution?",
        "Multi-Expert": "In the style of a Shakespearean tragedy, write a short monologue for a software developer lamenting a bug in their code. Include a comment line from the code."
    }

    # The main testing loop.
    # 主测试循环。
    for expert_name, prompt in test_prompts.items():
        captured_router_weights.clear() # Clear data from the previous run / 清空上一次的捕获数据
        print(f"\n--- Testing for: {expert_name} Expert ---")
        print(f"Prompt: {prompt}")
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        
        # 1. Functional Test: Generate text / 1. 功能测试：生成文本
        with torch.no_grad():
            outputs = moe_model.generate(**inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("\n--- Generated Text ---")
        print(generated_text)
        print("--- End of Generated Text ---")
        
        # 2. Diagnostic Test: Visualize router decisions / 2. 诊断测试：可视化路由决策
        visualize_router_decisions(prompt)
    
    # Clean up by removing all hooks to prevent memory leaks.
    # 清理工作：移除所有钩子以防止内存泄漏。
    for hook in hooks:
        hook.remove()
    print("All tests complete and hooks have been removed.")

# Script entry point / 脚本入口
if __name__ == "__main__":
    main()