# ==============================================================================
# Smol-MoE 8x135M - "Chat with Your Creation"
# (Final Interactive Inference Script)
# ==============================================================================

# --- Core Library Imports ---
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.models.llama.modeling_llama import LlamaMLP

# --- 1. CRITICAL: Re-define Your Custom Architecture ---
# When loading a model with a custom architecture, Hugging Face needs the
# definitions of the custom classes. Defining them here lets the loading code
# below rebuild our unique MoE model correctly.

# --- Model Configuration ---
MODEL_PATH = "./SmolMoE-8x135M-Instruct-v1-Trained"

# Load our custom MoE parameters from the saved config file.
config = AutoConfig.from_pretrained(MODEL_PATH)
NUM_EXPERTS = config.moe_num_experts
TOP_K = config.moe_top_k


class MoERouter(nn.Module):
    """The Router module. Its job is to score every expert for each token."""

    def __init__(self, hidden_size: int, num_experts: int):
        super().__init__()
        self.layer = nn.Linear(hidden_size, num_experts, bias=False)

    def forward(self, hidden_states):
        return self.layer(hidden_states)


class MoEModule(nn.Module):
    """The custom Mixture-of-Experts module that replaces the standard FFN."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.top_k = TOP_K
        self.num_experts = NUM_EXPERTS
        self.router = MoERouter(self.hidden_size, self.num_experts)
        self.experts = nn.ModuleList([LlamaMLP(config) for _ in range(self.num_experts)])

    def forward(self, hidden_states):
        original_shape = hidden_states.shape
        flat_hidden_states = hidden_states.view(-1, self.hidden_size)

        # Score every expert for every token, keep the top-k per token, and
        # re-normalize the kept weights so they sum to 1.
        router_logits = self.router(flat_hidden_states)
        routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        routing_weights = routing_weights.to(hidden_states.dtype)

        # Accumulate each selected expert's output, weighted by its routing weight.
        final_hidden_states = torch.zeros_like(flat_hidden_states)
        for k in range(self.top_k):
            expert_indices_k = selected_experts[:, k]
            routing_weights_k = routing_weights[:, k]
            for i in range(self.num_experts):
                mask = expert_indices_k == i
                if mask.any():
                    expert_output = self.experts[i](flat_hidden_states[mask])
                    final_hidden_states.index_add_(
                        0,
                        torch.where(mask)[0],
                        expert_output * routing_weights_k[mask].unsqueeze(1),
                    )
        return final_hidden_states.view(*original_shape)

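
# --- Optional sanity check (illustrative sketch, not part of the original script) ---
# A quick, weight-free demonstration of the routing math used in `MoEModule.forward`:
# score a few random token vectors, keep the top-k experts per token, and
# re-normalize the kept weights. The helper name `demo_router_topk` is our own
# addition; nothing here touches the trained checkpoint.
def demo_router_topk(num_tokens: int = 4):
    router = MoERouter(config.hidden_size, NUM_EXPERTS)
    dummy_tokens = torch.randn(num_tokens, config.hidden_size)
    logits = router(dummy_tokens)                        # (num_tokens, NUM_EXPERTS)
    weights = F.softmax(logits, dim=-1)
    top_w, top_idx = torch.topk(weights, TOP_K, dim=-1)  # (num_tokens, TOP_K) each
    top_w = top_w / top_w.sum(dim=-1, keepdim=True)      # kept weights sum to 1 per token
    print("Selected experts per token:", top_idx.tolist())
    print("Normalized routing weights:", top_w.tolist())
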

# --- 2. Main Program: Load Model and Start Conversation ---
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Model Loading ---
    print(f"Loading tokenizer from '{MODEL_PATH}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    print("Manually rebuilding MoE model structure...")
    # First, create an "empty shell" of the model with the correct architecture
    # but random weights. `from_config` builds the structure without loading any weights.
    moe_model = AutoModelForCausalLM.from_config(config)

    # Then, perform the "architectural surgery" again, replacing each standard MLP
    # with our MoEModule.
    for layer in moe_model.model.layers:
        layer.mlp = MoEModule(config)

    print("Loading your trained MoE weights into the correct structure...")
    from safetensors.torch import load_file

    state_dict = load_file(os.path.join(MODEL_PATH, "model.safetensors"), device="cpu")

    # Use `strict=False` for flexible loading, then tie the weights manually.
    # This handles the `lm_head.weight` key that is missing because of weight tying.
    moe_model.load_state_dict(state_dict, strict=False)
    moe_model.tie_weights()

    # Move the finalized model to the GPU and switch to evaluation mode.
    moe_model.to(device, dtype=torch.bfloat16)
    moe_model.eval()

    print("--- MoE Model is ready for conversation! ---")
    print("Type 'exit' or 'quit' to end the chat.\n")

    # --- Interactive Conversation Loop ---
    messages = []
    while True:
        try:
            user_input = input("You: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Goodbye!")
                break

            # Step 1: Add the user's input to the conversation history.
            messages.append({"role": "user", "content": user_input})

            # Step 2: Format the entire conversation history with the chat template.
            # `add_generation_prompt=True` appends the opening tokens of the assistant's turn.
            prompt_text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Step 3: Encode the prompt and move it to the GPU.
            inputs = tokenizer(prompt_text, return_tensors="pt").to(device)

            # Step 4: Generate a response.
            with torch.no_grad():
                outputs = moe_model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                )

            # Step 5: Decode and clean the output.
            # `outputs[0]` contains the full conversation (prompt + completion),
            # so we extract only the newly generated part.
            full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Locate the start of the assistant's actual response.
            assistant_prompt_start = "<|assistant|>\n"
            assistant_response_start_index = full_response.rfind(assistant_prompt_start)
            if assistant_response_start_index != -1:
                model_response = full_response[
                    assistant_response_start_index + len(assistant_prompt_start):
                ].strip()
            else:
                # Fallback for simpler cases: strip the original prompt from the full response.
                model_response = full_response.replace(prompt_text, "").strip()

            print(f"MoE Model: {model_response}")

            # Step 6: Append the model's response to the history for multi-turn conversation.
            messages.append({"role": "assistant", "content": model_response})

        except KeyboardInterrupt:
            print("\nGoodbye!")
            break


# Script entry point
if __name__ == "__main__":
    main()
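
# ------------------------------------------------------------------------------
# Optional note (illustrative sketch, not part of the original script):
# `load_state_dict(..., strict=False)` returns a NamedTuple listing the keys it
# could not match. To confirm that weight tying is the only reason for the
# relaxed loading (i.e. only `lm_head.weight` is reported missing), the call in
# `main()` could be captured like this:
#
#     load_result = moe_model.load_state_dict(state_dict, strict=False)
#     print("Missing keys:", load_result.missing_keys)        # expected: ['lm_head.weight']
#     print("Unexpected keys:", load_result.unexpected_keys)  # expected: []
# ------------------------------------------------------------------------------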