# ==============================================================================
# Smol-MoE 8x135M - The "Genesis" Master Script
# (Final Optimized Version with All Fixes & Detailed Comments)
# ==============================================================================

# --- Core Library Imports ---

# PyTorch Core
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Hugging Face Transformers library, the source of our "Lego bricks"
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
# The standard FFN module from Llama, which we use as the base for our "Experts"
from transformers.models.llama.modeling_llama import LlamaMLP

# Safetensors library, for safely and efficiently loading model weights
from safetensors.torch import load_file

# Standard Python Libraries
import os            # For handling file paths
import shutil        # For directory operations (like deleting an old model folder)
import numpy as np   # For data analysis (in the final test)
import time          # For timing the training process

# --- 0. Global Configuration & Hyperparameters ---
# This is the master control panel for the entire project. All key parameters are defined here.

# MODEL_NAME: The path to the base model. We use this to load the initial config and tokenizer.
# Note: This should point to a standard, unmodified SmolLM model.
MODEL_NAME = "./SmolLM2-135M-Instruct"

# BASE_EXPERT_PATH: The parent directory containing all 8 of your pre-trained expert model folders.
BASE_EXPERT_PATH = "./models"

# EXPERT_DIRS: A list of the specific directory names for your 8 expert models. The order is important.
EXPERT_DIRS = [
    "SmolLM2-135M-Instruct-Actor",
    "SmolLM2-135M-Instruct-Analyst",
    "SmolLM2-135M-Instruct-Coder",
    "SmolLM2-135M-Instruct-Encyclopedia",
    "SmolLM2-135M-Instruct-Guardian",
    "SmolLM2-135M-Instruct-Summarizer",
    "SmolLM2-135M-Instruct-Thinker",
    "SmolLM2-135M-Instruct-Writer"
]

# MoE Architecture Parameters
NUM_EXPERTS = 8  # The number of experts in our committee
TOP_K = 2        # The number of top experts to route to for each token

# Training Hyperparameters
LEARNING_RATE = 0.001       # The learning rate for the routers. Since we only train routers, it can be slightly higher.
EPOCHS = 20                 # Number of training epochs. Since we're using mock data to validate the process, 20 is sufficient.
                            # This needs to be much higher when using real data.
BATCH_SIZE = 4              # The number of sequences to process in each batch. Adjust based on your VRAM.
SEQUENCE_LENGTH = 128       # The length of text sequences the model processes. Adjust based on your VRAM.
LB_LOSS_COEFFICIENT = 0.01  # The weight coefficient for the load balancing loss. This is a critical "balancing valve"
                            # used to trade off between "doing the job well" and "distributing work fairly."

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. MoE Architecture Component Definitions ---
# These are the blueprints for the new parts we've "invented."

class MoERouter(nn.Module):
    """
    The Router (or Gate Network) - The "CEO" or "dispatcher" of the expert committee.
    Its structure is a simple linear layer, responsible for scoring all experts for each incoming token.
    """
    def __init__(self, hidden_size: int, num_experts: int):
        super().__init__()
        self.layer = nn.Linear(hidden_size, num_experts, bias=False)

    def forward(self, hidden_states):
        # Outputs the "scores" (logits) for each expert, which will later be turned into probabilities via Softmax.
        return self.layer(hidden_states)


class MoEModule(nn.Module):
    """
    The Mixture-of-Experts Module - The "conference room" for the entire expert committee.
    This module replaces the standard FFN (MLP) block in the original Llama model.
    It contains one router (the CEO) and a list of experts (the board members).
    """
    def __init__(self, config):
        super().__init__()
        # Get necessary parameters from the global config
        self.hidden_size = config.hidden_size
        self.top_k = TOP_K
        self.num_experts = NUM_EXPERTS

        # Create the components
        self.router = MoERouter(self.hidden_size, self.num_experts)
        # LlamaMLP is the standard FFN implementation in Hugging Face's Llama, which we use as the base for our "experts".
        self.experts = nn.ModuleList([LlamaMLP(config) for _ in range(self.num_experts)])

        # A placeholder to temporarily store the load balancing loss for this layer during a forward pass
        self.most_recent_lb_loss = None

    def forward(self, hidden_states):
        # Store the original shape to reshape the output at the end
        original_shape = hidden_states.shape
        # Flatten the input from (batch, sequence, dim) to (batch * sequence, dim) for token-level routing
        flat_hidden_states = hidden_states.view(-1, self.hidden_size)

        # --- Step 1: Routing Decision ---
        # Get scores from the router for each token
        router_logits = self.router(flat_hidden_states)
        # Use Softmax to convert scores to probabilities
        routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
        # Select the top-k experts and their corresponding probabilities
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        # Normalize the probabilities of the top-k experts so they sum to 1
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # Cast weights back to the model's main dtype (e.g., bfloat16) for efficiency
        routing_weights = routing_weights.to(hidden_states.dtype)

        # --- Step 2: Calculate and Store Load Balancing Loss ---
        # This is the soul of MoE training: ensuring the router doesn't get "lazy" and uses all experts fairly.
        router_probs_full = F.softmax(router_logits, dim=-1, dtype=torch.float)
        # The average probability for each expert across all tokens
        avg_expert_prob = router_probs_full.mean(dim=0)
        # Checks which experts were chosen for each token
        expert_mask_for_lb = F.one_hot(selected_experts, num_classes=self.num_experts).sum(dim=1)
        # The average fraction of tokens processed by each expert
        avg_expert_fraction = expert_mask_for_lb.float().mean(dim=0)
        # Calculate the loss, multiply by the number of experts as a penalty term, and store it.
        self.most_recent_lb_loss = self.num_experts * torch.sum(avg_expert_prob * avg_expert_fraction)

        # --- Step 3: Expert Computation and Result Aggregation (Vectorized & Efficient) ---
        # Create an empty tensor to store the final results
        final_hidden_states = torch.zeros_like(flat_hidden_states)

        # This loop only iterates `top_k` times (e.g., 2), which is very fast.
        for k in range(self.top_k):
            # Get the expert indices and weights for the k-th choice across all tokens
            expert_indices_k = selected_experts[:, k]
            routing_weights_k = routing_weights[:, k]

            # This loop iterates over all experts, but the computations inside are batched and fast.
            for i in range(self.num_experts):
                # Create a mask to find all tokens that were routed to the current expert `i`
                mask = expert_indices_k == i
                if mask.any():  # If any token was routed to this expert
                    # Process all selected tokens in a single batch
                    expert_output = self.experts[i](flat_hidden_states[mask])
                    # Weight the expert's output by its routing weight and "add" it back to the correct positions in the final tensor
                    final_hidden_states.index_add_(0, torch.where(mask)[0],
                                                   expert_output * routing_weights_k[mask].unsqueeze(1))

        # Reshape the result back to the original (batch, sequence, hidden_size) shape and return
        return final_hidden_states.view(*original_shape)
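
# ------------------------------------------------------------------------------
# Optional sanity check -- a hypothetical helper, not called anywhere in this
# script. It pushes random hidden states through a single MoEModule to confirm
# that the output keeps the (batch, sequence, hidden_size) shape and that a
# load-balancing loss gets recorded. The function name and default sizes are
# illustrative assumptions, not part of the training pipeline.
def sanity_check_moe_module(batch_size: int = 2, seq_len: int = 8):
    config = AutoConfig.from_pretrained(MODEL_NAME)
    moe = MoEModule(config).to(device, dtype=torch.bfloat16)
    hidden = torch.randn(batch_size, seq_len, config.hidden_size, device=device, dtype=torch.bfloat16)
    output = moe(hidden)
    assert output.shape == hidden.shape, "MoE output must preserve the input shape"
    print(f"Sanity check passed | output shape: {tuple(output.shape)} | "
          f"LB loss: {moe.most_recent_lb_loss.item():.4f}")
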
# --- 2. The "Genesis" Function: Assembling, Transplanting, and Modifying the Model ---

def create_moe_model():
    """
    This is the "Architectural Surgery" function. It is responsible for:
    1. Building an empty model skeleton with MoE modules.
    2. "Transplanting" the weights from your 8 pre-trained experts into it.
    3. Freezing all expert parameters, leaving only the routers trainable.
    """
    print("--- Starting Architectural Surgery ---")
    # Load the config from the standard model; this is the "genetic blueprint" for our new model
    config = AutoConfig.from_pretrained(MODEL_NAME)

    print("Step 1: Loading base model skeleton...")
    # Load one of the experts to serve as the "skeleton" for our MoE model.
    # We will use its non-FFN parts (embeddings, attention modules, etc.).
    base_model = AutoModelForCausalLM.from_pretrained(
        os.path.join(BASE_EXPERT_PATH, EXPERT_DIRS[0]),
        torch_dtype=torch.bfloat16,
        device_map=device
    )

    print("Step 2: Pre-loading all expert weights into CPU memory for efficiency...")
    # To improve efficiency, we load all expert weights from disk into CPU RAM at once.
    # We use `safetensors.torch.load_file` as it is the correct and safe way to load .safetensors files.
    all_experts_state_dicts = [
        load_file(os.path.join(BASE_EXPERT_PATH, expert_dir, 'model.safetensors'), device='cpu')
        for expert_dir in EXPERT_DIRS
    ]
    print("All expert weights pre-loaded.")

    print("Step 3: Replacing FFNs with MoE modules and transplanting expert weights...")
    # Iterate through all 30 layers of the model
    for layer_idx, layer in enumerate(base_model.model.layers):
        # In each layer, replace the original, standard LlamaMLP with our custom MoEModule
        layer.mlp = MoEModule(config).to(device, dtype=torch.bfloat16)

        # Begin the "Organ Transplant"
        for expert_idx in range(NUM_EXPERTS):
            # Get the weights for the current expert from memory
            expert_state_dict = all_experts_state_dicts[expert_idx]
            # Filter to get only the weights for the FFN part of the current layer
            expert_mlp_weights = {
                k.replace(f"model.layers.{layer_idx}.mlp.", ""): v
                for k, v in expert_state_dict.items()
                if f"model.layers.{layer_idx}.mlp." in k
            }
            # Load these weights into the corresponding expert "seat" in our MoE module
            layer.mlp.experts[expert_idx].load_state_dict(expert_mlp_weights)

    print("Step 4: Freezing all parameters except for the routers...")
    # This is our key strategy: only train the "CEO", don't disturb the already-smart "experts".
    for name, param in base_model.named_parameters():
        if "router" not in name:
            param.requires_grad = False

    print("\n--- Surgery Complete! MoE Model is assembled and ready for training. ---")
    # Print parameter statistics to verify our operation was successful
    trainable_params = sum(p.numel() for p in base_model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in base_model.parameters())
    print(f"Total Parameters: {total_params / 1e6:.2f}M")
    print(f"Trainable Parameters (Routers): {trainable_params}")

    return base_model
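
# ------------------------------------------------------------------------------
# Optional verification helper -- hypothetical and not called by default. It
# spot-checks that one transplanted expert FFN in the assembled MoE model
# matches the weights on disk, assuming the same single-file 'model.safetensors'
# layout that create_moe_model() reads from.
def verify_transplant(moe_model, layer_idx: int = 0, expert_idx: int = 0):
    state_dict = load_file(
        os.path.join(BASE_EXPERT_PATH, EXPERT_DIRS[expert_idx], 'model.safetensors'),
        device='cpu'
    )
    disk_weight = state_dict[f"model.layers.{layer_idx}.mlp.gate_proj.weight"]
    live_weight = moe_model.model.layers[layer_idx].mlp.experts[expert_idx].gate_proj.weight.detach().cpu()
    assert torch.equal(live_weight, disk_weight.to(live_weight.dtype)), "Transplanted weights differ from disk!"
    print(f"Layer {layer_idx}, expert {expert_idx} ('{EXPERT_DIRS[expert_idx]}'): transplant verified.")
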
---") # Print parameter statistics to verify our operation was successful trainable_params = sum(p.numel() for p in base_model.parameters() if p.requires_grad) total_params = sum(p.numel() for p in base_model.parameters()) print(f"Total Parameters: {total_params / 1e6:.2f}M") print(f"Trainable Parameters (Routers): {trainable_params}") return base_model # --- 3. Main Process: Training and Saving --- def main(): # Step 1: Call the "Genesis" function to create our model moe_model = create_moe_model() # Step 2: Create the optimizer. It's smart enough to only include parameters where `requires_grad=True` (i.e., the routers). optimizer = optim.AdamW([p for p in moe_model.parameters() if p.requires_grad], lr=LEARNING_RATE) print("\n--- Preparing Simulated Mixed Dataset for Training ---") # NOTE: We are using completely random "mock data" here, solely to validate that the entire process runs. # To make the routers truly intelligent, you MUST replace this with a real, diverse dataset. mock_input_ids = torch.randint(0, moe_model.config.vocab_size, (BATCH_SIZE, SEQUENCE_LENGTH), device=device) mock_labels = mock_input_ids.clone() print("--- Starting Router Training Loop (Optimized & Corrected) ---") moe_model.train() # Set the model to training mode start_time = time.time() for epoch in range(EPOCHS): optimizer.zero_grad() # Clear gradients from the previous epoch # --- The Elegant and Correct Forward Pass --- # Call the model directly. Hugging Face automatically handles all complex internal details (like attention masks). # By providing `labels`, it also automatically calculates the main cross-entropy loss for us. outputs = moe_model(input_ids=mock_input_ids, labels=mock_labels) main_loss = outputs.loss # Extract the main task loss # --- Safely Collect Load Balancing Losses --- total_lb_loss = 0.0 for layer in moe_model.model.layers: total_lb_loss += layer.mlp.most_recent_lb_loss # Retrieve the loss stored in our placeholder # --- Calculate the Final "Composite KPI" (Total Loss) --- total_loss = main_loss + LB_LOSS_COEFFICIENT * total_lb_loss # --- Backpropagation and Optimization --- total_loss.backward() # Calculate gradients optimizer.step() # Update router weights # --- Print Training Logs --- if (epoch + 1) % 10 == 0: elapsed_time = time.time() - start_time print(f"Epoch [{epoch+1:03d}/{EPOCHS}] | Total Loss: {total_loss.item():.4f} | " f"Main Loss: {main_loss.item():.4f} | " f"Avg LB Loss: {(total_lb_loss.item() / moe_model.config.num_hidden_layers):.4f} | " f"Time: {elapsed_time:.2f}s") start_time = time.time() print("\n--- Router Training Complete! ---") # --- Step 5: Solidifying our great work onto the disk --- print("\n--- Phase 5: Saving the fully trained MoE model to disk ---") OUTPUT_MODEL_DIR = "./SmolMoE-8x135M-Instruct-v1-Trained" if os.path.exists(OUTPUT_MODEL_DIR): shutil.rmtree(OUTPUT_MODEL_DIR) os.makedirs(OUTPUT_MODEL_DIR) print("Updating model config with MoE-specific parameters...") # We use custom names for our MoE parameters to avoid conflicts with standard Hugging Face generation configs. moe_model.config.moe_num_experts = NUM_EXPERTS moe_model.config.moe_top_k = TOP_K print(f"Saving model to '{OUTPUT_MODEL_DIR}'...") moe_model.save_pretrained(OUTPUT_MODEL_DIR) # Saves weights and the updated config file print("Saving tokenizer...") tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # Save the tokenizer tokenizer.save_pretrained(OUTPUT_MODEL_DIR) print("\n--- Model successfully saved! 
---") print("You can now load this model in other scripts, but you must re-define the custom MoE classes first.") # --- Script Entry Point --- # Ensures that the main() function is only called when this file is executed directly. if __name__ == "__main__": main()