snwy committed on
Commit aee6a1a · verified · 1 Parent(s): ecc0b2d

repro code

Files changed (8)
  1. activation_stats.py +398 -0
  2. finetune_qwen.py +1267 -0
  3. layer_influence.py +375 -0
  4. layer_surgery.py +279 -0
  5. moe_to_dense.py +1097 -0
  6. sample.txt +65 -0
  7. scales.json +8 -0
  8. visualize_activations.py +467 -0
activation_stats.py ADDED
@@ -0,0 +1,398 @@
#!/usr/bin/env python3
# fmt: off

import argparse
import json
import math
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, List, Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


@dataclass
class RunningStat:
    count: int = 0
    sum: float = 0.0
    sumsq: float = 0.0
    min: Optional[float] = None
    max: Optional[float] = None
    zero_count: int = 0
    nan_count: int = 0
    inf_count: int = 0

    def update_from_tensor(self, t: torch.Tensor):
        with torch.no_grad():
            nan_mask = torch.isnan(t)
            inf_mask = torch.isinf(t)
            self.nan_count += int(nan_mask.sum().item())
            self.inf_count += int(inf_mask.sum().item())
            t = torch.nan_to_num(t, nan=0.0, posinf=0.0, neginf=0.0)

            self.zero_count += int((t == 0).sum().item())

            tf = t.float()
            self.sum += float(tf.sum().item())
            self.sumsq += float((tf * tf).sum().item())
            self.count += t.numel()

            t_min = float(t.min().item())
            t_max = float(t.max().item())
            if self.min is None or t_min < self.min:
                self.min = t_min
            if self.max is None or t_max > self.max:
                self.max = t_max

    @property
    def mean(self) -> Optional[float]:
        if self.count == 0:
            return None
        return self.sum / self.count

    @property
    def var(self) -> Optional[float]:
        if self.count == 0:
            return None
        m = self.mean
        return max(0.0, self.sumsq / self.count - (m * m))

    @property
    def std(self) -> Optional[float]:
        v = self.var
        if v is None:
            return None
        return math.sqrt(v)

    def to_dict(self) -> Dict[str, Any]:
        d = asdict(self)
        d["mean"] = self.mean
        d["std"] = self.std
        return d


@dataclass
class TokenRMSStat:
    count: int = 0
    sum: float = 0.0
    sumsq: float = 0.0

    def update_from_tensor(self, t: torch.Tensor):
        with torch.no_grad():
            if t.ndim == 1:
                feats = t.unsqueeze(0)
            else:
                feats = t.view(-1, t.shape[-1])
            rms = feats.float().pow(2).mean(dim=-1).sqrt()
            rms = torch.nan_to_num(rms, nan=0.0, posinf=0.0, neginf=0.0)
            self.count += int(rms.numel())
            self.sum += float(rms.sum().item())
            self.sumsq += float((rms * rms).sum().item())

    @property
    def mean(self) -> Optional[float]:
        if self.count == 0:
            return None
        return self.sum / self.count

    @property
    def var(self) -> Optional[float]:
        if self.count == 0:
            return None
        m = self.mean
        return max(0.0, self.sumsq / self.count - (m * m))

    @property
    def std(self) -> Optional[float]:
        v = self.var
        if v is None:
            return None
        return math.sqrt(v)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "count": self.count,
            "mean": self.mean,
            "std": self.std,
        }


class ActivationMonitor:
    def __init__(self, use_tensorboard: bool = False, tb_dir: Optional[str] = None):
        self.stats: Dict[str, RunningStat] = {}
        self.token_rms: Dict[str, TokenRMSStat] = {}
        self.use_tensorboard = use_tensorboard
        self.tb = None
        self._global_step = 0
        if self.use_tensorboard and tb_dir is not None:
            try:
                from torch.utils.tensorboard import SummaryWriter

                self.tb = SummaryWriter(log_dir=tb_dir)
            except Exception as e:
                print(f"TensorBoard not available: {e}")

    def _get_stat(self, name: str) -> RunningStat:
        if name not in self.stats:
            self.stats[name] = RunningStat()
        return self.stats[name]

    def _get_token_rms(self, name: str) -> TokenRMSStat:
        if name not in self.token_rms:
            self.token_rms[name] = TokenRMSStat()
        return self.token_rms[name]

    def hook(self, name: str):
        def _hook(module, inputs, output):
            with torch.no_grad():
                t = output
                if isinstance(t, tuple):
                    t = t[0]
                if not isinstance(t, torch.Tensor):
                    return
                self._get_stat(name).update_from_tensor(t)
                self._get_token_rms(name).update_from_tensor(t)

                if self.tb is not None and (self._global_step % 10 == 0):
                    rs = self.stats[name]
                    tr = self.token_rms[name]
                    if rs.count > 0:
                        self.tb.add_scalar(
                            f"{name}/mean", rs.mean, self._global_step
                        )
                        if rs.std is not None:
                            self.tb.add_scalar(
                                f"{name}/std", rs.std, self._global_step
                            )
                        self.tb.add_scalar(
                            f"{name}/zero_frac",
                            rs.zero_count / max(1, rs.count),
                            self._global_step,
                        )
                    if tr.count > 0 and tr.mean is not None:
                        self.tb.add_scalar(
                            f"{name}/token_rms_mean",
                            tr.mean,
                            self._global_step,
                        )
            return

        return _hook

    def step(self):
        self._global_step += 1

    def close(self):
        if self.tb is not None:
            self.tb.flush()
            self.tb.close()

    def to_dict(self) -> Dict[str, Any]:
        out: Dict[str, Any] = {}
        for k in sorted(self.stats.keys()):
            out[k] = {
                "global": self.stats[k].to_dict(),
                "token_rms": self.token_rms[k].to_dict(),
            }
        return out


def find_modules_to_hook(
    model: torch.nn.Module, patterns: List[str]
) -> List[str]:
    names: List[str] = []
    for name, _ in model.named_modules():
        lname = name.lower()
        if not lname.startswith("model.layers."):
            continue
        for p in patterns:
            if p in lname:
                names.append(name)
                break
    return sorted(list(set(names)))


def compute_attention_entropy(
    model: AutoModelForCausalLM,
    tok: AutoTokenizer,
    prompts: List[str],
    max_length: int,
    input_device: torch.device,
) -> Dict[int, float]:
    prev = getattr(model.config, "output_attentions", False)
    model.config.output_attentions = True

    with torch.inference_mode():
        enc = tok(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        for k in enc:
            enc[k] = enc[k].to(input_device)
        out = model(**enc, output_attentions=True, use_cache=False)
        atts = out.attentions
        entropies: Dict[int, float] = {}
        for i, att in enumerate(atts):
            probs = att.float().clamp_min(1e-12)
            ent = -(probs * probs.log()).sum(dim=-1)
            ent_mean = float(ent.mean().item())
            entropies[i] = ent_mean

    model.config.output_attentions = prev
    return entropies


def load_prompts(
    prompts: Optional[str], prompts_file: Optional[str]
) -> List[str]:
    lines: List[str] = []
    if prompts_file:
        with open(prompts_file, "r", encoding="utf-8") as f:
            for line in f:
                s = line.strip("\n")
                if s:
                    lines.append(s)
    if prompts:
        for s in prompts.split("\n"):
            s = s.strip()
            if s:
                lines.append(s)
    if not lines:
        lines = [
            "Hello! Briefly introduce yourself.",
            "Explain the concept of attention in transformers.",
            "List three use cases for large language models.",
        ]
    return lines


def main():
    ap = argparse.ArgumentParser(
        description="Activation statistics monitor for HF CausalLM models."
    )
    ap.add_argument("--model", type=str, required=True, help="Model path or HF ID.")
    ap.add_argument("--prompts", type=str)
    ap.add_argument("--prompts_file", type=str)
    ap.add_argument("--max_length", type=int, default=256)
    ap.add_argument("--batch_size", type=int, default=4)
    ap.add_argument(
        "--dtype",
        type=str,
        default="bfloat16",
        choices=["bfloat16", "float16", "float32"],
    )
    ap.add_argument("--device_map", type=str, default="auto")
    ap.add_argument(
        "--patterns",
        type=str,
        default=(
            "q_proj,k_proj,v_proj,o_proj,mlp.up_proj,mlp.gate_proj,"
            "mlp.down_proj,layernorm,norm"
        ),
    )
    ap.add_argument("--save_json", type=str)
    ap.add_argument("--tensorboard_dir", type=str)
    ap.add_argument("--attention_entropy", action="store_true")
    args = ap.parse_args()

    dtype_map = {
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }
    torch_dtype = dtype_map[args.dtype]

    print(f"Loading tokenizer/model: {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype=torch_dtype,
        trust_remote_code=True,
        device_map=args.device_map,
    )
    model.eval()

    embed_device = model.get_input_embeddings().weight.device
    print(f"Sending inputs to: {embed_device}")

    patterns = [p.strip().lower() for p in args.patterns.split(",") if p.strip()]
    to_hook = find_modules_to_hook(model, patterns)

    mon = ActivationMonitor(
        use_tensorboard=args.tensorboard_dir is not None,
        tb_dir=args.tensorboard_dir,
    )
    handles = []
    for name, module in model.named_modules():
        if name in to_hook:
            handles.append(module.register_forward_hook(mon.hook(name)))
    print(f"Registered hooks on {len(handles)} modules.")

    prompts = load_prompts(args.prompts, args.prompts_file)

    with torch.inference_mode():
        i = 0
        while i < len(prompts):
            batch_prompts = prompts[i : i + args.batch_size]
            i += args.batch_size
            enc = tok(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=args.max_length,
            )
            for k in enc:
                enc[k] = enc[k].to(embed_device)
            _ = model(**enc, use_cache=False)
            mon.step()

    attn_entropy: Dict[int, float] = {}
    if args.attention_entropy:
        subset = prompts[: min(len(prompts), args.batch_size)]
        attn_entropy = compute_attention_entropy(
            model, tok, subset, args.max_length, embed_device
        )

    for h in handles:
        h.remove()
    mon.close()

    stats = mon.to_dict()
    if args.attention_entropy:
        stats["_attention_entropy"] = attn_entropy

    print("\nActivation summary (top 10 by token_rms mean):")
    ranked = sorted(
        [
            (name, d["token_rms"]["mean"] or 0.0)
            for name, d in stats.items()
            if name != "_attention_entropy"
        ],
        key=lambda x: x[1],
        reverse=True,
    )[:10]
    for name, rms_mean in ranked:
        g = stats[name]["global"]
        zero_frac = g.get("zero_count", 0) / max(1, g.get("count", 1))
        print(
            f"- {name}: token_rms_mean={rms_mean:.4f}, "
            f"mean={g.get('mean'):.4f} std={g.get('std'):.4f} "
            f"min={g.get('min'):.4f} max={g.get('max'):.4f} "
            f"zero_frac={zero_frac:.4f}"
        )

    if args.save_json:
        out_path = Path(args.save_json)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "w") as f:
            json.dump(stats, f, indent=2)
        print(f"\nSaved stats JSON to: {out_path}")


if __name__ == "__main__":
    main()
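RunningStat accumulates count, sum, and sum of squares per hooked module, so mean and std come from E[x²] − E[x]² without keeping activations in memory. A minimal sketch of how the accumulator can be cross-checked against PyTorch's population std, assuming activation_stats.py is importable from the working directory (the tensors here are purely illustrative):

import torch

from activation_stats import RunningStat  # assumed import path; adjust to where the file lives

stat = RunningStat()
chunks = [torch.randn(4, 8), torch.randn(3, 8)]
for c in chunks:
    stat.update_from_tensor(c)

flat = torch.cat([c.reshape(-1) for c in chunks]).float()
# Population variance (unbiased=False) is what sumsq/count - mean**2 computes.
assert abs(stat.mean - flat.mean().item()) < 1e-4
assert abs(stat.std - flat.std(unbiased=False).item()) < 1e-4
print(stat.to_dict())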
finetune_qwen.py ADDED
@@ -0,0 +1,1267 @@
import os
import torch
from datasets import load_from_disk, concatenate_datasets, Dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from peft.tuners.lora import LoraLayer
from trl import SFTTrainer, SFTConfig
import logging
import torch.distributed as dist
from datetime import timedelta, datetime
import time
from transformers.trainer import TrainerCallback
import gc
import sys
import shutil  # For handling file operations
import glob  # For file pattern matching
import threading  # For background cleanup
import multiprocessing
import subprocess
import tempfile
import json
import random
import math
import queue
import numpy as np

# Import the specific layer class for FSDP wrapping
try:
    from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
except ImportError:
    logging.warning("Could not import Qwen2DecoderLayer. FSDP wrapping might fail.")
    Qwen2DecoderLayer = None

# Configure more detailed logging with timestamps
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stdout,  # Ensure logs go to stdout for immediate visibility
    force=True
)

# Set up temporary directory for cache files
temp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "temp")
os.makedirs(temp_dir, exist_ok=True)
logging.info(f"Using temporary directory: {temp_dir}")

# Set environment variables to control temporary file creation
os.environ["TMPDIR"] = temp_dir  # Unix
os.environ["TEMP"] = temp_dir  # Windows
os.environ["TMP"] = temp_dir  # Windows alternative

# Set default cache locations
hf_datasets_cache_path = os.path.join(temp_dir, "hf_datasets_cache")
transformers_cache_path = os.path.join(temp_dir, "transformers_cache")
hf_home_path = os.path.join(temp_dir, "hf_home")
os.makedirs(hf_datasets_cache_path, exist_ok=True)
os.makedirs(transformers_cache_path, exist_ok=True)
os.makedirs(hf_home_path, exist_ok=True)

os.environ["HF_DATASETS_CACHE"] = hf_datasets_cache_path
os.environ["TRANSFORMERS_CACHE"] = transformers_cache_path
os.environ["HF_HOME"] = hf_home_path
logging.info(f"Hugging Face Datasets cache directed to: {hf_datasets_cache_path}")
logging.info(f"Hugging Face Transformers cache directed to: {transformers_cache_path}")

# Keep forcing Arrow to use system memory pool if possible
os.environ["ARROW_DEFAULT_MEMORY_POOL"] = "system"
logging.info("Configured temporary directory and cache locations.")

# Set environment variable to control PyTorch's memory allocator
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"
# Disable PYTORCH_NO_CUDA_MEMORY_CACHING for better performance
if "PYTORCH_NO_CUDA_MEMORY_CACHING" in os.environ:
    del os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"]
# Set a longer timeout for NCCL operations
os.environ["NCCL_BLOCKING_WAIT"] = "1"
os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
os.environ["NCCL_TIMEOUT"] = "3600"  # 1 hour timeout for NCCL operations

# Initialize distributed environment with better error handling
def init_distributed():
    try:
        # Check if we're in a distributed training environment
        if "WORLD_SIZE" in os.environ and int(os.environ["WORLD_SIZE"]) > 1:
            # Set memory optimization environment variables
            if int(os.environ.get("LOCAL_RANK", 0)) == 0:
                logging.info("Setting PyTorch memory optimizations for H200 GPUs")
            # Empty CUDA cache before initializing process group
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                logging.info("CUDA cache cleared")

            local_rank = int(os.environ.get("LOCAL_RANK", 0))
            world_size = int(os.environ.get("WORLD_SIZE", 1))
            rank = int(os.environ.get("RANK", 0))

            logging.info(f"Initializing distributed training for 8x H200s. Rank: {rank}, Local Rank: {local_rank}, World Size: {world_size}")

            # Set the device for this process explicitly before initializing
            torch.cuda.set_device(local_rank)
            logging.info(f"Setting device {local_rank} for process rank {rank}")

            # Set a longer timeout to handle long operations (3 hours)
            timeout = timedelta(hours=3)

            # Initialize the distributed process group
            dist.init_process_group(
                backend='nccl',
                init_method='env://',
                timeout=timeout,
                rank=rank,
                world_size=world_size
            )

            # Verify initialization was successful
            if dist.is_initialized():
                logging.info(f"Successfully initialized distributed process group. Rank: {rank}, Device: {torch.cuda.current_device()}")
                # Log NCCL environment
                logging.info(f"NCCL Version: {torch.cuda.nccl.version() if hasattr(torch.cuda, 'nccl') else 'unknown'}")
                logging.info(f"CUDA Device Count: {torch.cuda.device_count()}")
                logging.info(f"CUDA Device Name: {torch.cuda.get_device_name(local_rank)}")
            else:
                logging.error(f"Failed to initialize distributed process group. Rank: {rank}")

            # Ensure all processes can communicate with specified device
            try:
                device_ids = [local_rank]
                dist.barrier(device_ids=device_ids)
                logging.info(f"Communication test successful. Process {rank} on device {local_rank} can communicate.")
            except Exception as e:
                logging.error(f"Communication test failed. Processes cannot communicate: {str(e)}. Rank: {rank}")
                raise

            return True
        else:
            logging.info("Not running in distributed mode.")
            return False
    except Exception as e:
        logging.error(f"Error initializing distributed environment: {str(e)}")
        raise

# Initialize distributed environment
distributed_mode = init_distributed()

# --- Configuration ---

# Model ID updated based on user input
MODEL_ID = "Qwen/QwQ-32B"

# Path to the processed dataset created by preprocess_data.py
DATASET_PATH = "./processed_datasets/combined_code_finetune_data"

# Number of examples to use (set to -1 for all)
MAX_EXAMPLES = -1  # Use all examples by default

# LoRA configuration (Optimized for 8x H200 GPUs)
LORA_R = 64  # Doubled to increase parameter count significantly
LORA_ALPHA = 128  # Increased alpha to match r
LORA_DROPOUT = 0.05  # Dropout probability for LoRA layers
# Target modules might need verification for QwQ-32B specifically (see the sketch below).
# Common targets for Qwen models:
LORA_TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    # "embed_tokens",  # Removed to reduce overhead/complexity
    # "lm_head",  # Removed to reduce overhead/complexity
]
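# Illustrative sketch (commented out, not executed by this script): the target module
# names above can be confirmed against the actual QwQ-32B checkpoint without loading
# its weights, assuming `accelerate` is installed for init_empty_weights:
#
#   from accelerate import init_empty_weights
#   cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
#   with init_empty_weights():
#       _m = AutoModelForCausalLM.from_config(cfg, trust_remote_code=True)
#   print(sorted({n.split(".")[-1] for n, mod in _m.named_modules()
#                 if isinstance(mod, torch.nn.Linear)}))
#   # expected to include: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj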

# Training arguments optimized for 8x H200 GPUs with memory constraints
OUTPUT_DIR = "./qwq-32b-finetuned-adapters"
PER_DEVICE_TRAIN_BATCH_SIZE = 8  # Increase BS after halving seq length again
GRADIENT_ACCUMULATION_STEPS = 6  # Decrease accumulation (8*8*6 = 384)
# Global batch size = PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS * NumGPUs
# Example: 8 * 6 * 8 = 384
LEARNING_RATE = 3e-5  # Slightly higher LR for larger batch size
EPOCHS = 1  # Start with 1 epoch, increase cautiously
MAX_SEQ_LENGTH = 4096  # Halved sequence length again
LOGGING_STEPS = 50  # Increased logging frequency
SAVE_STEPS = 500  # Increased save frequency
OPTIMIZER = "adamw_bnb_8bit"  # Use 8-bit optimizer to save significant memory
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = "cosine"

# H200-specific optimizations (8x setup)
USE_FLASH_ATTN = True  # Enable Flash Attention 2 for H200s
USE_SEQUENCE_PARALLEL = False  # Disable when using FSDP
USE_BETTER_TRANSFORMERS = True  # Use better transformers for optimized kernels
DATALOADER_NUM_WORKERS = 8  # Reduced workers to avoid CPU contention
TOKENIZATION_NUM_WORKERS = 224  # Maximum worker count for tokenization
USE_ACTIVATION_CHECKPOINTING = True  # Enable activation checkpointing to save memory with long sequences

# Advanced distributed training options for 8x GPUs
USE_FSDP = True  # Enable FSDP
FSDP_CONFIG = {
    "fsdp_offload_params": False,  # Disable CPU Offload
    "fsdp_sharding_strategy": 1,  # 1 = FULL_SHARD
    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "fsdp_transformer_layer_cls_to_wrap": [Qwen2DecoderLayer.__name__] if Qwen2DecoderLayer else [],
    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
    "fsdp_backward_prefetch": "backward_post",  # Changed from backward_pre
    "fsdp_forward_prefetch": False,  # Disabled forward prefetch
    "fsdp_activation_checkpointing": [Qwen2DecoderLayer.__name__] if Qwen2DecoderLayer else [],  # Use FSDP activation checkpointing
}

# WandB Integration
REPORT_TO_WANDB = True  # Set to False to disable WandB reporting
WANDB_PROJECT_NAME = "QwQ-32B-Finetune-8xH200"  # Updated for 8x GPUs
WANDB_ENTITY = None  # Set to your username or team name if needed

# Determine report_to destination
report_to = "none"
if REPORT_TO_WANDB:
    # Disable WandB in all processes except rank 0 in distributed mode
    if distributed_mode and int(os.environ.get("LOCAL_RANK", 0)) != 0:
        logging.info(f"Rank {os.environ.get('RANK', '?')}: Disabling WandB")
        os.environ["WANDB_DISABLED"] = "true"
        report_to = "none"  # Explicitly set to none for non-main processes
    else:
        # Main process or non-distributed mode, attempt WandB initialization
        try:
            import wandb
            logging.info("Initializing WandB directly...")
            run_name = f"qwq-32b-finetune-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            if wandb.run is None:
                try:
                    wandb.init(
                        project=WANDB_PROJECT_NAME,
                        entity=WANDB_ENTITY,
                        name=run_name,
                        config={
                            "model_name": MODEL_ID,
                            "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
                            "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
                            "learning_rate": LEARNING_RATE,
                            "epochs": EPOCHS,
                            "sequence_length": MAX_SEQ_LENGTH,
                            "lora_r": LORA_R,
                            "lora_alpha": LORA_ALPHA,
                        }
                    )
                    logging.info(f"WandB initialized: {wandb.run.name} (ID: {wandb.run.id})")
                    report_to = "wandb"
                except Exception as e:
                    logging.error(f"WandB initialization error: {str(e)}")
                    report_to = "tensorboard"
            else:
                logging.info(f"Using existing WandB run: {wandb.run.name} (ID: {wandb.run.id})")
                report_to = "wandb"
        except ImportError:
            logging.warning("WandB package not installed. Reporting to TensorBoard.")
            report_to = "tensorboard"
        except Exception as wandb_init_e:
            logging.error(f"General WandB setup error: {wandb_init_e}")
            report_to = "tensorboard"
# If WandB reporting is disabled, set report_to accordingly
elif not distributed_mode:
    report_to = "tensorboard"
    logging.info("WandB reporting disabled. Reporting to TensorBoard.")
else:  # If WandB is disabled and it IS distributed
    report_to = "none"
    logging.info("WandB reporting disabled for this distributed rank.")

# Quantization (QLoRA)
USE_4BIT_QUANTIZATION = False  # Disable QLoRA due to FSDP incompatibility
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # Use bfloat16 if supported, else float16
BNB_4BIT_QUANT_TYPE = "nf4"

# --- Check Optional Dependencies (Define flags globally) ---
FLASH_ATTN_AVAILABLE = False
BETTER_TRANSFORMERS_AVAILABLE = False
try:
    import flash_attn
    FLASH_ATTN_AVAILABLE = True
    logging.info("Flash Attention available - will be used if enabled.")
except ImportError:
    logging.warning("Flash Attention not available. Install with 'pip install flash-attn'")

try:
    from optimum.bettertransformer import BetterTransformer
    BETTER_TRANSFORMERS_AVAILABLE = True
    logging.info("Better Transformers available - will be used if enabled.")
except ImportError:
    logging.warning("Better Transformers not available. Install with 'pip install optimum'")

# --- Check Dataset ---
if not os.path.exists(DATASET_PATH):
    logging.error(f"Dataset not found at {DATASET_PATH}. Run preprocess_data.py first.")
    exit(1)

logging.info(f"Loading dataset from {DATASET_PATH}...")

# Load dataset normally
dataset = load_from_disk(DATASET_PATH)

# Apply truncation if needed
if MAX_EXAMPLES > 0 and len(dataset) > MAX_EXAMPLES:
    logging.info(f"Truncating dataset to {MAX_EXAMPLES} examples")
    indices = list(range(min(MAX_EXAMPLES, len(dataset))))
    dataset = dataset.select(indices)

logging.info(f"Dataset loaded: {dataset} with {len(dataset)} examples")

# --- Tokenizer ---
logging.info(f"Loading tokenizer for {MODEL_ID}...")

# Enable fast tokenizer and optimizations
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,  # Explicitly request the fast Rust-based tokenizer
    trust_remote_code=True,
    # model_max_length=MAX_SEQ_LENGTH,
    padding_side="right",
)

# Log tokenizer type for verification
if hasattr(tokenizer, 'is_fast') and tokenizer.is_fast:
    logging.info(f"Successfully loaded fast tokenizer (Rust implementation): {type(tokenizer).__name__}")
    # Fast tokenizers are automatically parallel in dataset.map() when num_proc > 1
    logging.info(f"Fast tokenizer will use parallel processing during dataset.map() with {TOKENIZATION_NUM_WORKERS} workers")
else:
    logging.warning(f"Using Python tokenizer: {type(tokenizer).__name__}")
    logging.warning("Python tokenizers are slower than Rust-based fast tokenizers")

# Check and set pad token based on Qwen documentation (<|endoftext|>)
# Qwen models might have this set correctly, but we verify.
EXPECTED_PAD_TOKEN = "<|endoftext|>"
if tokenizer.pad_token is None or tokenizer.pad_token != EXPECTED_PAD_TOKEN:
    logging.warning(f"Tokenizer pad_token is missing or not '{EXPECTED_PAD_TOKEN}'. Setting pad_token='{EXPECTED_PAD_TOKEN}'.")
    tokenizer.pad_token = EXPECTED_PAD_TOKEN

# Enable padding and truncation defaults for batch processing
tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
tokenizer.padding_side = "right"  # Typically "right" for decoder-only models like Qwen

# Log tokenizer configuration
logging.info("Tokenizer configuration:")
logging.info(f"  - Type: {'Fast' if hasattr(tokenizer, 'is_fast') and tokenizer.is_fast else 'Python'}")
logging.info(f"  - Pad token: {tokenizer.pad_token}")
logging.info(f"  - EOS token: {tokenizer.eos_token}")  # Should be <|im_end|>
logging.info(f"  - Vocab size: {tokenizer.vocab_size}")
logging.info(f"  - Model max length: {tokenizer.model_max_length}")
logging.info(f"  - Padding side: {tokenizer.padding_side}")

# Define parallel preprocessing function for the dataset
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors=None,  # Return Python lists for dataset
    )

# Create a cache directory for tokenized datasets
TOKENIZED_DATASET_CACHE_DIR = os.path.join(os.path.dirname(DATASET_PATH), "tokenized_cache")
os.makedirs(TOKENIZED_DATASET_CACHE_DIR, exist_ok=True)
tokenized_dataset_path = os.path.join(TOKENIZED_DATASET_CACHE_DIR, "tokenized_dataset")

# Create a file to signal tokenization completion
tokenization_done_file = os.path.join(TOKENIZED_DATASET_CACHE_DIR, "tokenization_complete")

# Function to clean up temporary files in dataset directory
def delete_existing_tmp_files():
    """Find and delete any existing tmp files in dataset directory"""
    # Look for tmp files in dataset directory
    tmp_files = glob.glob(os.path.join(DATASET_PATH, "tmp*"))

    if tmp_files:
        logging.info(f"Found {len(tmp_files)} existing tmp files, removing...")
        for tmp_file in tmp_files:
            try:
                if os.path.isdir(tmp_file):
                    shutil.rmtree(tmp_file)
                else:
                    os.remove(tmp_file)
                logging.info(f"Removed: {tmp_file}")
            except Exception as e:
                logging.warning(f"Could not remove {tmp_file}: {str(e)}")
    else:
        logging.info("No existing tmp files found")

# Check if we're in distributed mode and get rank
if distributed_mode:
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    is_main_process = rank == 0
    logging.info(f"Rank {rank}/{world_size}: Preparing for dataset processing")
else:
    is_main_process = True
    rank = 0
    world_size = 1
    local_rank = 0

# Clean up temp files - only on main process to avoid conflicts
if is_main_process:
    delete_existing_tmp_files()
    # Also remove the tokenization_done_file if it exists
    if os.path.exists(tokenization_done_file):
        os.remove(tokenization_done_file)
        logging.info(f"Rank {rank}: Removed old tokenization completion marker")

# Only tokenize on main process (rank 0) to avoid redundant work
need_tokenization = False

# Check if tokenized dataset already exists
if os.path.exists(tokenized_dataset_path) and os.path.isdir(tokenized_dataset_path):
    # --- Dataset Exists ---
    logging.info(f"Rank {rank}: Found existing tokenized dataset at {tokenized_dataset_path}")
    path_to_load = tokenized_dataset_path  # All ranks will load from the persistent path
    need_tokenization = False

    # Rank 0 ensures completion marker exists
    if is_main_process and not os.path.exists(tokenization_done_file):
        total_original_examples = "unknown"
        try:
            from datasets import load_dataset_builder  # Local import
            original_dataset_info = load_dataset_builder(DATASET_PATH).info
            total_original_examples = original_dataset_info.splits['train'].num_examples
        except Exception as info_e:
            logging.warning(f"Rank {rank}: Could not get original dataset info: {info_e}")
        try:
            # Get size of existing loaded dataset (approximate if needed)
            # This requires loading a small part or metadata, might be slow
            # For now, let's just mark it as existing
            # loaded_size = len(load_from_disk(tokenized_dataset_path, keep_in_memory=False))
            loaded_size = "unknown (loaded existing)"
            with open(tokenization_done_file, "w") as f:
                f.write(f"Tokenization assumed complete (loaded existing) at {datetime.now().isoformat()}\n")
                f.write(f"Processed {loaded_size} examples out of {total_original_examples}\n")
            logging.info(f"Rank {rank}: Created tokenization completion marker as it was missing.")
        except Exception as file_e:
            logging.error(f"Rank {rank}: Failed to create missing completion marker: {file_e}")
            # Proceeding anyway, but other ranks might hang if they rely solely on the file

    # Non-main ranks still need to wait for the marker to be sure Rank 0 checked/created it
    elif not is_main_process:
        logging.info(f"Rank {rank}: Waiting for main process confirmation via marker file...")
        max_wait_time = 300  # Shorter wait, just confirming file exists
        wait_start = time.time()
        while not os.path.exists(tokenization_done_file):
            if time.time() - wait_start > max_wait_time:
                logging.error(f"Rank {rank}: Timed out waiting for marker file from Rank 0.")
                raise TimeoutError("Marker file wait timeout")
            time.sleep(5)
        logging.info(f"Rank {rank}: Marker file found.")

elif is_main_process:  # Tokenized doesn't exist, Rank 0 needs to create it
    logging.info(f"Rank {rank}: Tokenization required. Proceeding with tokenization...")
    need_tokenization = True
    path_to_load = None

elif distributed_mode:  # Tokenized doesn't exist, non-main ranks need to wait
    logging.info(f"Rank {rank}: Tokenization required. Waiting for main process...")
    need_tokenization = True
    path_to_load = tokenized_dataset_path

# --- Perform Tokenization (if needed by Rank 0) ---
if need_tokenization and is_main_process:
    tokenized_dataset_obj = None  # Use a distinct name for the object returned by map
    try:
        # Process the dataset using dataset.map with internal parallelism
        start_time = time.time()  # Define start_time here

        # Standard tokenization with caching enabled
        logging.info(f"Rank {rank}: Starting tokenization using dataset.map with {TOKENIZATION_NUM_WORKERS} workers.")

        tokenized_dataset_obj = dataset.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            num_proc=TOKENIZATION_NUM_WORKERS,
            remove_columns=["text"],
            load_from_cache_file=True,  # Allow using cache file if it exists
            desc=f"Tokenizing dataset ({TOKENIZATION_NUM_WORKERS} workers)"
        )

        elapsed = time.time() - start_time
        logging.info(f"Rank {rank}: Tokenization successful in {elapsed:.2f} seconds.")

        # If tokenization was successful:
        if tokenized_dataset_obj is not None:
            logging.info(f"Rank {rank}: Dataset tokenization completed.")

            # Save directly to final path
            logging.info(f"Rank {rank}: Saving tokenized dataset to {tokenized_dataset_path}...")
            save_start = time.time()

            # Ensure target directory doesn't exist (needed for clean save)
            if os.path.exists(tokenized_dataset_path):
                shutil.rmtree(tokenized_dataset_path)

            tokenized_dataset_obj.save_to_disk(tokenized_dataset_path)
            save_elapsed = time.time() - save_start
            logging.info(f"Rank {rank}: Tokenized dataset saved in {save_elapsed:.2f} seconds.")

            # Create completion marker file ONLY after successful save
            with open(tokenization_done_file, "w") as f:
                f.write(f"Tokenization completed and saved at {datetime.now().isoformat()}\n")
            logging.info(f"Rank {rank}: Created tokenization completion marker")

            # Keep the result in memory for Rank 0 for immediate use
            dataset = tokenized_dataset_obj
            path_to_load = None  # Rank 0 uses the in-memory object directly

    except Exception as e:
        logging.error(f"Rank {rank}: Tokenization failed: {e}")
        import traceback
        logging.error(traceback.format_exc())
        # Create done file indicating failure
        with open(tokenization_done_file, "w") as f:
            f.write(f"Tokenization FAILED at {datetime.now().isoformat()}\nError: {e}")
        raise RuntimeError("Tokenization failed.") from e

# --- Load Dataset (All Ranks) ---
# This block now runs for all ranks *after* rank 0 has either tokenized or copied data
dataset_for_trainer = None  # Use a distinct variable name for clarity
if path_to_load:  # If path_to_load is set (means rank 0 copied or non-main rank needs to load)
    if not is_main_process and need_tokenization:
        # Non-main ranks wait for the done file if tokenization was required
        logging.info(f"Rank {rank}: Waiting for tokenization completion signal (already checked for existence)...")
        # Wait logic already happened if we got here and path_to_load is set
        pass

    # All ranks with a path_to_load proceed to load
    logging.info(f"Rank {rank}: Loading dataset from {path_to_load}...")
    load_start_time = time.time()
    try:
        # Load without forcing into memory initially
        dataset_for_trainer = load_from_disk(path_to_load, keep_in_memory=False)
        load_elapsed = time.time() - load_start_time
        logging.info(f"Rank {rank}: Successfully loaded dataset in {load_elapsed:.2f}s. Length: {len(dataset_for_trainer)}")
    except Exception as e:
        logging.error(f"Rank {rank}: CRITICAL - Failed to load dataset from {path_to_load}: {e}")
        raise
elif is_main_process and not need_tokenization:
    # Rank 0 loaded existing, copied to RAM disk, and path_to_load points there
    # It still needs to load it for the trainer
    logging.info(f"Rank {rank}: Loading dataset from RAM disk copy {path_to_load}...")
    try:
        dataset_for_trainer = load_from_disk(path_to_load, keep_in_memory=False)
        logging.info(f"Rank {rank}: Successfully loaded dataset from RAM disk copy.")
    except Exception as e:
        logging.error(f"Rank {rank}: CRITICAL - Failed to load from RAM disk copy {path_to_load}: {e}")
        raise
elif is_main_process and need_tokenization:
    # Rank 0 just tokenized, 'dataset' variable already holds the result in memory
    logging.info(f"Rank {rank}: Using in-memory dataset from successful tokenization.")
    dataset_for_trainer = dataset  # Use the object directly
else:
    # Should not happen
    logging.error(f"Rank {rank}: Dataset path logic error. path_to_load='{path_to_load}', need_tokenization={need_tokenization}")
    raise RuntimeError("Dataset preparation failed - logic error.")

# At this point, 'dataset' on all ranks should hold the ready-to-use data.

# Synchronize processes after dataset is ready on all ranks
if distributed_mode:
    try:
        logging.info(f"Rank {rank}: Synchronizing after dataset preparation...")
        dist.barrier()
        logging.info(f"Rank {rank}: Synchronization complete.")
    except Exception as sync_e:
        logging.error(f"Rank {rank}: Synchronization after dataset prep failed: {sync_e}")
        raise

# --- Helper Function for Memory Check ---
def check_gpu_memory_utilization():
    """Check and report GPU memory utilization"""
    if not torch.cuda.is_available():
        logging.info("CUDA not available, skipping GPU memory check.")
        return 0  # Return 0 utilization if no GPU

    logging.info("==== GPU MEMORY UTILIZATION CHECK ====")
    total_allocated_gb = 0
    total_reserved_gb = 0
    total_capacity_gb = 0

    try:
        for i in range(torch.cuda.device_count()):
            free_mem, total_mem = torch.cuda.mem_get_info(i)
            allocated = torch.cuda.memory_allocated(i)
            reserved = torch.cuda.memory_reserved(i)

            free_gb = free_mem / (1024**3)
            total_gb = total_mem / (1024**3)
            allocated_gb = allocated / (1024**3)
            reserved_gb = reserved / (1024**3)
            utilized_pct = (1 - free_mem / total_mem) * 100 if total_mem > 0 else 0

            total_allocated_gb += allocated_gb
            total_reserved_gb += reserved_gb
            total_capacity_gb += total_gb

            logging.info(f"GPU {i}: Allocated {allocated_gb:.1f}GB, Reserved {reserved_gb:.1f}GB, "
                         f"Free {free_gb:.1f}GB, Total {total_gb:.1f}GB, "
                         f"Utilization: {utilized_pct:.1f}%")

        avg_utilization = (total_allocated_gb / total_capacity_gb) * 100 if total_capacity_gb > 0 else 0
        logging.info(f"OVERALL: Using {total_allocated_gb:.1f}GB / {total_capacity_gb:.1f}GB ({avg_utilization:.1f}% allocated)")
        logging.info("========================================")
        return avg_utilization
    except Exception as e:
        logging.error(f"Error checking GPU memory: {e}")
        return 0  # Return 0 on error

# --- Model Loading & Preparation (Runs on ALL ranks) ---
logging.info(f"Rank {rank}: Loading model: {MODEL_ID}...")

# 1. Load Model Configuration
config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
logging.info("Enabling YaRN scaling in model configuration.")
config.rope_scaling = {
    "type": "yarn",
    "factor": 4.0,
    "original_max_position_embeddings": 32768,
}
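# Rough arithmetic for the YaRN config above (assuming the standard rope_scaling
# convention for Qwen2-style models): the effective context window becomes
#   factor * original_max_position_embeddings = 4.0 * 32768 = 131072 tokens,
# while training batches themselves are still truncated to MAX_SEQ_LENGTH = 4096
# by the tokenizer settings earlier in this script.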
624
+
625
+ # Determine torch dtype
626
+ torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
627
+
628
+ # Set device_map based on distributed mode
629
+ # When using FSDP, device_map should typically be None or "auto", FSDP handles placement.
630
+ if USE_FSDP:
631
+ device_map = None
632
+ logging.info("FSDP enabled: Setting device_map=None")
633
+ elif distributed_mode:
634
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
635
+ device_map = {"": local_rank}
636
+ logging.info(f"Rank {rank}: DDP mode: Loading model on device {local_rank}")
637
+ else:
638
+ device_map = "auto"
639
+ logging.info("Rank {rank}: Single process mode: Using automatic device mapping")
640
+
641
+ # Configure Flash Attention and other optimizations
642
+ use_flash_attn = USE_FLASH_ATTN and FLASH_ATTN_AVAILABLE
643
+ attn_implementation = "flash_attention_2" if use_flash_attn else None
644
+
645
+ # Configure Quantization if enabled
646
+ # quantization_config = None
647
+ # if USE_4BIT_QUANTIZATION:
648
+ # logging.info("Configuring 4-bit quantization (QLoRA)...")
649
+ # compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)
650
+ # quantization_config = BitsAndBytesConfig(
651
+ # load_in_4bit=True,
652
+ # bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
653
+ # bnb_4bit_compute_dtype=compute_dtype,
654
+ # bnb_4bit_use_double_quant=True, # Qwen models often benefit from double quant
655
+ # )
656
+ # # Override torch_dtype when using quantization as recommended
657
+ # # torch_dtype = None
658
+ # logging.info(f"4-bit quantization config created: type={BNB_4BIT_QUANT_TYPE}, compute={BNB_4BIT_COMPUTE_DTYPE}")
659
+
660
+ # Configure model loading kwargs
661
+ model_load_kwargs = {
662
+ "config": config,
663
+ "device_map": device_map,
664
+ "low_cpu_mem_usage": True,
665
+ "trust_remote_code": True,
666
+ }
667
+ if use_flash_attn:
668
+ model_load_kwargs["attn_implementation"] = "flash_attention_2"
669
+ # if quantization_config:
670
+ # model_load_kwargs["quantization_config"] = quantization_config
671
+ # Always set torch_dtype when not using quantization
672
+ model_load_kwargs["torch_dtype"] = torch_dtype
673
+
674
+ # Log memory before loading
675
+ # ... (memory logging logic - keep as is) ...
676
+
677
+ # Load the model
678
+ model = None # Initialize model variable
679
+ try:
680
+ logging.info(f"Rank {rank}: Calling AutoModelForCausalLM.from_pretrained...")
681
+ model = AutoModelForCausalLM.from_pretrained(
682
+ MODEL_ID,
683
+ **model_load_kwargs
684
+ )
685
+ logging.info(f"Rank {rank}: Base model loaded successfully on device: {model.device if device_map is None else 'CPU/Multi'}")
686
+
687
+ # Ensure consistent dtype before FSDP wrapping (which happens in trainer.train)
688
+ if torch_dtype == torch.bfloat16:
689
+ logging.info("Explicitly casting model to bfloat16...")
690
+ model = model.to(torch.bfloat16)
691
+
692
+ # Apply Better Transformers optimization
693
+ use_better_transformers_flag = USE_BETTER_TRANSFORMERS and BETTER_TRANSFORMERS_AVAILABLE
694
+ if use_better_transformers_flag:
695
+ try:
696
+ logging.info("Applying BetterTransformer optimizations...")
697
+ model = BetterTransformer.transform(model)
698
+ logging.info("BetterTransformer optimizations applied successfully")
699
+ except Exception as bt_e:
700
+ logging.warning(f"Could not apply BetterTransformer optimizations: {str(bt_e)}")
701
+
702
+ # Apply activation checkpointing
703
+ if USE_ACTIVATION_CHECKPOINTING:
704
+ try:
705
+ logging.info("Enabling activation checkpointing...")
706
+ model.gradient_checkpointing_enable()
707
+ logging.info("Activation checkpointing enabled.")
708
+ except Exception as ac_e:
709
+ logging.warning(f"Could not enable activation checkpointing: {str(ac_e)}")
710
+
711
+ # Log model config and check memory utilization
712
+ logging.info(f"Rank {rank}: Model setup complete.")
713
+ check_gpu_memory_utilization() # This function needs to be defined or moved
714
+
715
+ except Exception as model_load_e: # Correct indentation for except
716
+ logging.error(f"Rank {rank}: Failed during model loading or preparation: {model_load_e}")
717
+ import traceback
718
+ logging.error(traceback.format_exc())
719
+ # Attempt to clean up distributed env before raising
720
+ if distributed_mode and dist.is_initialized():
721
+ try: dist.destroy_process_group()
722
+ except: pass
723
+ raise # Re-raise error
724
+
725
+ # --- LoRA Configuration ---
726
+ # ... (LoRA config - keep as is) ...
727
+ peft_config = LoraConfig(
728
+ r=LORA_R,
729
+ lora_alpha=LORA_ALPHA,
730
+ lora_dropout=LORA_DROPOUT,
731
+ target_modules=LORA_TARGET_MODULES,
732
+ bias="none",
733
+ task_type="CAUSAL_LM",
734
+ )
735
+
736
+ # --- Synchronize AFTER model loading & PEFT config ---
737
+ if distributed_mode:
738
+ try:
739
+ logging.info(f"Rank {rank}: Synchronizing after model loading...")
740
+ dist.barrier()
741
+ logging.info(f"Rank {rank}: Synchronization after model loading complete.")
742
+ except Exception as sync_e:
743
+ logging.error(f"Rank {rank}: Synchronization after model loading failed: {sync_e}")
744
+ raise
745
+
746
+ # --- Define Training Arguments ---
747
+ # (Determine determined_run_name logic here as before)
748
+ determined_run_name = None
749
+ if REPORT_TO_WANDB and is_main_process:
750
+ try:
751
+ import wandb
752
+ if wandb.run is not None: determined_run_name = wandb.run.name
753
+ except Exception: pass # Ignore errors here, handled by report_to
754
+
755
+ base_training_args = {
756
+ # ... (all base args, including max_seq_length) ...
757
+ "output_dir": OUTPUT_DIR,
758
+ "per_device_train_batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
759
+ "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
760
+ "optim": OPTIMIZER,
761
+ "save_steps": SAVE_STEPS,
762
+ "logging_steps": LOGGING_STEPS,
763
+ "learning_rate": LEARNING_RATE,
764
+ "num_train_epochs": EPOCHS,
765
+ "max_steps": -1,
766
+ "fp16": False,
767
+ "bf16": torch_dtype == torch.bfloat16, # Use previously determined dtype
768
+ "max_grad_norm": 0.3,
769
+ "warmup_ratio": WARMUP_RATIO,
770
+ "group_by_length": False, # Explicitly disable to prevent pre-computation hang
771
+ "lr_scheduler_type": LR_SCHEDULER_TYPE,
772
+ "report_to": report_to,
773
+ "save_total_limit": 3,
774
+ "logging_first_step": True,
775
+ **({"run_name": determined_run_name} if determined_run_name is not None else {}),
776
+ "fsdp": "full_shard" if USE_FSDP else "", # Pass FSDP strategy string (removed offload)
777
+ "fsdp_config": FSDP_CONFIG if USE_FSDP else {}, # Pass FSDP config dict
778
+ "dataloader_num_workers": DATALOADER_NUM_WORKERS,
779
+ "resume_from_checkpoint": "auto",
780
+ "save_strategy": "steps",
781
+ "load_best_model_at_end": False,
782
+ "metric_for_best_model": None,
783
+ "dataset_text_field": "text",
784
+ "packing": False,
785
+ "max_seq_length": MAX_SEQ_LENGTH,
786
+ # Memory/Performance Optimizations
787
+ "gradient_checkpointing_kwargs": {"use_reentrant": False}, # More stable checkpointing for FSDP activation checkpointing
788
+ "ddp_find_unused_parameters": False, # Should be False for FSDP
789
+ "tf32": True, # Enable TF32 for faster compute on compatible GPUs
790
+ }
791
+ training_arguments = SFTConfig(**base_training_args)
792
+ logging.info(f"Rank {rank}: Training arguments (SFTConfig) created.")
793
+
794
+ # --- Define Callbacks ---
795
+
796
+ # Create memory monitoring callback
797
+ class MemoryMonitorCallback(TrainerCallback):
798
+ def on_step_end(self, args, state, control, **kwargs):
799
+ if state.global_step % 10 == 0: # Log every 10 steps
800
+ if torch.cuda.is_available():
801
+ gc.collect()
802
+ torch.cuda.empty_cache()
803
+ rank = int(os.environ.get("RANK", 0))
804
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
805
+ try:
806
+ free_mem, total_mem = torch.cuda.mem_get_info(local_rank)
807
+ free_gb = free_mem / (1024**3)
808
+ used_gb = (total_mem - free_mem) / (1024**3)
809
+ total_gb = total_mem / (1024**3)
810
+ reserved = torch.cuda.memory_reserved(local_rank) / (1024**3)
811
+ allocated = torch.cuda.memory_allocated(local_rank) / (1024**3)
812
+ logging.info(f"Rank {rank}: Memory at step {state.global_step}: "
813
+ f"{free_gb:.1f}GB free, {used_gb:.1f}GB used, {total_gb:.1f}GB total, "
814
+ f"{reserved:.1f}GB reserved, {allocated:.1f}GB allocated")
815
+ except Exception as mem_e:
816
+ logging.warning(f"Rank {rank}: Could not get memory info: {mem_e}")
817
+ return control
818
+
819
+ memory_monitor = MemoryMonitorCallback()
820
+
821
+ # Create a special first step callback with WandB support
822
+ class FirstStepCallback(TrainerCallback):
823
+ def __init__(self):
824
+ self.first_step_start_time = None
825
+ self.progress_indicators = 0
826
+ self.update_interval = 60 # Check every minute
827
+ self.last_update_time = time.time()
828
+
829
+ def on_step_begin(self, args, state, control, **kwargs):
830
+ if state.global_step == 0:
831
+ self.first_step_start_time = time.time()
832
+ logging.info(f"FIRST STEP STARTING at {datetime.now().strftime('%H:%M:%S')}")
833
+ if REPORT_TO_WANDB and 'wandb' in sys.modules:
834
+ try:
835
+ import wandb # Import locally
836
+ if wandb.run:
837
+ wandb.log({"training_status": "first_step_started"})
838
+ except Exception as log_e: logging.warning(f"Wandb log error: {log_e}")
839
+ return control
840
+
841
+ def on_step_end(self, args, state, control, **kwargs):
842
+ if state.global_step == 0:
843
+ if self.first_step_start_time is None: # Should not happen, but safeguard
844
+ logging.warning("First step ended but start time was not recorded.")
845
+ return control
846
+ duration = time.time() - self.first_step_start_time
847
+ logging.info(f"FIRST STEP COMPLETED at {datetime.now().strftime('%H:%M:%S')} (took {duration:.2f} seconds)")
848
+ if REPORT_TO_WANDB and 'wandb' in sys.modules:
849
+ try:
850
+ import wandb # Import locally
851
+ if wandb.run:
852
+ wandb.log({
853
+ "training_status": "first_step_completed",
854
+ "first_step_duration": duration
855
+ })
856
+ except Exception as log_e: logging.warning(f"Wandb log error: {log_e}")
857
+ return control
858
+
859
+ def on_substep_end(self, args, state, control, **kwargs):
860
+ # This tracks progress within a step (during gradient accumulation)
861
+ current_time = time.time()
862
+ # Only report for the first step/substep and only from rank 0
863
+ if (self.first_step_start_time is not None and
864
+ state.global_step == 0 and
865
+ current_time - self.last_update_time >= self.update_interval and
866
+ (not distributed_mode or int(os.environ.get("LOCAL_RANK", 0)) == 0)):
867
+ self.progress_indicators += 1
868
+ elapsed = current_time - self.first_step_start_time
869
+ logging.info(f"First step still in progress... ({elapsed:.1f}s elapsed, progress indicator {self.progress_indicators})")
870
+ if REPORT_TO_WANDB and 'wandb' in sys.modules:
871
+ try:
872
+ import wandb # Import locally
873
+ if wandb.run:
874
+ wandb.log({
875
+ "training_status": "first_step_in_progress",
876
+ "first_step_elapsed": elapsed,
877
+ "progress_indicator": self.progress_indicators
878
+ })
879
+ except Exception as log_e: logging.warning(f"Wandb log error: {log_e}")
880
+ self.last_update_time = current_time
881
+ return control
882
+
883
+ first_step_callback = FirstStepCallback()
884
+
885
+ # Add WandB logging callback if WandB is enabled
886
+ wandb_callback = None # Initialize
887
+ if REPORT_TO_WANDB and 'wandb' in sys.modules and (not distributed_mode or int(os.environ.get("LOCAL_RANK", 0)) == 0):
888
+ try:
889
+ # **** FULL WandBLoggingCallback Class Definition ****
890
+ class WandBLoggingCallback(TrainerCallback):
891
+ """Logs comprehensive training metrics and progress to Weights & Biases"""
892
+
893
+ def __init__(self):
894
+ self.training_start_time = None
895
+ self.last_log_time = None
896
+ self.total_steps = None
897
+ self.samples_seen = 0
898
+ self.tokens_seen = 0
899
+ self.current_epoch = 0
900
+ self.epoch_start_time = None
901
+ self.step_history = [] # For tracking steps/second
902
+ self.global_tokens_per_second = 0
903
+ self.progress_table = None # Initialize table to None
904
+
905
+ def on_train_begin(self, args, state, control, **kwargs):
906
+ """Log hyperparameters and initialize tracking at the start of training"""
907
+ if not (REPORT_TO_WANDB and 'wandb' in sys.modules): return # Check if WandB should be used
908
+
909
+ try:
910
+ import wandb # Import locally
911
+ if not wandb.run:
912
+ logging.warning("WandBCallback: Wandb not initialized in on_train_begin.")
913
+ return
914
+ except ImportError:
915
+ logging.warning("WandBCallback: wandb not imported, cannot log on_train_begin")
916
+ return
917
+
918
+ self.training_start_time = time.time()
919
+ self.epoch_start_time = time.time()
920
+ self.last_log_time = time.time()
921
+
922
+ # Calculate total expected steps
923
+ if args.max_steps > 0:
924
+ self.total_steps = args.max_steps
925
+ else:
926
+ # Use trainer passed in kwargs if available (prioritize 'trainer' key)
927
+ trainer_instance = kwargs.get('trainer', None)
928
+ if trainer_instance is None:
929
+ trainer_instance = kwargs.get('model', None) # Fallback to 'model' key
930
+
931
+ dataset_length = 0
932
+ if trainer_instance and hasattr(trainer_instance, 'train_dataset') and trainer_instance.train_dataset is not None:
933
+ try:
934
+ dataset_length = len(trainer_instance.train_dataset)
935
+ except Exception as len_e:
936
+ logging.warning(f"WandBCallback: Error getting dataset length: {len_e}")
937
+ else:
938
+ logging.warning("WandBCallback: Could not access train_dataset length during on_train_begin.")
939
+
940
+ batch_size = args.per_device_train_batch_size
941
+ accumulation = args.gradient_accumulation_steps
942
+ world_size = int(os.environ.get("WORLD_SIZE", 1))
943
+ global_batch_denom = (batch_size * world_size * accumulation)
944
+ if dataset_length > 0 and global_batch_denom > 0:
945
+ self.total_steps = (dataset_length // global_batch_denom) * args.num_train_epochs
946
+ else:
947
+ self.total_steps = -1 # Indicate unknown total steps
948
+
949
+ # Log key hyperparameters
950
+ config = {
951
+ "model_name": MODEL_ID,
952
+ "lora_r": LORA_R,
953
+ "lora_alpha": LORA_ALPHA,
954
+ "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
955
+ "grad_accum": GRADIENT_ACCUMULATION_STEPS,
956
+ "effective_batch": PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
957
+ "global_batch": PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS * world_size,
958
+ "learning_rate": LEARNING_RATE,
959
+ "seq_length": MAX_SEQ_LENGTH,
960
+ "epochs": EPOCHS,
961
+ "total_steps_estimated": self.total_steps,
962
+ "optimizer": OPTIMIZER,
963
+ "warmup_ratio": WARMUP_RATIO,
964
+ "scheduler": LR_SCHEDULER_TYPE,
965
+ }
966
+ wandb.config.update(config)
967
+
968
+ # Initialize training progress table
969
+ columns = ["step", "epoch", "loss", "lr", "tokens/sec", "eta", "elapsed_hrs"]
970
+ self.progress_table = wandb.Table(columns=columns)
971
+
972
+ # Log training start
973
+ wandb.log({"training_status": "started"})
974
+ logging.info(f"Training started - total estimated steps: {self.total_steps}")
975
+
976
+ def on_log(self, args, state, control, logs=None, **kwargs):
977
+ """Log detailed metrics and progress after each logging step"""
978
+ if not (REPORT_TO_WANDB and 'wandb' in sys.modules): return # Check if WandB should be used
979
+
980
+ try:
981
+ import wandb # Import locally
982
+ if not wandb.run:
983
+ logging.warning("WandBCallback: Wandb run not active during on_log.")
984
+ return
985
+ except ImportError:
986
+ logging.warning("WandBCallback: wandb not imported, cannot log on_log")
987
+ return
988
+
989
+ if not logs:
990
+ return
991
+
992
+ # Format metrics for logging
993
+ metrics = {}
994
+ for k, v in logs.items():
995
+ if isinstance(v, (int, float)):
996
+ metrics[k] = v
997
+ elif hasattr(v, "item"): # Handle tensors
998
+ try: metrics[k] = v.item()
999
+ except Exception: pass
1000
+
1001
+ if not metrics:
1002
+ return
1003
+
1004
+ # Calculate time-based metrics
1005
+ current_time = time.time()
1006
+ if self.training_start_time is None: self.training_start_time = current_time # Safeguard
1007
+ elapsed_time = current_time - self.training_start_time
1008
+ elapsed_hrs = elapsed_time / 3600
1009
+
1010
+ # Estimate tokens processed
1011
+ batch_size = args.per_device_train_batch_size
1012
+ grad_accum = args.gradient_accumulation_steps
1013
+ world_size = int(os.environ.get("WORLD_SIZE", 1))
1014
+ global_batch_size = batch_size * grad_accum * world_size
1015
+ tokens_per_step = global_batch_size * MAX_SEQ_LENGTH # Use MAX_SEQ_LENGTH from outer scope
1016
+
1017
+ # Update tokens seen
1018
+ steps_since_last = state.global_step - (self.step_history[-1][0] if self.step_history else -1)
1019
+ if steps_since_last <= 0: steps_since_last = 1 # Avoid issues on first log
1020
+ new_tokens = tokens_per_step * steps_since_last
1021
+ self.tokens_seen += new_tokens
1022
+
1023
+ # Calculate throughput
1024
+ time_since_last = current_time - (self.last_log_time if self.last_log_time else current_time)
1025
+ if time_since_last <= 0: time_since_last = 1.0 # Avoid division by zero
1026
+ tokens_per_second = new_tokens / time_since_last
1027
+
1028
+ # Update rolling average of tokens/sec
1029
+ alpha = 0.1
1030
+ self.global_tokens_per_second = alpha * tokens_per_second + (1 - alpha) * self.global_tokens_per_second
1031
+
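+ # Note (sketch of the estimate): tokens_per_step assumes every sequence fills
+ # MAX_SEQ_LENGTH (per_device_batch * grad_accum * world_size * MAX_SEQ_LENGTH),
+ # so the reported tokens/sec is an upper bound when sequences are shorter.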
1032
+ # Track epoch progress
1033
+ if "epoch" in metrics:
1034
+ new_epoch = int(metrics["epoch"])
1035
+ if new_epoch > self.current_epoch:
1036
+ epoch_time = current_time - (self.epoch_start_time if self.epoch_start_time else current_time)
1037
+ self.epoch_start_time = current_time
1038
+ self.current_epoch = new_epoch
1039
+ wandb.log({"epoch/duration_sec": epoch_time}, step=state.global_step)
1040
+ logging.info(f"Epoch {self.current_epoch-1} completed in {epoch_time:.2f} seconds")
1041
+
1042
+ epoch_float = metrics["epoch"]
1043
+ epoch_progress = epoch_float - int(epoch_float)
1044
+ metrics["epoch_progress"] = epoch_progress * 100
1045
+
1046
+ # Estimate time remaining
1047
+ eta_hours = float('nan')
1048
+ if self.total_steps and self.total_steps > 0 and state.global_step > 0:
1049
+ progress_fraction = state.global_step / self.total_steps
1050
+ if progress_fraction > 1e-6: # Avoid division by zero early on
1051
+ eta_seconds = elapsed_time / progress_fraction - elapsed_time
1052
+ eta_hours = eta_seconds / 3600
1053
+ metrics["eta_hours"] = eta_hours
1054
+
1055
+ # Add additional calculated metrics
1056
+ metrics.update({
1057
+ "progress/elapsed_hours": elapsed_hrs,
1058
+ "progress/tokens_total": self.tokens_seen,
1059
+ "performance/tokens_per_second": tokens_per_second,
1060
+ "performance/tokens_per_second_avg": self.global_tokens_per_second,
1061
+ "performance/global_batch_size": global_batch_size,
1062
+ })
1063
+
1064
+ # Add GPU utilization if available
1065
+ if torch.cuda.is_available():
1066
+ try:
1067
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
1068
+ # Note: torch.cuda.utilization might not be available/reliable
1069
+ # metrics["gpu/utilization"] = torch.cuda.utilization(local_rank)
1070
+ metrics["gpu/memory_allocated_gb"] = torch.cuda.memory_allocated(local_rank) / 1e9
1071
+ metrics["gpu/memory_reserved_gb"] = torch.cuda.memory_reserved(local_rank) / 1e9
1072
+ except Exception as gpu_e:
1073
+ logging.debug(f"Could not log GPU metrics: {gpu_e}")
1074
+
1075
+ # Log all metrics to wandb
1076
+ wandb.log(metrics, step=state.global_step)
1077
+
1078
+ # Add row to progress table
1079
+ if self.progress_table is not None:
1080
+ loss_val = metrics.get("loss", float("nan"))
1081
+ lr_val = metrics.get("learning_rate", float("nan"))
1082
+ epoch_val = metrics.get("epoch", 0)
1083
+ tokens_sec = metrics.get("performance/tokens_per_second_avg", 0)
1084
+
1085
+ self.progress_table.add_data(
1086
+ state.global_step,
1087
+ f"{epoch_val:.2f}",
1088
+ f"{loss_val:.4f}",
1089
+ f"{lr_val:.2e}",
1090
+ f"{tokens_sec:.1f}",
1091
+ f"{eta_hours:.1f} hrs",
1092
+ f"{elapsed_hrs:.1f} hrs"
1093
+ )
1094
+ # Log the updated progress table (might be verbose, consider less frequent logging)
1095
+ # wandb.log({"training_progress": self.progress_table}, step=state.global_step)
1096
+
1097
+ # Print concise metrics to console
1098
+ log_info = (
1099
+ f"Step {state.global_step}"
1100
+ + (f"/{self.total_steps} ({100 * state.global_step / self.total_steps:.1f}%)" if self.total_steps and self.total_steps > 0 else "")
1101
+ + f" | Loss: {loss_val:.4f} | LR: {lr_val:.2e} | Epoch: {epoch_val:.2f}"
1102
+ + f" | Tokens/sec: {tokens_sec:.1f}"
1103
+ + (f" | ETA: {eta_hours:.1f}h" if not math.isnan(eta_hours) else "")
1104
+ )
1105
+ logging.info(log_info)
1106
+
1107
+ # Update time tracking
1108
+ self.last_log_time = current_time
1109
+ self.step_history.append((state.global_step, current_time))
1110
+ if len(self.step_history) > 100: # Keep only recent history
1111
+ self.step_history = self.step_history[-100:]
1112
+
1113
+ def on_train_end(self, args, state, control, **kwargs):
1114
+ """Log final statistics at the end of training"""
1115
+ if not (REPORT_TO_WANDB and 'wandb' in sys.modules): return # Check if WandB should be used
1116
+
1117
+ try:
1118
+ import wandb # Import locally
1119
+ if not wandb.run:
1120
+ logging.warning("WandBCallback: Wandb run not active during on_train_end.")
1121
+ return
1122
+ except ImportError:
1123
+ logging.warning("WandBCallback: wandb not imported, cannot log on_train_end")
1124
+ return
1125
+
1126
+ total_time = time.time() - (self.training_start_time if self.training_start_time else time.time())
1127
+ hours = total_time / 3600
1128
+
1129
+ final_stats = {
1130
+ "training_status": "completed",
1131
+ "total_steps_completed": state.global_step,
1132
+ "total_epochs_completed": self.current_epoch,
1133
+ "total_training_time_hours": hours,
1134
+ "total_tokens_processed": self.tokens_seen,
1135
+ "average_tokens_per_second": self.tokens_seen / total_time if total_time > 0 else 0
1136
+ }
1137
+ wandb.log(final_stats, step=state.global_step) # Log at final step
1138
+
1139
+ wandb.run.summary.update({
1140
+ "training_duration_hours": hours,
1141
+ "total_steps": state.global_step,
1142
+ "total_epochs": self.current_epoch,
1143
+ "total_tokens": self.tokens_seen
1144
+ })
1145
+ logging.info(f"Training complete - {hours:.2f} hours, {state.global_step} steps, {self.tokens_seen:,} tokens processed")
1146
+ # **** End of WandBLoggingCallback Definition ****
1147
+
1148
+ # Create callback instance
1149
+ wandb_callback = WandBLoggingCallback()
1150
+ logging.info("Enhanced WandB logging callback created")
1151
+ except Exception as e:
1152
+ logging.error(f"Error creating WandB callback: {str(e)}")
1153
+ wandb_callback = None
1154
+
1155
+ # Create the list of callbacks
1156
+ trainer_callbacks = [memory_monitor, first_step_callback] # Use the instance names
1157
+ if wandb_callback:
1158
+ trainer_callbacks.append(wandb_callback)
1159
+ logging.info("Added WandB callback to trainer")
1160
+ # trainer_callbacks = [] # Temporarily disable all callbacks
1161
+
1162
+ # --- Initialize Trainer ---
1163
+ logging.info(f"Rank {rank}: Initializing SFTTrainer...")
1164
+
1165
+ trainer = None
1166
+ try:
1167
+ trainer = SFTTrainer(
1168
+ model=model,
1169
+ # Using processing_class as per user confirmation
1170
+ processing_class=tokenizer,
1171
+ args=training_arguments,
1172
+ train_dataset=dataset_for_trainer,
1173
+ peft_config=peft_config,
1174
+ # Ensure this matches whether the collator is defined/needed
1175
+ preprocess_logits_for_metrics=None,
1176
+ callbacks=trainer_callbacks, # Pass the list here
1177
+ )
1178
+ logging.info(f"Rank {rank}: SFTTrainer initialized successfully.")
1179
+ except Exception as e:
1180
+ logging.error(f"Rank {rank}: Error initializing SFTTrainer: {e}")
1181
+ import traceback
1182
+ logging.error(traceback.format_exc())
1183
+ if distributed_mode and dist.is_initialized():
1184
+ try: dist.destroy_process_group()
1185
+ except Exception: pass
1186
+ raise
1187
+
1188
+ # --- Train ---
1189
+ if trainer is not None:
1190
+ logging.info(f"Beginning trainer.train() call at {datetime.now().strftime('%H:%M:%S')}")
1191
+ try:
1192
+ trainer.train()
1193
+ logging.info(f"Training finished successfully at {datetime.now().strftime('%H:%M:%S')}")
1194
+ except Exception as e:
1195
+ logging.error(f"Exception during training: {e}")
1196
+ import traceback
1197
+ logging.error(traceback.format_exc())
1198
+ if distributed_mode and dist.is_initialized():
1199
+ try:
1200
+ dist.destroy_process_group()
1201
+ logging.info("Destroyed process group after training error")
1202
+ except Exception:
1203
+ pass
1204
+ raise
1205
+
1206
+ # --- Merge Model and Save Full Model ---
1207
+ logging.info("Merging adapter weights into base model...")
1208
+
1209
+ # Clear some memory first if needed (especially if not using massive GPUs)
1210
+ # del model
1211
+ # del trainer
1212
+ # torch.cuda.empty_cache()
1213
+
1214
+ # Reload the base model (consider lower precision to save VRAM during merge)
1215
+ logging.info(f"Reloading base model ({MODEL_ID}) for merging...")
1216
+ base_model = AutoModelForCausalLM.from_pretrained(
1217
+ MODEL_ID,
1218
+ config=config, # Ensure YaRN config is used if applied during training
1219
+ torch_dtype=torch.bfloat16, # Or torch.float16, adjust as needed
1220
+ low_cpu_mem_usage=True, # Helps with large models
1221
+ trust_remote_code=True,
1222
+ device_map=None, # Load onto CPU first to potentially save GPU VRAM if needed
1223
+ attn_implementation="flash_attention_2"
1224
+ )
1225
+
1226
+ # Load the PEFT model with adapters
1227
+ logging.info(f"Loading PEFT model from {OUTPUT_DIR}...")
1228
+ merged_model = PeftModel.from_pretrained(
1229
+ base_model,
1230
+ OUTPUT_DIR,
1231
+ device_map=None, # Load onto CPU first
1232
+ )
1233
+
1234
+ # Merge the adapter weights
1235
+ logging.info("Merging LoRA weights...")
1236
+ merged_model = merged_model.merge_and_unload()
1237
+ logging.info("LoRA weights merged.")
1238
+
1239
+ # Define path for the full model save
1240
+ full_model_save_path = os.path.join(OUTPUT_DIR, "final_merged_model")
1241
+
1242
+ # Save the merged model
1243
+ logging.info(f"Saving merged model to {full_model_save_path}...")
1244
+ merged_model.save_pretrained(full_model_save_path)
1245
+ logging.info("Merged model saved.")
1246
+
1247
+ # Save the tokenizer associated with the merged model
1248
+ logging.info(f"Saving tokenizer to {full_model_save_path}...")
1249
+ tokenizer.save_pretrained(full_model_save_path)
1250
+ logging.info("Tokenizer saved.")
1251
+
1252
+ logging.info(f"Fine-tuning and merging process complete. Full model saved to {full_model_save_path}")
1253
+
1254
+ # --- Notes on Inference and Resuming Training ---
1255
+ logging.info("Training Checkpoint Notes:")
1256
+ logging.info(f" • Checkpoints saved to: {OUTPUT_DIR}")
1257
+ logging.info(f" • To resume training from the latest checkpoint, just rerun this script")
1258
+ logging.info(f" (resume_from_checkpoint='auto' will automatically find the latest checkpoint)")
1259
+ logging.info(f" • To resume from a specific checkpoint, set resume_from_checkpoint='path/to/checkpoint'")
1260
+
1261
+ # --- Notes on Inference ---
1262
+ # To use the trained adapters:
1263
+ # from peft import PeftModel
1264
+ # base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, ...)
1265
+ # model = PeftModel.from_pretrained(base_model, final_adapter_path)
1266
+ # model = model.merge_and_unload() # Optional: merge adapters for faster inference
1267
+ # Then use model and tokenizer for generation.
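+ # A minimal generation sketch (assumes the merged model and tokenizer above):
+ # inputs = tokenizer("Hello!", return_tensors="pt").to(model.device)
+ # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))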
layer_influence.py ADDED
@@ -0,0 +1,375 @@
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Measure per-layer causal influence via gating and/or swapping.
4
+ # Mapping fix: correctly map composite layer indices to donor indices.
5
+
6
+ import argparse
7
+ import math
8
+ from contextlib import contextmanager
9
+ from dataclasses import dataclass
10
+ from typing import Dict, List, Optional, Tuple
11
+
12
+ import torch
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+
15
+
16
+ def parse_layers(spec: str) -> List[int]:
17
+ out: List[int] = []
18
+ for chunk in spec.split(","):
19
+ chunk = chunk.strip()
20
+ if not chunk:
21
+ continue
22
+ if "-" in chunk:
23
+ a, b = chunk.split("-")
24
+ a, b = int(a), int(b)
25
+ out.extend(list(range(a, b + 1)))
26
+ else:
27
+ out.append(int(chunk))
28
+ out = sorted(list({x for x in out}))
29
+ return out
30
+
31
+
32
+ def load_lines(
33
+ prompts: Optional[str], prompts_file: Optional[str]
34
+ ) -> List[str]:
35
+ lines: List[str] = []
36
+ if prompts_file:
37
+ with open(prompts_file, "r", encoding="utf-8") as f:
38
+ for line in f:
39
+ s = line.strip("\n")
40
+ if s:
41
+ lines.append(s)
42
+ if prompts:
43
+ for s in prompts.split("\n"):
44
+ s = s.strip()
45
+ if s:
46
+ lines.append(s)
47
+ if not lines:
48
+ lines = [
49
+ "You are a helpful assistant. Say hi in one sentence.",
50
+ "Explain transformers in 2-3 sentences.",
51
+ "Summarize the benefits of bfloat16 training.",
52
+ ]
53
+ return lines
54
+
55
+
56
+ def get_embed_device(model: torch.nn.Module) -> torch.device:
57
+ return model.get_input_embeddings().weight.device
58
+
59
+
60
+ @torch.inference_mode()
61
+ def dataset_nll(
62
+ model: AutoModelForCausalLM,
63
+ tok: AutoTokenizer,
64
+ texts: List[str],
65
+ max_length: int = 512,
66
+ batch_size: int = 4,
67
+ input_device: Optional[torch.device] = None,
68
+ ) -> Tuple[float, int]:
69
+ if input_device is None:
70
+ input_device = get_embed_device(model)
71
+
72
+ total_nll = 0.0
73
+ total_tokens = 0
74
+
75
+ i = 0
76
+ while i < len(texts):
77
+ batch = texts[i : i + batch_size]
78
+ i += batch_size
79
+
80
+ enc = tok(
81
+ batch,
82
+ return_tensors="pt",
83
+ padding=True,
84
+ truncation=True,
85
+ max_length=max_length,
86
+ )
87
+ for k in enc:
88
+ enc[k] = enc[k].to(input_device)
89
+
90
+ input_ids = enc["input_ids"]
91
+ attention_mask = enc["attention_mask"]
92
+ labels = input_ids.clone()
93
+ labels[labels == tok.pad_token_id] = -100
94
+
95
+ out = model(
96
+ input_ids=input_ids,
97
+ attention_mask=attention_mask,
98
+ labels=labels,
99
+ use_cache=False,
100
+ )
101
+ loss = out.loss
102
+ valid = labels.ne(-100)
103
+ n_tokens = int(valid.sum().item())
104
+ total_nll += float(loss.item()) * n_tokens
105
+ total_tokens += n_tokens
106
+
107
+ return total_nll, total_tokens
108
+
109
+
110
+ def ppl_from_nll(total_nll: float, total_tokens: int) -> float:
111
+ if total_tokens == 0:
112
+ return float("inf")
113
+ return float(math.exp(total_nll / total_tokens))
114
+
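+ # Worked example (illustrative numbers): total_nll=693.15 over total_tokens=100
+ # gives exp(6.9315) ≈ 1024, i.e. an average per-token perplexity of about 1024.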
115
+
116
+ @dataclass
117
+ class GateSpec:
118
+ layer: int
119
+ attn_scale: float = 0.0
120
+ mlp_scale: float = 0.0
121
+
122
+
123
+ @contextmanager
124
+ def gate_layer(model: AutoModelForCausalLM, spec: GateSpec):
125
+ """
126
+ Temporarily scale a layer's residual contribution by scaling:
127
+ - self_attn.o_proj.weight by attn_scale
128
+ - mlp.down_proj.weight by mlp_scale
129
+ Using 0.0 disables that sublayer's residual addition.
130
+ """
131
+ backups: List[Tuple[torch.nn.Parameter, torch.Tensor]] = []
132
+
133
+ def scale_param(p: torch.nn.Parameter, s: float):
134
+ backups.append((p, p.data.detach().clone()))
135
+ p.data.mul_(s)
136
+
137
+ layer = model.model.layers[spec.layer] # type: ignore[attr-defined]
138
+
139
+ try:
140
+ if hasattr(layer.self_attn, "o_proj"):
141
+ scale_param(layer.self_attn.o_proj.weight, spec.attn_scale)
142
+ else:
143
+ raise AttributeError("No o_proj in self_attn")
144
+
145
+ if hasattr(layer.mlp, "down_proj"):
146
+ scale_param(layer.mlp.down_proj.weight, spec.mlp_scale)
147
+ else:
148
+ raise AttributeError("No down_proj in mlp")
149
+
150
+ yield
151
+ finally:
152
+ for p, old in backups:
153
+ p.data.copy_(old)
154
+ backups.clear()
155
+
156
+
157
+ @contextmanager
158
+ def swap_layer_from_donor(
159
+ model_dst: AutoModelForCausalLM,
160
+ model_src: AutoModelForCausalLM,
161
+ dst_layer_idx: int,
162
+ src_layer_idx: int,
163
+ ):
164
+ """
165
+ Temporarily copy all parameters/buffers for dst_layer_idx from
166
+ model_src's src_layer_idx, then restore.
167
+ """
168
+ dst_prefix = f"model.layers.{dst_layer_idx}."
169
+ src_prefix = f"model.layers.{src_layer_idx}."
170
+
171
+ src_named_params = dict(model_src.named_parameters())
172
+ dst_named_params = dict(model_dst.named_parameters())
173
+ src_named_bufs = dict(model_src.named_buffers())
174
+ dst_named_bufs = dict(model_dst.named_buffers())
175
+
176
+ src_params_by_suffix: Dict[str, torch.Tensor] = {}
177
+ for name, p in src_named_params.items():
178
+ if name.startswith(src_prefix):
179
+ suffix = name[len(src_prefix) :]
180
+ src_params_by_suffix[suffix] = p
181
+
182
+ src_bufs_by_suffix: Dict[str, torch.Tensor] = {}
183
+ for name, b in src_named_bufs.items():
184
+ if name.startswith(src_prefix):
185
+ suffix = name[len(src_prefix) :]
186
+ src_bufs_by_suffix[suffix] = b
187
+
188
+ backups_p: List[Tuple[str, torch.Tensor]] = []
189
+ backups_b: List[Tuple[str, torch.Tensor]] = []
190
+
191
+ try:
192
+ with torch.no_grad():
193
+ for name, p_dst in list(dst_named_params.items()):
194
+ if not name.startswith(dst_prefix):
195
+ continue
196
+ suffix = name[len(dst_prefix) :]
197
+ if suffix not in src_params_by_suffix:
198
+ continue
199
+ p_src = src_params_by_suffix[suffix]
200
+ backups_p.append((name, p_dst.data.detach().clone()))
201
+ p_dst.data.copy_(
202
+ p_src.data.to(device=p_dst.device, dtype=p_dst.dtype)
203
+ )
204
+ for name, b_dst in list(dst_named_bufs.items()):
205
+ if not name.startswith(dst_prefix):
206
+ continue
207
+ suffix = name[len(dst_prefix) :]
208
+ if suffix not in src_bufs_by_suffix:
209
+ continue
210
+ b_src = src_bufs_by_suffix[suffix]
211
+ backups_b.append((name, b_dst.data.detach().clone()))
212
+ b_dst.data.copy_(
213
+ b_src.data.to(device=b_dst.device, dtype=b_dst.dtype)
214
+ )
215
+ yield
216
+ finally:
217
+ with torch.no_grad():
218
+ for name, old in backups_p:
219
+ p_dst = dst_named_params[name]
220
+ p_dst.data.copy_(old)
221
+ for name, old in backups_b:
222
+ b_dst = dst_named_bufs[name]
223
+ b_dst.data.copy_(old)
224
+
225
+
226
+ def map_layer_idx(
227
+ dst_idx: int, dst_total: int, src_total: int, mode: str = "ratio"
228
+ ) -> int:
229
+ """
230
+ Map a composite (dst) layer index to donor (src) layer index.
231
+
232
+ - ratio (default): floor(dst_idx * src_total / dst_total)
233
+ - wrap: dst_idx % src_total
234
+ """
235
+ if src_total <= 0:
236
+ raise ValueError("src_total must be > 0")
237
+ if mode == "wrap":
238
+ return dst_idx % src_total
239
+ x = int(math.floor(dst_idx * src_total / max(1, dst_total)))
240
+ return max(0, min(src_total - 1, x))
241
+
242
+
243
+ def main():
244
+ ap = argparse.ArgumentParser(
245
+ description="Per-layer influence via gating and/or swapping."
246
+ )
247
+ ap.add_argument("--model", type=str, required=True)
248
+ ap.add_argument("--donor_model", type=str)
249
+ ap.add_argument("--layers", type=str, required=True)
250
+ ap.add_argument("--prompts", type=str)
251
+ ap.add_argument("--prompts_file", type=str)
252
+ ap.add_argument("--max_length", type=int, default=512)
253
+ ap.add_argument("--batch_size", type=int, default=4)
254
+ ap.add_argument(
255
+ "--dtype",
256
+ type=str,
257
+ default="bfloat16",
258
+ choices=["bfloat16", "float16", "float32"],
259
+ )
260
+ ap.add_argument("--device_map", type=str, default="auto")
261
+ ap.add_argument("--gate_scan", action="store_true")
262
+ ap.add_argument("--swap_scan", action="store_true")
263
+ ap.add_argument("--attn_only", action="store_true")
264
+ ap.add_argument("--mlp_only", action="store_true")
265
+ ap.add_argument(
266
+ "--swap_map", type=str, default="ratio", choices=["ratio", "wrap"]
267
+ )
268
+ args = ap.parse_args()
269
+
270
+ dtype_map = {
271
+ "bfloat16": torch.bfloat16,
272
+ "float16": torch.float16,
273
+ "float32": torch.float32,
274
+ }
275
+ torch_dtype = dtype_map[args.dtype]
276
+
277
+ layers = parse_layers(args.layers)
278
+ texts = load_lines(args.prompts, args.prompts_file)
279
+
280
+ print(f"Loading model: {args.model}")
281
+ tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
282
+ model = AutoModelForCausalLM.from_pretrained(
283
+ args.model,
284
+ torch_dtype=torch_dtype,
285
+ trust_remote_code=True,
286
+ device_map=args.device_map,
287
+ ).eval()
288
+
289
+ final_layers = int(
290
+ getattr(model.config, "num_hidden_layers", len(model.model.layers))
291
+ )
292
+ print(f"Composite num_hidden_layers: {final_layers}")
293
+
294
+ print("Computing baseline NLL/PPL...")
295
+ base_nll, base_tokens = dataset_nll(
296
+ model,
297
+ tok,
298
+ texts,
299
+ max_length=args.max_length,
300
+ batch_size=args.batch_size,
301
+ input_device=get_embed_device(model),
302
+ )
303
+ base_ppl = ppl_from_nll(base_nll, base_tokens)
304
+ print(f"Baseline: tokens={base_tokens} NLL={base_nll:.3f} PPL={base_ppl:.3f}")
305
+
306
+ if args.gate_scan:
307
+ print("\n== Gate scan (disable residual contribution per layer) ==")
308
+ a_scale = 0.0 if not args.mlp_only else 1.0
309
+ m_scale = 0.0 if not args.attn_only else 1.0
310
+
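+ # (Default gates both residuals of the layer to zero; --attn_only gates only the
+ # attention path and --mlp_only gates only the MLP path.)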
311
+ results: List[Tuple[int, float, float]] = []
312
+ for L in layers:
313
+ spec = GateSpec(layer=L, attn_scale=a_scale, mlp_scale=m_scale)
314
+ with gate_layer(model, spec):
315
+ nll, ntok = dataset_nll(
316
+ model,
317
+ tok,
318
+ texts,
319
+ max_length=args.max_length,
320
+ batch_size=args.batch_size,
321
+ input_device=get_embed_device(model),
322
+ )
323
+ ppl = ppl_from_nll(nll, ntok)
324
+ delta_nll = nll - base_nll
325
+ delta_ppl = ppl - base_ppl
326
+ results.append((L, delta_nll, delta_ppl))
327
+ print(
328
+ f"Layer {L:>3}: ΔNLL={delta_nll:+.3f} ΔPPL={delta_ppl:+.3f} "
329
+ f"(NLL={nll:.3f}, PPL={ppl:.3f})"
330
+ )
331
+
332
+ if args.swap_scan:
333
+ if not args.donor_model:
334
+ raise ValueError("--swap_scan requires --donor_model.")
335
+ print(f"\nLoading donor model: {args.donor_model}")
336
+ donor = AutoModelForCausalLM.from_pretrained(
337
+ args.donor_model,
338
+ torch_dtype=torch_dtype,
339
+ trust_remote_code=True,
340
+ device_map="cpu",
341
+ ).eval()
342
+ donor_layers = int(
343
+ getattr(donor.config, "num_hidden_layers", len(donor.model.layers))
344
+ )
345
+ print(
346
+ f"Donor num_hidden_layers: {donor_layers} "
347
+ f"(mapping mode: {args.swap_map})"
348
+ )
349
+
350
+ print("\n== Swap scan (replace composite layer with donor-mapped) ==")
351
+ results_s: List[Tuple[int, int, float, float]] = []
352
+ for L in layers:
353
+ src_L = map_layer_idx(L, final_layers, donor_layers, mode=args.swap_map)
354
+ with swap_layer_from_donor(model, donor, L, src_L):
355
+ nll, ntok = dataset_nll(
356
+ model,
357
+ tok,
358
+ texts,
359
+ max_length=args.max_length,
360
+ batch_size=args.batch_size,
361
+ input_device=get_embed_device(model),
362
+ )
363
+ ppl = ppl_from_nll(nll, ntok)
364
+ delta_nll = nll - base_nll
365
+ delta_ppl = ppl - base_ppl
366
+ results_s.append((L, src_L, delta_nll, delta_ppl))
367
+ print(
368
+ f"Layer {L:>3} <- donor {src_L:>2}: "
369
+ f"ΔNLL={delta_nll:+.3f} ΔPPL={delta_ppl:+.3f} "
370
+ f"(NLL={nll:.3f}, PPL={ppl:.3f})"
371
+ )
372
+
373
+
374
+ if __name__ == "__main__":
375
+ main()
layer_surgery.py ADDED
@@ -0,0 +1,279 @@
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Layer surgery on safetensors shards:
4
+ # - Replace selected transformer blocks with donor blocks
5
+ # - Optionally rescale specific projections per layer
6
+ #
7
+ # Example:
8
+ # python layer_surgery.py \
9
+ # --composite ./qwen3-8b-plus-moe-64L \
10
+ # --base Qwen/Qwen3-8B \
11
+ # --out ./qwen3-8b-plus-moe-64L-surgery \
12
+ # --replace_layers 61 \
13
+ # --map ratio
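+ #
+ # Optional --scale_json rescales projections per layer; an illustrative payload
+ # (keys must come from SCALE_KEYS below, values are multipliers):
+ # {"61": {"attn_o": 0.5, "mlp_down": 0.5}}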
14
+
15
+ import argparse
16
+ import glob
17
+ import json
18
+ import math
19
+ import os
20
+ import shutil
21
+ from pathlib import Path
22
+ from typing import Dict, List, Optional, Tuple
23
+
24
+ import torch
25
+ from safetensors import safe_open
26
+ from safetensors.torch import save_file
27
+ from huggingface_hub import snapshot_download
28
+
29
+
30
+ def read_json(p: str) -> Dict:
31
+ with open(p, "r") as f:
32
+ return json.load(f)
33
+
34
+
35
+ def write_json(p: Path, data: Dict):
36
+ with open(p, "w") as f:
37
+ json.dump(data, f, indent=2)
38
+
39
+
40
+ def ensure_local(model_or_path: str) -> str:
41
+ if os.path.isdir(model_or_path):
42
+ return model_or_path
43
+ print(f"Downloading {model_or_path} ...")
44
+ return snapshot_download(
45
+ model_or_path, cache_dir="./model_cache", resume_download=True
46
+ )
47
+
48
+
49
+ def index_dir(model_dir: str) -> Tuple[Dict[str, str], List[str]]:
50
+ idx_path = os.path.join(model_dir, "model.safetensors.index.json")
51
+ weight_map: Dict[str, str] = {}
52
+ files: List[str] = []
53
+ if os.path.exists(idx_path):
54
+ idx = read_json(idx_path)
55
+ weight_map = idx.get("weight_map", {})
56
+ files = sorted(list({os.path.join(model_dir, f) for f in weight_map.values()}))
57
+ return weight_map, files
58
+
59
+ st_files = glob.glob(os.path.join(model_dir, "*.safetensors"))
60
+ if not st_files:
61
+ raise FileNotFoundError(f"No .safetensors found in {model_dir}")
62
+ for fpath in st_files:
63
+ with safe_open(fpath, framework="pt") as f:
64
+ for k in f.keys():
65
+ weight_map[k] = os.path.basename(fpath)
66
+ files = sorted(st_files)
67
+ return weight_map, files
68
+
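+ # The index file, when present, has roughly this shape (values illustrative):
+ # {"metadata": {"total_size": 123, "format": "safetensors"},
+ #  "weight_map": {"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", ...}}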
69
+
70
+ def parse_layers(spec: str) -> List[int]:
71
+ out: List[int] = []
72
+ for chunk in spec.split(","):
73
+ chunk = chunk.strip()
74
+ if not chunk:
75
+ continue
76
+ if "-" in chunk:
77
+ a, b = chunk.split("-")
78
+ a, b = int(a), int(b)
79
+ out.extend(list(range(a, b + 1)))
80
+ else:
81
+ out.append(int(chunk))
82
+ return sorted(list({x for x in out}))
83
+
84
+
85
+ def layer_prefix(li: int) -> str:
86
+ return f"model.layers.{li}."
87
+
88
+
89
+ def map_layer(dst_idx: int, dst_total: int, src_total: int, mode: str) -> int:
90
+ if src_total <= 0:
91
+ raise ValueError("src_total must be > 0")
92
+ if mode == "wrap":
93
+ return dst_idx % src_total
94
+ x = int(math.floor(dst_idx * src_total / max(1, dst_total)))
95
+ return max(0, min(src_total - 1, x))
96
+
97
+
98
+ def build_explicit_map(pairs: Optional[str]) -> Dict[int, int]:
99
+ m: Dict[int, int] = {}
100
+ if not pairs:
101
+ return m
102
+ for token in pairs.split(","):
103
+ token = token.strip()
104
+ if not token:
105
+ continue
106
+ a, b = token.split(":")
107
+ m[int(a)] = int(b)
108
+ return m
109
+
110
+
111
+ SCALE_KEYS = {
112
+ "attn_q": ".self_attn.q_proj.weight",
113
+ "attn_k": ".self_attn.k_proj.weight",
114
+ "attn_v": ".self_attn.v_proj.weight",
115
+ "attn_o": ".self_attn.o_proj.weight",
116
+ "mlp_up": ".mlp.up_proj.weight",
117
+ "mlp_gate": ".mlp.gate_proj.weight",
118
+ "mlp_down": ".mlp.down_proj.weight",
119
+ }
120
+
121
+
122
+ def load_scales(scale_json: Optional[str]) -> Dict[int, Dict[str, float]]:
123
+ if not scale_json:
124
+ return {}
125
+ data = read_json(scale_json)
126
+ out: Dict[int, Dict[str, float]] = {}
127
+ for k, v in data.items():
128
+ li = int(k)
129
+ out[li] = {}
130
+ for mk, sf in v.items():
131
+ if mk not in SCALE_KEYS:
132
+ raise ValueError(f"Unknown scale key '{mk}'. Valid: {list(SCALE_KEYS)}")
133
+ out[li][mk] = float(sf)
134
+ return out
135
+
136
+
137
+ def tensor_layer_idx(tensor_name: str) -> Optional[int]:
138
+ parts = tensor_name.split(".")
139
+ if len(parts) > 3 and parts[0] == "model" and parts[1] == "layers":
140
+ try:
141
+ return int(parts[2])
142
+ except Exception:
143
+ return None
144
+ return None
145
+
146
+
147
+ def apply_scales_if_needed(
148
+ tname: str, tensor: torch.Tensor, li: int, scales: Dict[int, Dict[str, float]]
149
+ ) -> torch.Tensor:
150
+ if li not in scales:
151
+ return tensor
152
+ spec = scales[li]
153
+ for key, suffix in SCALE_KEYS.items():
154
+ if key in spec and tname.endswith(suffix):
155
+ s = spec[key]
156
+ return (tensor * tensor.new_tensor(s)).contiguous()
157
+ return tensor
158
+
159
+
160
+ def main():
161
+ ap = argparse.ArgumentParser(
162
+ description="Layer surgery on safetensors: replace and/or rescale layers."
163
+ )
164
+ ap.add_argument("--composite", type=str, required=True)
165
+ ap.add_argument("--base", type=str, help="Donor model dir or HF ID")
166
+ ap.add_argument("--out", type=str, required=True)
167
+ ap.add_argument("--replace_layers", type=str, help='e.g. "61" or "48-55,60,62"')
168
+ ap.add_argument(
169
+ "--map", type=str, default="ratio", choices=["ratio", "wrap"]
170
+ )
171
+ ap.add_argument("--map_pairs", type=str, help='e.g. "61:34,55:30"')
172
+ ap.add_argument("--scale_json", type=str)
173
+ args = ap.parse_args()
174
+
175
+ comp_dir = ensure_local(args.composite)
176
+ out_dir = Path(args.out)
177
+ out_dir.mkdir(parents=True, exist_ok=True)
178
+
179
+ comp_cfg = read_json(os.path.join(comp_dir, "config.json"))
180
+ L_comp = int(comp_cfg.get("num_hidden_layers"))
181
+ print(f"Composite layers: {L_comp}")
182
+
183
+ replace_set: List[int] = []
184
+ if args.replace_layers:
185
+ replace_set = parse_layers(args.replace_layers)
186
+ if not args.base:
187
+ raise ValueError("--base is required when --replace_layers is set.")
188
+ base_dir = ensure_local(args.base)
189
+ base_cfg = read_json(os.path.join(base_dir, "config.json"))
190
+ L_base = int(base_cfg.get("num_hidden_layers"))
191
+ print(f"Donor layers: {L_base}")
192
+ explicit = build_explicit_map(args.map_pairs)
193
+ else:
194
+ base_dir = ""
195
+ L_base = 0
196
+ explicit = {}
197
+
198
+ comp_map, comp_files = index_dir(comp_dir)
199
+ if replace_set:
200
+ base_map, base_files = index_dir(base_dir)
201
+ else:
202
+ base_map, base_files = {}, []
203
+
204
+ scales = load_scales(args.scale_json)
205
+ if scales:
206
+ print("Scales loaded for layers:", sorted(scales.keys()))
207
+
208
+ to_copy = [
209
+ "config.json",
210
+ "tokenizer.json",
211
+ "tokenizer_config.json",
212
+ "special_tokens_map.json",
213
+ "vocab.json",
214
+ "merges.txt",
215
+ "tokenizer.model",
216
+ "generation_config.json",
217
+ ]
218
+ for fname in to_copy:
219
+ src = os.path.join(comp_dir, fname)
220
+ if os.path.exists(src):
221
+ shutil.copy2(src, out_dir / fname)
222
+
223
+ print("Performing surgery shard-by-shard...")
224
+ out_weight_map: Dict[str, str] = {}
225
+ for comp_f in comp_files:
226
+ rel = os.path.basename(comp_f)
227
+ out_f = out_dir / rel
228
+ new_tensors: Dict[str, torch.Tensor] = {}
229
+
230
+ with safe_open(comp_f, framework="pt") as fcomp:
231
+ keys = list(fcomp.keys())
232
+ for k in keys:
233
+ li = tensor_layer_idx(k)
234
+ tensor = None
235
+
236
+ if li is not None and li in replace_set:
237
+ if li in explicit:
238
+ src_li = explicit[li]
239
+ else:
240
+ src_li = map_layer(li, L_comp, L_base, args.map)
241
+ src_prefix = layer_prefix(src_li)
242
+ dst_prefix = layer_prefix(li)
243
+ donor_k = src_prefix + k[len(dst_prefix) :]
244
+
245
+ donor_file = base_map.get(donor_k)
246
+ if donor_file is None:
247
+ raise KeyError(f"Donor tensor not found: {donor_k}")
248
+ donor_path = os.path.join(base_dir, donor_file)
249
+ with safe_open(donor_path, framework="pt") as fbase:
250
+ tensor = fbase.get_tensor(donor_k)
251
+ else:
252
+ tensor = fcomp.get_tensor(k)
253
+
254
+ if li is not None:
255
+ tensor = apply_scales_if_needed(k, tensor, li, scales)
256
+
257
+ if not tensor.is_contiguous():
258
+ tensor = tensor.contiguous()
259
+ new_tensors[k] = tensor
260
+ out_weight_map[k] = rel
261
+
262
+ save_file(new_tensors, str(out_f))
263
+
264
+ total_size = 0
265
+ for fname in set(out_weight_map.values()):
266
+ fp = out_dir / fname
267
+ if fp.exists():
268
+ total_size += fp.stat().st_size
269
+ index = {"metadata": {"total_size": total_size, "format": "safetensors"}, "weight_map": out_weight_map}
270
+ write_json(out_dir / "model.safetensors.index.json", index)
271
+ print(f"Done. Wrote modified shards and index to: {out_dir}")
272
+
273
+ print("\nTip: validate load quickly (meta device):")
274
+ print(f" from transformers import AutoModelForCausalLM")
275
+ print(f" AutoModelForCausalLM.from_pretrained('{str(out_dir)}', device_map='meta', trust_remote_code=True)")
276
+
277
+
278
+ if __name__ == "__main__":
279
+ main()
moe_to_dense.py ADDED
@@ -0,0 +1,1097 @@
+ #!/usr/bin/env python3
2
+ """
3
+ Convert Qwen3 MoE model to dense format with target model compatibility,
4
+ then optionally build a larger composite model by interleaving layers
5
+ with a base dense model (e.g., Qwen3-8B).
6
+
7
+ Usage (MoE -> dense):
8
+ python moe_to_dense.py \
9
+ --model_id Qwen/Qwen3-235B-A22B-Instruct-2507 \
10
+ --target_model Qwen/Qwen3-8B \
11
+ --output_path ./qwen3-235b-dense-avg \
12
+ --method average \
13
+ --low_memory
14
+
15
+ Usage (build larger composite by interleaving):
16
+ python moe_to_dense.py \
17
+ --compose_interleaved \
18
+ --base_model Qwen/Qwen3-8B \
19
+ --moe_converted ./qwen3-235b-dense-avg \
20
+ --composite_output_path ./qwen3-8b-plus-moe-64L \
21
+ --final_layers 64 \
22
+ --interleave_strategy even \
23
+ --cast_dtype bfloat16
24
+
25
+ Validate load (meta device, no allocations):
26
+ python moe_to_dense.py \
27
+ --validate_model ./qwen3-8b-plus-moe-64L
28
+ """
29
+
30
+ import os
31
+ import json
32
+ import torch
33
+ import argparse
34
+ from pathlib import Path
35
+ from typing import Dict, Any, Optional, List, Tuple
36
+ from tqdm import tqdm
37
+ import logging
38
+ from safetensors import safe_open
39
+ from safetensors.torch import save_file
40
+ import glob
41
+ from huggingface_hub import snapshot_download
42
+ import shutil
43
+ import gc
44
+ import math
45
+
46
+ logging.basicConfig(level=logging.INFO)
47
+ logger = logging.getLogger(__name__)
48
+
49
+
50
+ def read_json(path: str) -> Dict[str, Any]:
51
+ with open(path, "r") as f:
52
+ return json.load(f)
53
+
54
+
55
+ def write_json(path: Path, data: Dict[str, Any]):
56
+ with open(path, "w") as f:
57
+ json.dump(data, f, indent=2)
58
+
59
+
60
+ def cast_tensor_dtype(t: torch.Tensor, cast: Optional[str]) -> torch.Tensor:
61
+ if cast is None:
62
+ return t
63
+ target = {
64
+ "float32": torch.float32,
65
+ "fp32": torch.float32,
66
+ "float16": torch.float16,
67
+ "fp16": torch.float16,
68
+ "bfloat16": torch.bfloat16,
69
+ "bf16": torch.bfloat16,
70
+ }[cast.lower()]
71
+ if t.dtype == target:
72
+ return t
73
+ return t.to(dtype=target)
74
+
75
+
76
+ class MoEToDenseConverter:
77
+ def __init__(
78
+ self,
79
+ model_path: str,
80
+ target_model_path: str,
81
+ output_path: str,
82
+ method: str = "average",
83
+ low_memory: bool = False,
84
+ ):
85
+ """
86
+ Initialize the converter with target model compatibility.
87
+
88
+ Args:
89
+ model_path: Path to MoE model or HuggingFace model ID
90
+ target_model_path: Path to target dense model for dimension matching
91
+ output_path: Where to save the converted dense model
92
+ method: How to handle experts:
93
+ - "concat_experts": Concatenate experts per projection
94
+ - "average": Average experts per projection (recommended)
95
+ - "first": Use first expert
96
+ low_memory: Process per-shard with minimal RAM
97
+ """
98
+ self.model_path = model_path
99
+ self.target_model_path = target_model_path
100
+ self.output_path = Path(output_path)
101
+ self.method = method
102
+ self.low_memory = low_memory
103
+ self.output_path.mkdir(parents=True, exist_ok=True)
104
+
105
+ # Will be set in load_config
106
+ self.source_config: Optional[Dict[str, Any]] = None
107
+
108
+ # Load target model config for dimension matching
109
+ self.target_config = self.load_target_config()
110
+
111
+ def load_target_config(self) -> Dict[str, Any]:
112
+ if not os.path.exists(self.target_model_path):
113
+ logger.info(f"Downloading target model {self.target_model_path}...")
114
+ self.target_model_path = snapshot_download(
115
+ self.target_model_path,
116
+ cache_dir="./model_cache",
117
+ allow_patterns=["config.json"],
118
+ )
119
+
120
+ config_path = os.path.join(self.target_model_path, "config.json")
121
+ config = read_json(config_path)
122
+
123
+ logger.info("Target model dimensions:")
124
+ logger.info(f" hidden_size: {config.get('hidden_size')}")
125
+ logger.info(f" intermediate_size: {config.get('intermediate_size')}")
126
+ logger.info(
127
+ f" num_attention_heads: {config.get('num_attention_heads')}"
128
+ )
129
+ logger.info(
130
+ f" num_key_value_heads: {config.get('num_key_value_heads')}"
131
+ )
132
+
133
+ return config
134
+
135
+ def download_model_if_needed(self):
136
+ if not os.path.exists(self.model_path):
137
+ logger.info(
138
+ f"Downloading model {self.model_path} from HuggingFace..."
139
+ )
140
+ self.model_path = snapshot_download(
141
+ self.model_path,
142
+ cache_dir="./model_cache",
143
+ resume_download=True,
144
+ )
145
+ return self.model_path
146
+
147
+ def load_config(self) -> Dict[str, Any]:
148
+ config_path = os.path.join(self.model_path, "config.json")
149
+ source_cfg = read_json(config_path)
150
+ self.source_config = dict(source_cfg)
151
+
152
+ logger.info(
153
+ f"Source MoE architecture: "
154
+ f"{source_cfg.get('architectures', ['Unknown'])}"
155
+ )
156
+ logger.info(f" num_experts: {source_cfg.get('num_experts')}")
157
+ logger.info(
158
+ f" moe_intermediate_size: "
159
+ f"{source_cfg.get('moe_intermediate_size')}"
160
+ )
161
+
162
+ cfg = dict(source_cfg)
163
+ if "Qwen3MoeForCausalLM" in cfg.get("architectures", []):
164
+ cfg["architectures"] = ["Qwen3ForCausalLM"]
165
+
166
+ cfg["hidden_size"] = self.target_config["hidden_size"]
167
+ cfg["intermediate_size"] = self.target_config["intermediate_size"]
168
+ cfg["num_attention_heads"] = self.target_config["num_attention_heads"]
169
+ cfg["num_key_value_heads"] = self.target_config["num_key_value_heads"]
170
+
171
+ moe_params = [
172
+ "num_experts",
173
+ "num_experts_per_tok",
174
+ "moe_intermediate_size",
175
+ "decoder_sparse_step",
176
+ "norm_topk_prob",
177
+ "output_router_logits",
178
+ "router_aux_loss_coef",
179
+ "mlp_only_layers",
180
+ ]
181
+ for param in moe_params:
182
+ if param in cfg:
183
+ del cfg[param]
184
+
185
+ if cfg.get("model_type") == "qwen3_moe":
186
+ cfg["model_type"] = "qwen3"
187
+
188
+ return cfg
189
+
190
+ @staticmethod
191
+ def _pad_trunc_rows(t: torch.Tensor, rows: int) -> torch.Tensor:
192
+ if t.shape[0] == rows:
193
+ return t
194
+ if t.shape[0] > rows:
195
+ return t[:rows, :].contiguous()
196
+ pad = torch.zeros(
197
+ rows - t.shape[0], t.shape[1], dtype=t.dtype, device=t.device
198
+ )
199
+ return torch.cat([t, pad], dim=0).contiguous()
200
+
201
+ @staticmethod
202
+ def _pad_trunc_cols(t: torch.Tensor, cols: int) -> torch.Tensor:
203
+ if t.shape[1] == cols:
204
+ return t
205
+ if t.shape[1] > cols:
206
+ return t[:, :cols].contiguous()
207
+ pad = torch.zeros(
208
+ t.shape[0], cols - t.shape[1], dtype=t.dtype, device=t.device
209
+ )
210
+ return torch.cat([t, pad], dim=1).contiguous()
211
+
212
+ def convert_attention_layers(
213
+ self, tensors: Dict[str, torch.Tensor], layer_idx: int
214
+ ) -> Dict[str, torch.Tensor]:
215
+ """
216
+ Convert attention layers to match target dimensions with proper GQA
217
+ head remapping.
218
+
219
+ Linear weights are [out_features, in_features]:
220
+ - q_proj: out = num_attention_heads * head_dim (= hidden_size)
221
+ - k_proj: out = num_key_value_heads * head_dim
222
+ - v_proj: out = num_key_value_heads * head_dim
223
+ - o_proj: in = num_attention_heads * head_dim (= hidden_size)
224
+ """
225
+ if self.source_config is None:
226
+ raise RuntimeError(
227
+ "source_config not set. load_config must be called first."
228
+ )
229
+
230
+ converted: Dict[str, torch.Tensor] = {}
231
+
232
+ tgt_hidden = int(self.target_config["hidden_size"])
233
+ tgt_heads = int(self.target_config["num_attention_heads"])
234
+ tgt_kv_heads = int(self.target_config["num_key_value_heads"])
235
+ if tgt_heads == 0 or tgt_hidden % tgt_heads != 0:
236
+ raise ValueError(
237
+ f"Invalid target heads/hidden: hidden={tgt_hidden}, "
238
+ f"heads={tgt_heads}"
239
+ )
240
+ tgt_head_dim = tgt_hidden // tgt_heads
241
+
242
+ src_heads = int(
243
+ self.source_config.get("num_attention_heads", tgt_heads)
244
+ )
245
+ src_kv_heads = int(
246
+ self.source_config.get("num_key_value_heads", tgt_kv_heads)
247
+ )
248
+
249
+ def remap_heads(
250
+ W: torch.Tensor,
251
+ src_n: int,
252
+ tgt_n: int,
253
+ src_hd: int,
254
+ tgt_hd: int,
255
+ ) -> torch.Tensor:
256
+ # W: [src_n * src_hd, in]
257
+ W3 = W.view(src_n, src_hd, W.shape[1])
258
+ if src_hd != tgt_hd:
259
+ if src_hd > tgt_hd:
260
+ W3 = W3[:, :tgt_hd, :].contiguous()
261
+ else:
262
+ pad = torch.zeros(
263
+ src_n,
264
+ tgt_hd - src_hd,
265
+ W.shape[1],
266
+ dtype=W.dtype,
267
+ device=W.device,
268
+ )
269
+ W3 = torch.cat([W3, pad], dim=1).contiguous()
270
+ if src_n == tgt_n:
271
+ Wm = W3
272
+ elif src_n > tgt_n:
273
+ if src_n % tgt_n != 0:
274
+ Wm = W3[:tgt_n, :, :].contiguous()
275
+ else:
276
+ g = src_n // tgt_n
277
+ Wm = (
278
+ W3.view(tgt_n, g, tgt_hd, W.shape[1])
279
+ .mean(dim=1)
280
+ .contiguous()
281
+ )
282
+ else:
283
+ r = tgt_n // max(1, src_n)
284
+ if r * src_n == tgt_n:
285
+ Wm = W3.repeat_interleave(r, dim=0).contiguous()
286
+ else:
287
+ reps = math.ceil(tgt_n / src_n)
288
+ Wm = (
289
+ W3.repeat((reps, 1, 1))[:tgt_n, :, :].contiguous()
290
+ )
291
+ W2 = Wm.view(tgt_n * tgt_hd, W.shape[1])
292
+ return W2
293
+
294
+ for key, tensor in tensors.items():
295
+ if "self_attn" not in key:
296
+ continue
297
+
298
+ if "q_proj" in key:
299
+ out_src = tensor.shape[0]
300
+ src_hd = out_src // max(1, src_heads)
301
+ if src_hd * src_heads != out_src:
302
+ src_hd = tgt_head_dim
303
+ W = remap_heads(tensor, src_heads, tgt_heads, src_hd, tgt_head_dim)
304
+ W = self._pad_trunc_cols(W, tgt_hidden)
305
+ converted[key] = W
306
+
307
+ elif "k_proj" in key or "v_proj" in key:
308
+ out_src = tensor.shape[0]
309
+ src_hd = out_src // max(1, src_kv_heads)
310
+ if src_hd * src_kv_heads != out_src:
311
+ src_hd = tgt_head_dim
312
+ W = remap_heads(
313
+ tensor, src_kv_heads, tgt_kv_heads, src_hd, tgt_head_dim
314
+ )
315
+ W = self._pad_trunc_cols(W, tgt_hidden)
316
+ converted[key] = W
317
+
318
+ elif "o_proj" in key:
319
+ W = self._pad_trunc_cols(tensor, tgt_hidden)
320
+ W = self._pad_trunc_rows(W, tgt_hidden)
321
+ converted[key] = W
322
+
323
+ return converted
324
+
325
+ def convert_moe_layer_to_dense(
326
+ self, layer_tensors: Dict[str, torch.Tensor], layer_idx: int
327
+ ) -> Dict[str, torch.Tensor]:
328
+ """
329
+ Convert MoE FFN experts to a single dense FFN matching target dims.
330
+
331
+ Orientation:
332
+ - up_proj, gate_proj -> [intermediate, hidden] (concat along rows)
333
+ - down_proj -> [hidden, intermediate] (concat along cols)
334
+ """
335
+ dense_tensors: Dict[str, torch.Tensor] = {}
336
+ expert_tensors = {"up_proj": [], "down_proj": [], "gate_proj": []}
337
+
338
+ for key, tensor in layer_tensors.items():
339
+ if "experts" in key:
340
+ if "up_proj" in key:
341
+ expert_tensors["up_proj"].append(tensor)
342
+ elif "down_proj" in key:
343
+ expert_tensors["down_proj"].append(tensor)
344
+ elif "gate_proj" in key:
345
+ expert_tensors["gate_proj"].append(tensor)
346
+ elif "router" not in key and "mlp" not in key:
347
+ dense_tensors[key] = tensor
348
+
349
+ attention_tensors = self.convert_attention_layers(
350
+ {k: v for k, v in layer_tensors.items() if "self_attn" in k},
351
+ layer_idx,
352
+ )
353
+ dense_tensors.update(attention_tensors)
354
+
355
+ target_intermediate = int(self.target_config["intermediate_size"])
356
+ target_hidden = int(self.target_config["hidden_size"])
357
+
358
+ def infer_per_expert_ffn() -> int:
359
+ src = (
360
+ expert_tensors["up_proj"]
361
+ or expert_tensors["gate_proj"]
362
+ or expert_tensors["down_proj"]
363
+ )
364
+ if not src:
365
+ return 1536
366
+ s = src[0].shape
367
+ if s[0] == target_hidden:
368
+ return int(s[1])
369
+ if s[1] == target_hidden:
370
+ return int(s[0])
371
+ return int(min(s[0], s[1]))
372
+
373
+ per_expert_ffn = infer_per_expert_ffn()
374
+ logger.info(f" per_expert_ffn inferred as {per_expert_ffn}")
375
+
376
+ def to_up_gate_shape(W: torch.Tensor) -> torch.Tensor:
377
+ if W.shape == (target_hidden, per_expert_ffn):
378
+ return W.t().contiguous()
379
+ if W.shape == (per_expert_ffn, target_hidden):
380
+ return W.contiguous()
381
+ return W.t().contiguous()
382
+
383
+ def to_down_shape(W: torch.Tensor) -> torch.Tensor:
384
+ if W.shape == (per_expert_ffn, target_hidden):
385
+ return W.t().contiguous()
386
+ if W.shape == (target_hidden, per_expert_ffn):
387
+ return W.contiguous()
388
+ return W.t().contiguous()
389
+
390
+ if self.method == "concat_experts":
391
+ num_experts_needed = math.ceil(
392
+ target_intermediate / per_expert_ffn
393
+ )
394
+ for proj_type in ["up_proj", "gate_proj"]:
395
+ if expert_tensors[proj_type]:
396
+ selected = expert_tensors[proj_type][:num_experts_needed]
397
+ while len(selected) < num_experts_needed:
398
+ selected.append(
399
+ expert_tensors[proj_type][
400
+ len(selected)
401
+ % len(expert_tensors[proj_type])
402
+ ]
403
+ )
404
+ blocks = [to_up_gate_shape(W) for W in selected]
405
+ W_cat = torch.cat(blocks, dim=0)
406
+ W_cat = W_cat[:target_intermediate, :].contiguous()
407
+ dense_key = (
408
+ f"model.layers.{layer_idx}.mlp.{proj_type}.weight"
409
+ )
410
+ dense_tensors[dense_key] = W_cat
411
+
412
+ if expert_tensors["down_proj"]:
413
+ selected = expert_tensors["down_proj"][:num_experts_needed]
414
+ while len(selected) < num_experts_needed:
415
+ selected.append(
416
+ expert_tensors["down_proj"][
417
+ len(selected)
418
+ % len(expert_tensors["down_proj"])
419
+ ]
420
+ )
421
+ blocks = [to_down_shape(W) for W in selected]
422
+ W_cat = torch.cat(blocks, dim=1)
423
+ W_cat = W_cat[:, :target_intermediate].contiguous()
424
+ dense_key = (
425
+ f"model.layers.{layer_idx}.mlp.down_proj.weight"
426
+ )
427
+ dense_tensors[dense_key] = W_cat
428
+
429
+ elif self.method == "average":
430
+ for proj_type in ["up_proj", "gate_proj"]:
431
+ if expert_tensors[proj_type]:
432
+ stack = torch.stack(
433
+ [to_up_gate_shape(e) for e in expert_tensors[proj_type]]
434
+ )
435
+ W = torch.mean(stack, dim=0)
436
+ if W.shape[0] < target_intermediate:
437
+ pad = torch.zeros(
438
+ target_intermediate - W.shape[0],
439
+ W.shape[1],
440
+ dtype=W.dtype,
441
+ )
442
+ W = torch.cat([W, pad], dim=0).contiguous()
443
+ else:
444
+ W = W[:target_intermediate, :].contiguous()
445
+ dense_key = (
446
+ f"model.layers.{layer_idx}.mlp.{proj_type}.weight"
447
+ )
448
+ dense_tensors[dense_key] = W
449
+
450
+ if expert_tensors["down_proj"]:
451
+ stack = torch.stack(
452
+ [to_down_shape(e) for e in expert_tensors["down_proj"]]
453
+ )
454
+ W = torch.mean(stack, dim=0)
455
+ if W.shape[1] < target_intermediate:
456
+ pad = torch.zeros(
457
+ W.shape[0],
458
+ target_intermediate - W.shape[1],
459
+ dtype=W.dtype,
460
+ )
461
+ W = torch.cat([W, pad], dim=1).contiguous()
462
+ else:
463
+ W = W[:, :target_intermediate].contiguous()
464
+ dense_key = (
465
+ f"model.layers.{layer_idx}.mlp.down_proj.weight"
466
+ )
467
+ dense_tensors[dense_key] = W
468
+
469
+ elif self.method == "first":
470
+ for proj_type in ["up_proj", "gate_proj"]:
471
+ if expert_tensors[proj_type]:
472
+ W = to_up_gate_shape(expert_tensors[proj_type][0])
473
+ if W.shape[0] < target_intermediate:
474
+ pad = torch.zeros(
475
+ target_intermediate - W.shape[0],
476
+ W.shape[1],
477
+ dtype=W.dtype,
478
+ )
479
+ W = torch.cat([W, pad], dim=0).contiguous()
480
+ else:
481
+ W = W[:target_intermediate, :].contiguous()
482
+ dense_key = (
483
+ f"model.layers.{layer_idx}.mlp.{proj_type}.weight"
484
+ )
485
+ dense_tensors[dense_key] = W
486
+
487
+ if expert_tensors["down_proj"]:
488
+ W = to_down_shape(expert_tensors["down_proj"][0])
489
+ if W.shape[1] < target_intermediate:
490
+ pad = torch.zeros(
491
+ W.shape[0],
492
+ target_intermediate - W.shape[1],
493
+ dtype=W.dtype,
494
+ )
495
+ W = torch.cat([W, pad], dim=1).contiguous()
496
+ else:
497
+ W = W[:, :target_intermediate].contiguous()
498
+ dense_key = (
499
+ f"model.layers.{layer_idx}.mlp.down_proj.weight"
500
+ )
501
+ dense_tensors[dense_key] = W
502
+
503
+ return dense_tensors
504
+
505
+ def convert_safetensors_file(
506
+ self, input_file: str, output_file: str
507
+ ) -> Tuple[List[str], str]:
508
+ logger.info(f"Converting {os.path.basename(input_file)}...")
509
+
510
+ tensors_by_layer: Dict[int, Dict[str, torch.Tensor]] = {}
511
+ other_tensors: Dict[str, torch.Tensor] = {}
512
+
513
+ with safe_open(input_file, framework="pt") as f:
514
+ for key in f.keys():
515
+ tensor = f.get_tensor(key)
516
+ if "model.layers." in key:
517
+ parts = key.split(".")
518
+ layer_idx = int(parts[2])
519
+ if layer_idx not in tensors_by_layer:
520
+ tensors_by_layer[layer_idx] = {}
521
+ tensors_by_layer[layer_idx][key] = tensor
522
+ else:
523
+ other_tensors[key] = tensor
524
+
525
+ converted_tensors: Dict[str, torch.Tensor] = {}
526
+ for layer_idx in sorted(tensors_by_layer.keys()):
527
+ logger.info(f"Processing layer {layer_idx}")
528
+ layer_tensors = tensors_by_layer[layer_idx]
529
+ has_experts = any("experts" in key for key in layer_tensors.keys())
530
+
531
+ if has_experts:
532
+ dense_layer = self.convert_moe_layer_to_dense(
533
+ layer_tensors, layer_idx
534
+ )
535
+ converted_tensors.update(dense_layer)
536
+ else:
537
+ attention_tensors = self.convert_attention_layers(
538
+ {
539
+ k: v
540
+ for k, v in layer_tensors.items()
541
+ if "self_attn" in k
542
+ },
543
+ layer_idx,
544
+ )
545
+ converted_tensors.update(attention_tensors)
546
+ for key, tensor in layer_tensors.items():
547
+ if "router" not in key and "self_attn" not in key:
548
+ converted_tensors[key] = tensor
549
+
550
+ converted_tensors.update(other_tensors)
551
+
552
+ for key in converted_tensors:
553
+ if not converted_tensors[key].is_contiguous():
554
+ converted_tensors[key] = converted_tensors[key].contiguous()
555
+
556
+ save_file(converted_tensors, output_file)
557
+ logger.info(f"Saved to {os.path.basename(output_file)}")
558
+ return list(converted_tensors.keys()), output_file
559

    def convert(self):
        self.model_path = self.download_model_if_needed()
        logger.info("Converting configuration...")
        config = self.load_config()

        config_output = self.output_path / "config.json"
        write_json(config_output, config)
        logger.info(f"Saved config to {config_output}")

        logger.info("Copying tokenizer files...")
        tokenizer_files = [
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "vocab.json",
            "merges.txt",
            "tokenizer.model",
            "generation_config.json",
        ]
        for file in tokenizer_files:
            src = os.path.join(self.model_path, file)
            if os.path.exists(src):
                dst = self.output_path / file
                shutil.copy2(src, dst)
                logger.info(f" Copied {file}")

        weight_files = glob.glob(os.path.join(self.model_path, "*.safetensors"))
        if not weight_files:
            weight_files = glob.glob(
                os.path.join(self.model_path, "model*.safetensors")
            )
        if not weight_files:
            raise FileNotFoundError(
                f"No safetensors files found in {self.model_path}"
            )

        weight_files.sort()
        logger.info(f"Found {len(weight_files)} weight files to convert")

        tensor_map: Dict[str, str] = {}
        total_tensors = 0

        for i, weight_file in enumerate(weight_files, 1):
            output_filename = (
                f"model-{i:05d}-of-{len(weight_files):05d}.safetensors"
            )
            output_file = self.output_path / output_filename
            tensor_names, _ = self.convert_safetensors_file(
                weight_file, str(output_file)
            )

            for tensor_name in tensor_names:
                tensor_map[tensor_name] = output_filename

            total_tensors += len(tensor_names)
            logger.info(f"Progress: {i}/{len(weight_files)} files converted")

            if self.low_memory:
                gc.collect()

        self.create_model_index(tensor_map)

        logger.info("Conversion complete")
        logger.info(f" Total tensors converted: {total_tensors}")
        logger.info(f" Output saved to: {self.output_path}")
        return self.output_path

    def create_model_index(self, tensor_map: Dict[str, str]):
        total_size = 0
        for filename in set(tensor_map.values()):
            file_path = self.output_path / filename
            if file_path.exists():
                total_size += file_path.stat().st_size

        index = {
            "metadata": {"total_size": total_size, "format": "safetensors"},
            "weight_map": tensor_map,
        }

        index_path = self.output_path / "model.safetensors.index.json"
        write_json(index_path, index)
        logger.info(
            f"Created index file with {len(tensor_map)} tensor mappings"
        )
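
    # The index written above follows the standard safetensors sharding layout;
    # an illustrative (not literal) example of its contents:
    #   {
    #     "metadata": {"total_size": 123456789, "format": "safetensors"},
    #     "weight_map": {"model.embed_tokens.weight": "model-00001-of-00002.safetensors", ...}
    #   }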


def index_safetensors_dir(
    model_dir: str,
) -> Tuple[Dict[str, str], List[str]]:
    model_dir = str(model_dir)
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    weight_map: Dict[str, str] = {}
    files: List[str] = []

    if os.path.exists(index_path):
        idx = read_json(index_path)
        weight_map = idx.get("weight_map", {})
        files = sorted(
            list({os.path.join(model_dir, f) for f in weight_map.values()})
        )
        return weight_map, files

    st_files = glob.glob(os.path.join(model_dir, "*.safetensors"))
    if not st_files:
        raise FileNotFoundError(
            f"No safetensors files found in {model_dir}"
        )

    for fpath in st_files:
        with safe_open(fpath, framework="pt") as f:
            for key in f.keys():
                weight_map[key] = os.path.basename(fpath)
    files = sorted(st_files)
    return weight_map, files


def list_layer_keys(weight_map: Dict[str, str], layer_idx: int) -> List[str]:
    prefix = f"model.layers.{layer_idx}."
    return [k for k in weight_map.keys() if k.startswith(prefix)]


def load_tensors_by_keys(
    model_dir: str,
    weight_map: Dict[str, str],
    keys: List[str],
    cast_dtype_str: Optional[str] = None,
) -> Dict[str, torch.Tensor]:
    files_to_keys: Dict[str, List[str]] = {}
    for k in keys:
        filename = weight_map[k]
        files_to_keys.setdefault(filename, []).append(k)

    out: Dict[str, torch.Tensor] = {}
    for filename, klist in files_to_keys.items():
        fpath = os.path.join(model_dir, filename)
        with safe_open(fpath, framework="pt") as f:
            for k in klist:
                t = f.get_tensor(k)
                t = cast_tensor_dtype(t, cast_dtype_str)
                if not t.is_contiguous():
                    t = t.contiguous()
                out[k] = t
    return out


def rename_layer_keys(
    tensors: Dict[str, torch.Tensor],
    src_layer: int,
    dst_layer: int,
) -> Dict[str, torch.Tensor]:
    src_prefix = f"model.layers.{src_layer}."
    dst_prefix = f"model.layers.{dst_layer}."
    out: Dict[str, torch.Tensor] = {}
    for k, v in tensors.items():
        if not k.startswith(src_prefix):
            continue
        new_k = dst_prefix + k[len(src_prefix) :]
        out[new_k] = v
    return out


def copy_non_layer_tensors(
    src_dir: str,
    cast_dtype_str: Optional[str] = None,
) -> Dict[str, torch.Tensor]:
    weight_map, _ = index_safetensors_dir(src_dir)
    keys = [k for k in weight_map.keys() if "model.layers." not in k]
    return load_tensors_by_keys(src_dir, weight_map, keys, cast_dtype_str)


def build_even_interleave_plan(
    final_layers: int,
    base_layers: int,
    moe_layers: int,
) -> List[Tuple[str, int]]:
    n_moe = min(moe_layers, max(0, final_layers - base_layers))
    n_base = final_layers - n_moe
    plan: List[Tuple[str, int]] = []

    moe_slots = set()
    for i in range(final_layers):
        if (
            math.floor((i + 1) * n_moe / max(1, final_layers))
            != math.floor(i * n_moe / max(1, final_layers))
            and len(moe_slots) < n_moe
        ):
            moe_slots.add(i)

    used_moe = 0
    used_base = 0
    for i in range(final_layers):
        if i in moe_slots:
            src_idx = 0
            if n_moe > 0:
                src_idx = min(
                    moe_layers - 1,
                    math.floor(used_moe * moe_layers / max(1, n_moe)),
                )
            plan.append(("moe", src_idx))
            used_moe += 1
        else:
            src_idx = 0
            if n_base > 0:
                src_idx = min(
                    base_layers - 1,
                    math.floor(used_base * base_layers / max(1, n_base)),
                )
            plan.append(("base", src_idx))
            used_base += 1

    return plan
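
# Worked example (traced by hand from the function above): with
# final_layers=6, base_layers=4, moe_layers=2, the two MoE-derived layers are
# spread evenly through the stack:
#   [("base", 0), ("base", 1), ("moe", 0), ("base", 2), ("base", 3), ("moe", 1)]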


def build_alternate_plan(
    final_layers: int,
    base_layers: int,
    moe_layers: int,
) -> List[Tuple[str, int]]:
    plan: List[Tuple[str, int]] = []
    b = 0
    m = 0
    turn_moe = True
    while len(plan) < final_layers:
        if turn_moe and m < moe_layers:
            plan.append(("moe", m))
            m += 1
        elif b < base_layers:
            plan.append(("base", b))
            b += 1
        elif m < moe_layers:
            plan.append(("moe", m))
            m += 1
        else:
            plan.append(plan[-1])
        turn_moe = not turn_moe
    return plan
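
# Worked example (traced by hand): with final_layers=5, base_layers=3,
# moe_layers=2, sources alternate until the MoE layers run out, then the
# remaining slots fall back to base layers:
#   [("moe", 0), ("base", 0), ("moe", 1), ("base", 1), ("base", 2)]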


def build_composite_model(
    base_model_dir: str,
    moe_model_dir: str,
    output_dir: str,
    final_layers: int,
    interleave_strategy: str = "even",
    cast_dtype_str: Optional[str] = None,
    low_memory: bool = True,
):
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    base_cfg = read_json(os.path.join(base_model_dir, "config.json"))
    moe_cfg = read_json(os.path.join(moe_model_dir, "config.json"))

    for k in ["hidden_size", "intermediate_size"]:
        if base_cfg.get(k) != moe_cfg.get(k):
            raise ValueError(
                f"Config mismatch for {k}: base={base_cfg.get(k)} "
                f"moe={moe_cfg.get(k)}"
            )

    base_layers = int(base_cfg.get("num_hidden_layers"))
    moe_layers = int(moe_cfg.get("num_hidden_layers"))

    logger.info(
        f"Composite plan: base_layers={base_layers}, "
        f"moe_layers={moe_layers}, final_layers={final_layers}"
    )

    if interleave_strategy == "even":
        plan = build_even_interleave_plan(
            final_layers, base_layers, moe_layers
        )
    elif interleave_strategy == "alternate":
        plan = build_alternate_plan(
            final_layers, base_layers, moe_layers
        )
    else:
        raise ValueError(
            "interleave_strategy must be 'even' or 'alternate'"
        )

    out_cfg = dict(base_cfg)
    out_cfg["num_hidden_layers"] = final_layers
    write_json(out_dir / "config.json", out_cfg)

    logger.info("Copying tokenizer/aux files from base...")
    for fname in [
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.json",
        "merges.txt",
        "tokenizer.model",
        "generation_config.json",
    ]:
        src = os.path.join(base_model_dir, fname)
        if os.path.exists(src):
            shutil.copy2(src, out_dir / fname)

    base_map, _ = index_safetensors_dir(base_model_dir)
    moe_map, _ = index_safetensors_dir(moe_model_dir)

    logger.info("Saving non-layer tensors...")
    non_layer_tensors = copy_non_layer_tensors(base_model_dir, cast_dtype_str)
    non_layer_file = "model-nonlayers.safetensors"
    save_file(non_layer_tensors, str(out_dir / non_layer_file))
    out_weight_map: Dict[str, str] = {
        k: non_layer_file for k in non_layer_tensors.keys()
    }
    del non_layer_tensors
    if low_memory:
        gc.collect()

    logger.info("Building layers...")
    for tgt_idx, (src_tag, src_idx) in tqdm(
        list(enumerate(plan)), dynamic_ncols=True
    ):
        src_dir = base_model_dir if src_tag == "base" else moe_model_dir
        src_map = base_map if src_tag == "base" else moe_map

        src_keys = list_layer_keys(src_map, src_idx)
        if not src_keys:
            raise RuntimeError(
                f"No layer keys found for {src_tag} layer {src_idx}"
            )
        tensors = load_tensors_by_keys(
            src_dir, src_map, src_keys, cast_dtype_str
        )
        renamed = rename_layer_keys(tensors, src_idx, tgt_idx)

        layer_fname = f"model-layer-{tgt_idx:05d}.safetensors"
        save_file(renamed, str(out_dir / layer_fname))
        for k in renamed.keys():
            out_weight_map[k] = layer_fname

        del tensors, renamed
        if low_memory:
            gc.collect()

    total_size = 0
    for fname in set(out_weight_map.values()):
        fp = out_dir / fname
        if fp.exists():
            total_size += fp.stat().st_size

    index = {
        "metadata": {"total_size": total_size, "format": "safetensors"},
        "weight_map": out_weight_map,
    }
    write_json(out_dir / "model.safetensors.index.json", index)
    logger.info(
        f"Composite model written to {out_dir} with {final_layers} layers."
    )


def validate_model_load(model_dir: str):
    try:
        from transformers import AutoModelForCausalLM

        _ = AutoModelForCausalLM.from_pretrained(
            model_dir,
            torch_dtype="auto",
            trust_remote_code=True,
            device_map="meta",
        )
        logger.info("Model loads successfully on meta device.")
    except Exception as e:
        logger.error("Model failed to load on meta device.")
        raise


def main():
    parser = argparse.ArgumentParser(
        description=(
            "Convert MoE model to dense format with target compatibility, "
            "and/or build a larger composite model by interleaving layers."
        )
    )

    parser.add_argument("--model_id", type=str, help="MoE model ID or path")
    parser.add_argument(
        "--target_model",
        type=str,
        help="Target dense model for dimension matching",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="Path to save the converted dense model",
    )
    parser.add_argument(
        "--method",
        type=str,
        choices=["concat_experts", "average", "first"],
        default="average",
        help="Expert merge method",
    )
    parser.add_argument(
        "--low_memory",
        action="store_true",
        help="Low memory conversion",
    )
    parser.add_argument(
        "--test_merge",
        action="store_true",
        help="Write a sample merge config",
    )

    parser.add_argument(
        "--compose_interleaved",
        action="store_true",
        help="Build composite model by interleaving layers",
    )
    parser.add_argument(
        "--base_model", type=str, help="Base dense model path or HF ID"
    )
    parser.add_argument(
        "--moe_converted", type=str, help="Converted MoE-dense model dir"
    )
    parser.add_argument(
        "--composite_output_path",
        type=str,
        help="Output path for composite model",
    )
    parser.add_argument(
        "--final_layers",
        type=int,
        help="Number of transformer layers in composite",
    )
    parser.add_argument(
        "--interleave_strategy",
        type=str,
        choices=["even", "alternate"],
        default="even",
        help="Interleaving strategy",
    )
    parser.add_argument(
        "--cast_dtype",
        type=str,
        choices=["float32", "fp32", "float16", "fp16", "bfloat16", "bf16"],
        help="Optional cast during composite build",
    )

    parser.add_argument(
        "--validate_model",
        type=str,
        help="Validate a model directory on meta device",
    )

    args = parser.parse_args()

    if args.validate_model:
        validate_model_load(args.validate_model)
        return

    if args.compose_interleaved:
        if not args.base_model or not args.moe_converted:
            raise ValueError(
                "--compose_interleaved requires --base_model and "
                "--moe_converted"
            )
        base_dir = args.base_model
        if not os.path.exists(base_dir):
            logger.info(f"Downloading base model {base_dir}...")
            base_dir = snapshot_download(
                base_dir,
                cache_dir="./model_cache",
                resume_download=True,
            )
        moe_dir = args.moe_converted
        if not os.path.exists(moe_dir):
            raise FileNotFoundError(
                f"--moe_converted path not found: {moe_dir}"
            )
        out_dir = args.composite_output_path or "./composite_interleaved"
        if not args.final_layers:
            base_cfg = read_json(os.path.join(base_dir, "config.json"))
            moe_cfg = read_json(os.path.join(moe_dir, "config.json"))
            args.final_layers = int(base_cfg["num_hidden_layers"]) + int(
                moe_cfg["num_hidden_layers"]
            )
        build_composite_model(
            base_model_dir=base_dir,
            moe_model_dir=moe_dir,
            output_dir=out_dir,
            final_layers=int(args.final_layers),
            interleave_strategy=args.interleave_strategy,
            cast_dtype_str=args.cast_dtype,
            low_memory=args.low_memory,
        )
        logger.info(
            f"Composite interleaving complete. Output: {out_dir}"
        )
        return

    if not args.model_id or not args.target_model or not args.output_path:
        parser.error(
            "Conversion mode requires --model_id, --target_model, "
            "and --output_path"
        )

    converter = MoEToDenseConverter(
        model_path=args.model_id,
        target_model_path=args.target_model,
        output_path=args.output_path,
        method=args.method,
        low_memory=args.low_memory,
    )

    output_path = converter.convert()

    if args.test_merge:
        merge_config = {
            "models": [
                {"model": args.target_model, "parameters": {"weight": 0.7}},
                {"model": str(output_path), "parameters": {"weight": 0.3}},
            ],
            "merge_method": "linear",
            "base_model": args.target_model,
            "dtype": "bfloat16",
        }
        merge_config_path = Path(args.output_path).parent / "merge_config.yaml"
        try:
            import yaml

            with open(merge_config_path, "w") as f:
                yaml.dump(merge_config, f)
            logger.info(
                f"Wrote sample merge configuration to {merge_config_path}"
            )
        except Exception as e:
            logger.warning(
                f"Could not write sample merge YAML: {e}"
            )


if __name__ == "__main__":
    main()
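
# Example invocations (sketch only; model IDs and output paths below are
# placeholders, not values used in this repo):
#
#   # Convert an MoE checkpoint into a dense-shaped checkpoint:
#   python moe_to_dense.py --model_id <moe-model-id> \
#       --target_model <dense-model-id> --output_path ./moe_dense \
#       --method average --low_memory
#
#   # Interleave the converted MoE-dense layers with a base dense model:
#   python moe_to_dense.py --compose_interleaved --base_model <dense-model-id> \
#       --moe_converted ./moe_dense --composite_output_path ./composite \
#       --interleave_strategy even --cast_dtype bf16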
sample.txt ADDED
@@ -0,0 +1,65 @@
You are a helpful assistant. Respond to the next user message in a single paragraph.
Explain why the sky appears blue using a two-sentence summary.
List three practical uses of hash maps and one limitation.
Summarize how gradient descent works without equations.
Give a neutral definition of reinforcement learning and a common pitfall.
Describe the difference between latency and throughput with a concrete example.
Explain what a vector database is and when you would not use one.
In one paragraph, compare JSON and Parquet from a data engineering perspective.
Outline the steps to reproduce a minimal bug report for a Python library.
Provide three strategies to reduce overfitting in neural networks.
Write a short product description for a reusable water bottle aimed at travelers.
Translate this to Spanish: The library opens at nine and closes at six.
Paraphrase this sentence: The results were surprising but not conclusive.
Convert the following bullet list into a single coherent paragraph: speed, safety, cost.
Explain what a checksum is and why it matters for file downloads.
State the pros and cons of using WebAssembly in the browser.
Explain CAP theorem in one paragraph and give a real-world trade-off.
Outline a basic incident response checklist for a small engineering team.
Describe how HTTP caching works and why ETags are useful.
Explain the purpose of unit tests versus integration tests.
Give a concise explanation of SIMD and where it helps.
Describe a reliable backup strategy for a personal laptop.
In two sentences, explain how public key cryptography enables secure messaging.
Explain the difference between imperative and declarative programming with an example.
Write a short release note for a minor version update of a CLI tool.
Explain what cosine similarity measures and where it’s used.
Provide a brief, friendly onboarding message for a new community member.
Give a two-sentence description of how transformers use attention.
Describe what a memory leak is and how to spot one.
Explain why idempotency keys are important in payment APIs.
Summarize the key ideas behind zero-copy I/O.
Define a feature flag and explain one safe rollout pattern.
Write a short announcement for scheduled maintenance with expected impact.
Explain what a bloom filter is and when false positives are acceptable.
Give a checklist for code review that fits on a sticky note.
Describe a resilient way to schedule background jobs in a web app.
Explain the concept of backpressure in streaming systems.
Compare columnar vs row-oriented storage in one paragraph.
Explain how pagination strategies differ between offset and cursor methods.
Write a brief description of a dataset card for a public corpus of recipes.
Explain what a content-addressable store is and why it’s powerful.
In one paragraph, describe how to interpret a confusion matrix.
Give guidance for writing good commit messages with examples.
Describe the trade-offs of strongly typed schemas versus schema-on-read.
Explain what vector quantization is in plain language.
Provide a short primer on HTTP/2 multiplexing and head-of-line blocking.
Explain what a rolling hash is and where it’s useful.
Write a brief warning about common pitfalls when using floating point numbers.
Describe how JWTs work and one reason to rotate signing keys.
Explain the difference between top-k and nucleus sampling in text generation.
Provide a simple migration plan from REST to gRPC for a single service.
Give a one-paragraph overview of entropy as used in information theory.
Explain how learned positional encodings differ from rotary encodings at a high level.
Write a concise guide to choosing batch size under a fixed memory budget.
Explain what an embedding dimension means and why larger isn’t always better.
Describe a safe pattern for storing API secrets in a deployment pipeline.
Provide a two-sentence overview of LoRA fine-tuning and its benefits.
Explain the idea of teacher forcing in sequence models and a downside.
Write a short FAQ entry: “Why are my generations repetitive?”
Describe how to evaluate a summarization system without human raters.
Explain the difference between deterministic and stochastic decoding.
Provide a brief note on choosing between FP16 and BF16 on modern GPUs.
Write a compact introduction to beam search and its main trade-offs.
Give a minimal example of a retry policy with exponential backoff described in words.
Explain why logging PII can create compliance risks and how to avoid it.
scales.json ADDED
@@ -0,0 +1,8 @@
{
  "9": { "attn_q": 0.96, "attn_k": 0.96, "mlp_down": 0.98 },
  "11": { "attn_q": 0.90, "attn_k": 0.90 },
  "13": { "attn_q": 0.90, "attn_k": 0.90 },
  "25": { "attn_q": 0.92, "attn_k": 0.92 },
  "30": { "attn_q": 0.88, "attn_k": 0.88, "mlp_down": 0.95 },
  "34": { "attn_q": 0.90, "attn_k": 0.90 }
}
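
scales.json maps layer indices to per-projection scale factors. A minimal sketch of how such factors could be applied to a checkpoint's state dict follows; the mapping of attn_q/attn_k/mlp_down onto self_attn.q_proj, self_attn.k_proj, and mlp.down_proj weights is an assumption here, since the script that consumes this file (presumably layer_surgery.py) is not shown in this excerpt.

# Hypothetical consumer of scales.json: multiply selected projection weights
# by their per-layer factor. The short-name -> module mapping below is an
# assumption, not taken from this repo's code.
import json

def apply_scales(state_dict, scales_path="scales.json"):
    with open(scales_path) as f:
        scales = json.load(f)
    name_map = {
        "attn_q": "self_attn.q_proj",
        "attn_k": "self_attn.k_proj",
        "mlp_down": "mlp.down_proj",
    }
    for layer_idx, factors in scales.items():
        for short_name, factor in factors.items():
            key = f"model.layers.{layer_idx}.{name_map[short_name]}.weight"
            if key in state_dict:
                state_dict[key] = state_dict[key] * factor
    return state_dict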
visualize_activations.py ADDED
@@ -0,0 +1,467 @@
#!/usr/bin/env python3
"""
Visualize activation statistics JSON produced by activation_stats.py.

Generates an interactive HTML dashboard with:
- Token RMS mean across layers for attention q/k/v/o and MLP up/gate/down
- Zero fraction heatmap per layer and module type
- Attention entropy per layer (if present)
- Top-N modules by token_rms_mean and zero_fraction
"""
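
# Example usage (sketch; the file names are placeholders):
#   python visualize_activations.py --stats_json activation_stats.json \
#       --out_html dashboard.html --out_csv summary.csv --top_k 20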

import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple

import plotly.graph_objects as go

try:
    import pandas as pd
except Exception:
    pd = None


@dataclass
class ModuleStat:
    name: str
    layer: Optional[int]
    mtype: str
    token_rms_mean: Optional[float]
    token_rms_std: Optional[float]
    mean: Optional[float]
    std: Optional[float]
    min: Optional[float]
    max: Optional[float]
    zero_frac: Optional[float]
    count: int
    nan_count: int
    inf_count: int


ATTN_TYPES = ["q_proj", "k_proj", "v_proj", "o_proj"]
MLP_TYPES = ["mlp.up_proj", "mlp.gate_proj", "mlp.down_proj"]
NORM_HINTS = ["layernorm", ".norm"]


def parse_layer_idx(name: str) -> Optional[int]:
    m = re.search(r"model\.layers\.(\d+)\.", name)
    if m:
        try:
            return int(m.group(1))
        except Exception:
            return None
    return None


def infer_type(name: str) -> str:
    lname = name.lower()
    for t in ATTN_TYPES:
        if f".self_attn.{t}" in lname:
            return t
    for t in MLP_TYPES:
        if t in lname:
            return t
    for h in NORM_HINTS:
        if h in lname:
            return "norm"
    return "other"


def try_float(x: Any) -> Optional[float]:
    try:
        if x is None:
            return None
        return float(x)
    except Exception:
        return None


def load_stats_json(path: str) -> Tuple[List[ModuleStat], Dict[int, float]]:
    with open(path, "r") as f:
        data = json.load(f)

    rows: List[ModuleStat] = []
    attn_entropy: Dict[int, float] = {}
    if "_attention_entropy" in data and isinstance(
        data["_attention_entropy"], dict
    ):
        for k, v in data["_attention_entropy"].items():
            try:
                attn_entropy[int(k)] = float(v)
            except Exception:
                continue

    for name, entry in data.items():
        if name.startswith("_"):
            continue
        if not isinstance(entry, dict):
            continue
        g = entry.get("global", {})
        tr = entry.get("token_rms", {})

        count = int(g.get("count", 0) or 0)
        zero_count = int(g.get("zero_count", 0) or 0)
        zero_frac = (zero_count / count) if count > 0 else None

        rows.append(
            ModuleStat(
                name=name,
                layer=parse_layer_idx(name),
                mtype=infer_type(name),
                token_rms_mean=try_float(tr.get("mean")),
                token_rms_std=try_float(tr.get("std")),
                mean=try_float(g.get("mean")),
                std=try_float(g.get("std")),
                min=try_float(g.get("min")),
                max=try_float(g.get("max")),
                zero_frac=zero_frac,
                count=count,
                nan_count=int(g.get("nan_count", 0) or 0),
                inf_count=int(g.get("inf_count", 0) or 0),
            )
        )

    return rows, attn_entropy
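
# Expected input shape, as inferred from the parsing above (all values are
# illustrative placeholders):
#   {
#     "model.layers.0.self_attn.q_proj": {
#       "global": {"count": ..., "zero_count": ..., "mean": ..., "std": ...,
#                  "min": ..., "max": ..., "nan_count": ..., "inf_count": ...},
#       "token_rms": {"mean": ..., "std": ...}
#     },
#     ...,
#     "_attention_entropy": {"0": <float>, "1": <float>, ...}
#   }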


def filter_rows(
    rows: List[ModuleStat],
    allowed_types: Optional[List[str]],
    layer_min: Optional[int],
    layer_max: Optional[int],
) -> List[ModuleStat]:
    out: List[ModuleStat] = []
    for r in rows:
        if allowed_types and (r.mtype not in allowed_types):
            continue
        if r.layer is not None:
            if layer_min is not None and r.layer < layer_min:
                continue
            if layer_max is not None and r.layer > layer_max:
                continue
        out.append(r)
    return out


def group_by_layer_type(
    rows: List[ModuleStat], types: List[str]
) -> Dict[str, Dict[int, ModuleStat]]:
    d: Dict[str, Dict[int, ModuleStat]] = {t: {} for t in types}
    for r in rows:
        if r.layer is None:
            continue
        if r.mtype in d and r.layer not in d[r.mtype]:
            d[r.mtype][r.layer] = r
    return d


def make_sorted_layers(mapping: Dict[int, Any]) -> List[int]:
    return sorted(list(mapping.keys()))


def fig_token_rms_lines(
    rows: List[ModuleStat],
    types: List[str],
    title: str,
    y_field: str = "token_rms_mean",
) -> go.Figure:
    grouped = group_by_layer_type(rows, types)
    fig = go.Figure()
    for t in types:
        lay2stat = grouped.get(t, {})
        if not lay2stat:
            continue
        layers = make_sorted_layers(lay2stat)
        ys = [
            getattr(lay2stat[L], y_field) if lay2stat[L] else None
            for L in layers
        ]
        fig.add_trace(
            go.Scatter(
                x=layers,
                y=ys,
                mode="lines+markers",
                name=t,
                connectgaps=False,
            )
        )
    fig.update_layout(
        title=title,
        xaxis_title="Layer",
        yaxis_title=y_field,
        template="plotly_white",
        legend_title="Module type",
    )
    return fig


def fig_zero_frac_heatmap(rows: List[ModuleStat], types: List[str]) -> go.Figure:
    grouped = group_by_layer_type(rows, types)
    all_layers = sorted(
        list({L for t in types for L in grouped.get(t, {}).keys()})
    )
    if not all_layers:
        return go.Figure()
    z = []
    for t in types:
        lay2stat = grouped.get(t, {})
        row = []
        for L in all_layers:
            s = lay2stat.get(L)
            row.append(s.zero_frac if s else None)
        z.append(row)

    fig = go.Figure(
        data=go.Heatmap(
            z=z,
            x=all_layers,
            y=types,
            colorscale="Viridis",
            colorbar_title="zero_fraction",
        )
    )
    fig.update_layout(
        title="Zero fraction heatmap by layer and module type",
        xaxis_title="Layer",
        yaxis_title="Module type",
        template="plotly_white",
    )
    return fig


def fig_attention_entropy(entropy: Dict[int, float]) -> go.Figure:
    if not entropy:
        return go.Figure()
    layers = sorted(entropy.keys())
    vals = [entropy[L] for L in layers]
    fig = go.Figure(
        data=go.Scatter(
            x=layers, y=vals, mode="lines+markers", name="attn_entropy"
        )
    )
    fig.update_layout(
        title="Attention entropy (mean per layer)",
        xaxis_title="Layer",
        yaxis_title="Entropy",
        template="plotly_white",
    )
    return fig


def top_k_bar(
    rows: List[ModuleStat],
    field: str,
    title: str,
    top_k: int = 20,
    reverse: bool = True,
) -> go.Figure:
    vals: List[Tuple[str, float]] = []
    for r in rows:
        v = getattr(r, field)
        if v is None:
            continue
        vals.append((r.name, float(v)))
    if not vals:
        return go.Figure()
    vals.sort(key=lambda x: x[1], reverse=reverse)
    vals = vals[:top_k]
    names = [v[0] for v in vals]
    ys = [v[1] for v in vals]
    fig = go.Figure(
        data=go.Bar(
            x=ys,
            y=names,
            orientation="h",
            marker_color="steelblue",
            name=field,
        )
    )
    fig.update_layout(
        title=title,
        xaxis_title=field,
        yaxis_title="Module",
        template="plotly_white",
        margin=dict(l=200),
    )
    return fig


def make_dashboard(
    attn_rows: List[ModuleStat],
    mlp_rows: List[ModuleStat],
    all_rows: List[ModuleStat],
    attn_entropy: Dict[int, float],
    top_k: int,
) -> str:
    figs: List[go.Figure] = []

    figs.append(
        fig_token_rms_lines(
            attn_rows, ATTN_TYPES, "Attention Token RMS mean by layer"
        )
    )
    figs.append(
        fig_token_rms_lines(
            mlp_rows, MLP_TYPES, "MLP Token RMS mean by layer"
        )
    )

    figs.append(fig_zero_frac_heatmap(attn_rows, ATTN_TYPES))
    figs.append(fig_zero_frac_heatmap(mlp_rows, MLP_TYPES))

    if attn_entropy:
        figs.append(fig_attention_entropy(attn_entropy))

    figs.append(
        top_k_bar(
            all_rows,
            "token_rms_mean",
            f"Top {top_k} modules by token_rms_mean",
            top_k=top_k,
        )
    )
    figs.append(
        top_k_bar(
            all_rows,
            "zero_frac",
            f"Top {top_k} modules by zero_fraction",
            top_k=top_k,
        )
    )

    parts = []
    for i, fig in enumerate(figs):
        parts.append(
            fig.to_html(
                full_html=False,
                include_plotlyjs="cdn",
                default_width="100%",
                default_height="600px",
            )
        )

    html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <title>Activation Statistics Dashboard</title>
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <style>
    body {{
      font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
        "Helvetica Neue", Arial, "Noto Sans", "Liberation Sans",
        sans-serif;
      margin: 0;
      padding: 0 16px 64px 16px;
      background: #ffffff;
      color: #111;
    }}
    h1 {{
      font-size: 22px;
      font-weight: 600;
      margin-top: 16px;
    }}
    .fig {{
      margin: 24px 0;
      border: 1px solid #eee;
      padding: 8px;
      border-radius: 8px;
      box-shadow: 0 1px 0 rgba(0,0,0,0.04);
    }}
  </style>
</head>
<body>
  <h1>Activation Statistics Dashboard</h1>
  <div class="fig">{parts[0] if len(parts) > 0 else ""}</div>
  <div class="fig">{parts[1] if len(parts) > 1 else ""}</div>
  <div class="fig">{parts[2] if len(parts) > 2 else ""}</div>
  <div class="fig">{parts[3] if len(parts) > 3 else ""}</div>
  <div class="fig">{parts[4] if len(parts) > 4 else ""}</div>
  <div class="fig">{parts[5] if len(parts) > 5 else ""}</div>
  <div class="fig">{parts[6] if len(parts) > 6 else ""}</div>
</body>
</html>
    """
    return html


def main():
    ap = argparse.ArgumentParser(
        description="Visualize activation stats JSON to interactive HTML."
    )
    ap.add_argument("--stats_json", type=str, required=True)
    ap.add_argument("--out_html", type=str, required=True)
    ap.add_argument("--out_csv", type=str)
    ap.add_argument("--types", type=str, default=None)
    ap.add_argument("--layer_min", type=int, default=None)
    ap.add_argument("--layer_max", type=int, default=None)
    ap.add_argument("--top_k", type=int, default=20)
    args = ap.parse_args()

    rows, attn_entropy = load_stats_json(args.stats_json)

    allowed_types = None
    if args.types:
        allowed_types = [t.strip() for t in args.types.split(",") if t.strip()]

    attn_rows = filter_rows(
        rows,
        allowed_types or ATTN_TYPES,
        args.layer_min,
        args.layer_max,
    )
    attn_rows = [r for r in attn_rows if r.mtype in ATTN_TYPES]

    mlp_rows = filter_rows(
        rows,
        allowed_types or MLP_TYPES,
        args.layer_min,
        args.layer_max,
    )
    mlp_rows = [r for r in mlp_rows if r.mtype in MLP_TYPES]

    all_rows = filter_rows(rows, allowed_types, args.layer_min, args.layer_max)

    html = make_dashboard(attn_rows, mlp_rows, all_rows, attn_entropy, args.top_k)
    out_html = Path(args.out_html)
    out_html.parent.mkdir(parents=True, exist_ok=True)
    with open(out_html, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"Wrote HTML dashboard to: {out_html}")

    if args.out_csv:
        if pd is None:
            print("pandas not available; CSV not written.")
        else:
            df = pd.DataFrame(
                [
                    {
                        "name": r.name,
                        "layer": r.layer,
                        "type": r.mtype,
                        "token_rms_mean": r.token_rms_mean,
                        "token_rms_std": r.token_rms_std,
                        "mean": r.mean,
                        "std": r.std,
                        "min": r.min,
                        "max": r.max,
                        "zero_frac": r.zero_frac,
                        "count": r.count,
                        "nan_count": r.nan_count,
                        "inf_count": r.inf_count,
                    }
                    for r in all_rows
                ]
            )
            out_csv = Path(args.out_csv)
            out_csv.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(out_csv, index=False)
            print(f"Wrote CSV summary to: {out_csv}")


if __name__ == "__main__":
    main()