import os
import torch
from datasets import load_from_disk, concatenate_datasets, Dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from peft.tuners.lora import LoraLayer
from trl import SFTTrainer, SFTConfig
import logging
import torch.distributed as dist
from datetime import timedelta, datetime
import time
from transformers.trainer import TrainerCallback
import gc
import sys
import shutil  # For handling file operations
import glob  # For file pattern matching
import threading  # For background cleanup
import multiprocessing
import subprocess
import tempfile
import json
import random
import math
import queue
import numpy as np

# Import the specific layer class for FSDP wrapping
try:
    from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
except ImportError:
    logging.warning("Could not import Qwen2DecoderLayer. FSDP wrapping might fail.")
    Qwen2DecoderLayer = None

# Configure more detailed logging with timestamps
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stdout,  # Ensure logs go to stdout for immediate visibility
    force=True
)

# Set up temporary directory for cache files
temp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "temp")
os.makedirs(temp_dir, exist_ok=True)
logging.info(f"Using temporary directory: {temp_dir}")

# Set environment variables to control temporary file creation
os.environ["TMPDIR"] = temp_dir  # Unix
os.environ["TEMP"] = temp_dir    # Windows
os.environ["TMP"] = temp_dir     # Windows alternative

# Set default cache locations
hf_datasets_cache_path = os.path.join(temp_dir, "hf_datasets_cache")
transformers_cache_path = os.path.join(temp_dir, "transformers_cache")
hf_home_path = os.path.join(temp_dir, "hf_home")
os.makedirs(hf_datasets_cache_path, exist_ok=True)
os.makedirs(transformers_cache_path, exist_ok=True)
os.makedirs(hf_home_path, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = hf_datasets_cache_path
os.environ["TRANSFORMERS_CACHE"] = transformers_cache_path
os.environ["HF_HOME"] = hf_home_path
logging.info(f"Hugging Face Datasets cache directed to: {hf_datasets_cache_path}")
logging.info(f"Hugging Face Transformers cache directed to: {transformers_cache_path}")

# Keep forcing Arrow to use system memory pool if possible
os.environ["ARROW_DEFAULT_MEMORY_POOL"] = "system"
logging.info("Configured temporary directory and cache locations.")

# Set environment variable to control PyTorch's memory allocator
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"

# Disable PYTORCH_NO_CUDA_MEMORY_CACHING for better performance
if "PYTORCH_NO_CUDA_MEMORY_CACHING" in os.environ:
    del os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"]

# Set a longer timeout for NCCL operations
os.environ["NCCL_BLOCKING_WAIT"] = "1"
os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
os.environ["NCCL_TIMEOUT"] = "3600"  # 1 hour timeout for NCCL operations


# Initialize distributed environment with better error handling
def init_distributed():
    try:
        # Check if we're in a distributed training environment
        if "WORLD_SIZE" in os.environ and int(os.environ["WORLD_SIZE"]) > 1:
            # Set memory optimization environment variables
            if int(os.environ.get("LOCAL_RANK", 0)) == 0:
                logging.info("Setting PyTorch memory optimizations for H200 GPUs")

            # Empty CUDA cache before initializing process group
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                logging.info("CUDA cache cleared")

            local_rank = int(os.environ.get("LOCAL_RANK", 0))
            world_size = int(os.environ.get("WORLD_SIZE", 1))
            rank = int(os.environ.get("RANK", 0))
            logging.info(f"Initializing distributed training for 8x H200s. Rank: {rank}, Local Rank: {local_rank}, World Size: {world_size}")

            # Set the device for this process explicitly before initializing
            torch.cuda.set_device(local_rank)
            logging.info(f"Setting device {local_rank} for process rank {rank}")

            # Set a longer timeout to handle long operations (3 hours)
            timeout = timedelta(hours=3)

            # Initialize the distributed process group
            dist.init_process_group(
                backend='nccl',
                init_method='env://',
                timeout=timeout,
                rank=rank,
                world_size=world_size
            )

            # Verify initialization was successful
            if dist.is_initialized():
                logging.info(f"Successfully initialized distributed process group. Rank: {rank}, Device: {torch.cuda.current_device()}")
                # Log NCCL environment
                logging.info(f"NCCL Version: {torch.cuda.nccl.version() if hasattr(torch.cuda, 'nccl') else 'unknown'}")
                logging.info(f"CUDA Device Count: {torch.cuda.device_count()}")
                logging.info(f"CUDA Device Name: {torch.cuda.get_device_name(local_rank)}")
            else:
                logging.error(f"Failed to initialize distributed process group. Rank: {rank}")

            # Ensure all processes can communicate with specified device
            try:
                device_ids = [local_rank]
                dist.barrier(device_ids=device_ids)
                logging.info(f"Communication test successful. Process {rank} on device {local_rank} can communicate.")
            except Exception as e:
                logging.error(f"Communication test failed. Processes cannot communicate: {str(e)}. Rank: {rank}")
                raise

            return True
        else:
            logging.info("Not running in distributed mode.")
            return False
    except Exception as e:
        logging.error(f"Error initializing distributed environment: {str(e)}")
        raise


# Initialize distributed environment
distributed_mode = init_distributed()

# --- Configuration ---
# Model ID updated based on user input
MODEL_ID = "Qwen/QwQ-32B"
# Path to the processed dataset created by preprocess_data.py
DATASET_PATH = "./processed_datasets/combined_code_finetune_data"
# Number of examples to use (set to -1 for all)
MAX_EXAMPLES = -1  # Use all examples by default

# LoRA configuration (Optimized for 8x H200 GPUs)
LORA_R = 64          # Doubled to increase parameter count significantly
LORA_ALPHA = 128     # Increased alpha to match r
LORA_DROPOUT = 0.05  # Dropout probability for LoRA layers
# Target modules might need verification for QwQ-32B specifically.
# Common targets for Qwen models:
LORA_TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    # "embed_tokens",  # Removed to reduce overhead/complexity
    # "lm_head",       # Removed to reduce overhead/complexity
]

# Training arguments optimized for 8x H200 GPUs with memory constraints
OUTPUT_DIR = "./qwq-32b-finetuned-adapters"
PER_DEVICE_TRAIN_BATCH_SIZE = 8  # Increase BS after halving seq length again
GRADIENT_ACCUMULATION_STEPS = 6  # Decrease accumulation (8*8*6 = 384)
# Global batch size = PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS * NumGPUs
# Example: 8 * 6 * 8 = 384
LEARNING_RATE = 3e-5   # Slightly higher LR for larger batch size
EPOCHS = 1             # Start with 1 epoch, increase cautiously
MAX_SEQ_LENGTH = 4096  # Halved sequence length again
LOGGING_STEPS = 50     # Increased logging frequency
SAVE_STEPS = 500       # Increased save frequency
OPTIMIZER = "adamw_bnb_8bit"  # Use 8-bit optimizer to save significant memory
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = "cosine"

# H200-specific optimizations (8x setup)
USE_FLASH_ATTN = True           # Enable Flash Attention 2 for H200s
USE_SEQUENCE_PARALLEL = False   # Disable when using FSDP
USE_BETTER_TRANSFORMERS = True  # Use better transformers for optimized kernels
DATALOADER_NUM_WORKERS = 8      # Reduced workers to avoid CPU contention
TOKENIZATION_NUM_WORKERS = 224  # Maximum worker count for tokenization
USE_ACTIVATION_CHECKPOINTING = True  # Enable activation checkpointing to save memory with long sequences

# Advanced distributed training options for 8x GPUs
USE_FSDP = True  # Enable FSDP
FSDP_CONFIG = {
    "fsdp_offload_params": False,  # Disable CPU Offload
    "fsdp_sharding_strategy": 1,   # 1 = FULL_SHARD
    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "fsdp_transformer_layer_cls_to_wrap": [Qwen2DecoderLayer.__name__] if Qwen2DecoderLayer else [],
    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
    "fsdp_backward_prefetch": "backward_post",  # Changed from backward_pre
    "fsdp_forward_prefetch": False,             # Disabled forward prefetch
    "fsdp_activation_checkpointing": [Qwen2DecoderLayer.__name__] if Qwen2DecoderLayer else [],  # Use FSDP activation checkpointing
}

# WandB Integration
REPORT_TO_WANDB = True  # Set to False to disable WandB reporting
WANDB_PROJECT_NAME = "QwQ-32B-Finetune-8xH200"  # Updated for 8x GPUs
WANDB_ENTITY = None  # Set to your username or team name if needed

# Determine report_to destination
report_to = "none"
if REPORT_TO_WANDB:
    # Disable WandB in all processes except rank 0 in distributed mode
    if distributed_mode and int(os.environ.get("LOCAL_RANK", 0)) != 0:
        logging.info(f"Rank {os.environ.get('RANK', '?')}: Disabling WandB")
        os.environ["WANDB_DISABLED"] = "true"
        report_to = "none"  # Explicitly set to none for non-main processes
    else:
        # Main process or non-distributed mode, attempt WandB initialization
        try:
            import wandb
            logging.info("Initializing WandB directly...")
            run_name = f"qwq-32b-finetune-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            if wandb.run is None:
                try:
                    wandb.init(
                        project=WANDB_PROJECT_NAME,
                        entity=WANDB_ENTITY,
                        name=run_name,
                        config={
                            "model_name": MODEL_ID,
                            "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
                            "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
                            "learning_rate": LEARNING_RATE,
                            "epochs": EPOCHS,
                            "sequence_length": MAX_SEQ_LENGTH,
                            "lora_r": LORA_R,
                            "lora_alpha": LORA_ALPHA,
                        }
                    )
                    logging.info(f"WandB initialized: {wandb.run.name} (ID: {wandb.run.id})")
                    report_to = "wandb"
                except Exception as e:
                    logging.error(f"WandB initialization error: {str(e)}")
                    report_to = "tensorboard"
            else:
                logging.info(f"Using existing WandB run: {wandb.run.name} (ID: {wandb.run.id})")
                report_to = "wandb"
        except ImportError:
            logging.warning("WandB package not installed. Reporting to TensorBoard.")
            report_to = "tensorboard"
        except Exception as wandb_init_e:
            logging.error(f"General WandB setup error: {wandb_init_e}")
            report_to = "tensorboard"
# If WandB reporting is disabled, set report_to accordingly
elif not distributed_mode:
    report_to = "tensorboard"
    logging.info("WandB reporting disabled. Reporting to TensorBoard.")
else:
    # If WandB is disabled and it IS distributed
    report_to = "none"
    logging.info("WandB reporting disabled for this distributed rank.")

# Quantization (QLoRA)
USE_4BIT_QUANTIZATION = False  # Disable QLoRA due to FSDP incompatibility
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # Use bfloat16 if supported, else float16
BNB_4BIT_QUANT_TYPE = "nf4"

# --- Check Optional Dependencies (Define flags globally) ---
FLASH_ATTN_AVAILABLE = False
BETTER_TRANSFORMERS_AVAILABLE = False
try:
    import flash_attn
    FLASH_ATTN_AVAILABLE = True
    logging.info("Flash Attention available - will be used if enabled.")
except ImportError:
    logging.warning("Flash Attention not available. Install with 'pip install flash-attn'")

try:
    from optimum.bettertransformer import BetterTransformer
    BETTER_TRANSFORMERS_AVAILABLE = True
    logging.info("Better Transformers available - will be used if enabled.")
except ImportError:
    logging.warning("Better Transformers not available. Install with 'pip install optimum'")

# --- Check Dataset ---
if not os.path.exists(DATASET_PATH):
    logging.error(f"Dataset not found at {DATASET_PATH}. Run preprocess_data.py first.")
    exit(1)

logging.info(f"Loading dataset from {DATASET_PATH}...")

# Load dataset normally
dataset = load_from_disk(DATASET_PATH)

# Apply truncation if needed
if MAX_EXAMPLES > 0 and len(dataset) > MAX_EXAMPLES:
    logging.info(f"Truncating dataset to {MAX_EXAMPLES} examples")
    indices = list(range(min(MAX_EXAMPLES, len(dataset))))
    dataset = dataset.select(indices)

logging.info(f"Dataset loaded: {dataset} with {len(dataset)} examples")

# --- Tokenizer ---
logging.info(f"Loading tokenizer for {MODEL_ID}...")

# Enable fast tokenizer and optimizations
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,  # Explicitly request the fast Rust-based tokenizer
    trust_remote_code=True,
    # model_max_length=MAX_SEQ_LENGTH,
    padding_side="right",
)

# Log tokenizer type for verification
if hasattr(tokenizer, 'is_fast') and tokenizer.is_fast:
    logging.info(f"Successfully loaded fast tokenizer (Rust implementation): {type(tokenizer).__name__}")
    # Fast tokenizers are automatically parallel in dataset.map() when num_proc > 1
    logging.info(f"Fast tokenizer will use parallel processing during dataset.map() with {TOKENIZATION_NUM_WORKERS} workers")
else:
    logging.warning(f"Using Python tokenizer: {type(tokenizer).__name__}")
    logging.warning("Python tokenizers are slower than Rust-based fast tokenizers")

# Check and set pad token based on Qwen documentation (<|endoftext|>)
# Qwen models might have this set correctly, but we verify.
EXPECTED_PAD_TOKEN = "<|endoftext|>"
if tokenizer.pad_token is None or tokenizer.pad_token != EXPECTED_PAD_TOKEN:
    logging.warning(f"Tokenizer pad_token is missing or not '{EXPECTED_PAD_TOKEN}'. Setting pad_token='{EXPECTED_PAD_TOKEN}'.")
    tokenizer.pad_token = EXPECTED_PAD_TOKEN

# Enable padding and truncation defaults for batch processing
tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
tokenizer.padding_side = "right"  # Typically "right" for decoder-only models like Qwen

# Log tokenizer configuration
logging.info(f"Tokenizer configuration:")
logging.info(f"  - Type: {'Fast' if hasattr(tokenizer, 'is_fast') and tokenizer.is_fast else 'Python'}")
logging.info(f"  - Pad token: {tokenizer.pad_token}")
logging.info(f"  - EOS token: {tokenizer.eos_token}")  # Should be <|im_end|>
logging.info(f"  - Vocab size: {tokenizer.vocab_size}")
logging.info(f"  - Model max length: {tokenizer.model_max_length}")
logging.info(f"  - Padding side: {tokenizer.padding_side}")


# Define parallel preprocessing function for the dataset
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors=None,  # Return Python lists for dataset
    )


# Create a cache directory for tokenized datasets
TOKENIZED_DATASET_CACHE_DIR = os.path.join(os.path.dirname(DATASET_PATH), "tokenized_cache")
os.makedirs(TOKENIZED_DATASET_CACHE_DIR, exist_ok=True)
tokenized_dataset_path = os.path.join(TOKENIZED_DATASET_CACHE_DIR, "tokenized_dataset")

# Create a file to signal tokenization completion
tokenization_done_file = os.path.join(TOKENIZED_DATASET_CACHE_DIR, "tokenization_complete")


# Function to clean up temporary files in dataset directory
def delete_existing_tmp_files():
    """Find and delete any existing tmp files in dataset directory"""
    # Look for tmp files in dataset directory
    tmp_files = glob.glob(os.path.join(DATASET_PATH, "tmp*"))
    if tmp_files:
        logging.info(f"Found {len(tmp_files)} existing tmp files, removing...")
        for tmp_file in tmp_files:
            try:
                if os.path.isdir(tmp_file):
                    shutil.rmtree(tmp_file)
                else:
                    os.remove(tmp_file)
                logging.info(f"Removed: {tmp_file}")
            except Exception as e:
                logging.warning(f"Could not remove {tmp_file}: {str(e)}")
    else:
        logging.info("No existing tmp files found")


# Check if we're in distributed mode and get rank
if distributed_mode:
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    is_main_process = rank == 0
    logging.info(f"Rank {rank}/{world_size}: Preparing for dataset processing")
else:
    is_main_process = True
    rank = 0
    world_size = 1
    local_rank = 0

# Clean up temp files - only on main process to avoid conflicts
if is_main_process:
    delete_existing_tmp_files()
    # Also remove the tokenization_done_file if it exists
    if os.path.exists(tokenization_done_file):
        os.remove(tokenization_done_file)
        logging.info(f"Rank {rank}: Removed old tokenization completion marker")

# Only tokenize on main process (rank 0) to avoid redundant work
need_tokenization = False

# Check if tokenized dataset already exists
if os.path.exists(tokenized_dataset_path) and os.path.isdir(tokenized_dataset_path):
    # --- Dataset Exists ---
    logging.info(f"Rank {rank}: Found existing tokenized dataset at {tokenized_dataset_path}")
    path_to_load = tokenized_dataset_path  # All ranks will load from the persistent path
    need_tokenization = False

    # Rank 0 ensures completion marker exists
    if is_main_process and not os.path.exists(tokenization_done_file):
        total_original_examples = "unknown"
        try:
            from datasets import load_dataset_builder  # Local import
            original_dataset_info = load_dataset_builder(DATASET_PATH).info
            total_original_examples = original_dataset_info.splits['train'].num_examples
        except Exception as info_e:
            logging.warning(f"Rank {rank}: Could not get original dataset info: {info_e}")

        try:
            # Get size of existing loaded dataset (approximate if needed)
            # This requires loading a small part or metadata, might be slow
            # For now, let's just mark it as existing
            # loaded_size = len(load_from_disk(tokenized_dataset_path, keep_in_memory=False))
            loaded_size = "unknown (loaded existing)"
            with open(tokenization_done_file, "w") as f:
                f.write(f"Tokenization assumed complete (loaded existing) at {datetime.now().isoformat()}\n")
                f.write(f"Processed {loaded_size} examples out of {total_original_examples}\n")
            logging.info(f"Rank {rank}: Created tokenization completion marker as it was missing.")
        except Exception as file_e:
            logging.error(f"Rank {rank}: Failed to create missing completion marker: {file_e}")
            # Proceeding anyway, but other ranks might hang if they rely solely on the file

    # Non-main ranks still need to wait for the marker to be sure Rank 0 checked/created it
    elif not is_main_process:
        logging.info(f"Rank {rank}: Waiting for main process confirmation via marker file...")
        max_wait_time = 300  # Shorter wait, just confirming file exists
        wait_start = time.time()
        while not os.path.exists(tokenization_done_file):
            if time.time() - wait_start > max_wait_time:
                logging.error(f"Rank {rank}: Timed out waiting for marker file from Rank 0.")
                raise TimeoutError("Marker file wait timeout")
            time.sleep(5)
        logging.info(f"Rank {rank}: Marker file found.")

elif is_main_process:
    # Tokenized dataset doesn't exist, Rank 0 needs to create it
    logging.info(f"Rank {rank}: Tokenization required. Proceeding with tokenization...")
    need_tokenization = True
    path_to_load = None

elif distributed_mode:
    # Tokenized dataset doesn't exist, non-main ranks need to wait
    logging.info(f"Rank {rank}: Tokenization required. Waiting for main process...")
    need_tokenization = True
    path_to_load = tokenized_dataset_path

# --- Perform Tokenization (if needed by Rank 0) ---
if need_tokenization and is_main_process:
    tokenized_dataset_obj = None  # Use a distinct name for the object returned by map
    try:
        # Process the dataset using dataset.map with internal parallelism
        start_time = time.time()  # Define start_time here

        # Standard tokenization with caching enabled
        logging.info(f"Rank {rank}: Starting tokenization using dataset.map with {TOKENIZATION_NUM_WORKERS} workers.")
        tokenized_dataset_obj = dataset.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            num_proc=TOKENIZATION_NUM_WORKERS,
            remove_columns=["text"],
            load_from_cache_file=True,  # Allow using cache file if it exists
            desc=f"Tokenizing dataset ({TOKENIZATION_NUM_WORKERS} workers)"
        )
        elapsed = time.time() - start_time
        logging.info(f"Rank {rank}: Tokenization successful in {elapsed:.2f} seconds.")

        # If tokenization was successful:
        if tokenized_dataset_obj is not None:
            logging.info(f"Rank {rank}: Dataset tokenization completed.")

            # Save directly to final path
            logging.info(f"Rank {rank}: Saving tokenized dataset to {tokenized_dataset_path}...")
            save_start = time.time()
            # Ensure target directory doesn't exist (needed for clean save)
            if os.path.exists(tokenized_dataset_path):
                shutil.rmtree(tokenized_dataset_path)
            tokenized_dataset_obj.save_to_disk(tokenized_dataset_path)
            save_elapsed = time.time() - save_start
            logging.info(f"Rank {rank}: Tokenized dataset saved in {save_elapsed:.2f} seconds.")

            # Create completion marker file ONLY after successful save
            with open(tokenization_done_file, "w") as f:
                f.write(f"Tokenization completed and saved at {datetime.now().isoformat()}\n")
            logging.info(f"Rank {rank}: Created tokenization completion marker")

            # Keep the result in memory for Rank 0 for immediate use
            dataset = tokenized_dataset_obj
            path_to_load = None  # Rank 0 uses the in-memory object directly
    except Exception as e:
        logging.error(f"Rank {rank}: Tokenization failed: {e}")
        import traceback
        logging.error(traceback.format_exc())
        # Create done file indicating failure
        with open(tokenization_done_file, "w") as f:
            f.write(f"Tokenization FAILED at {datetime.now().isoformat()}\nError: {e}")
        raise RuntimeError("Tokenization failed.") from e

# --- Load Dataset (All Ranks) ---
# This block now runs for all ranks *after* rank 0 has either tokenized or copied data
dataset_for_trainer = None  # Use a distinct variable name for clarity

if path_to_load:
    # If path_to_load is set (means rank 0 copied or non-main rank needs to load)
    if not is_main_process and need_tokenization:
        # Non-main ranks wait for the completion marker if tokenization was required
        logging.info(f"Rank {rank}: Waiting for tokenization completion signal from Rank 0...")
        while not os.path.exists(tokenization_done_file):
            time.sleep(5)
        logging.info(f"Rank {rank}: Tokenization completion signal received.")

    # All ranks with a path_to_load proceed to load
    logging.info(f"Rank {rank}: Loading dataset from {path_to_load}...")
    load_start_time = time.time()
    try:
        # Load without forcing into memory initially
        dataset_for_trainer = load_from_disk(path_to_load, keep_in_memory=False)
        load_elapsed = time.time() - load_start_time
        logging.info(f"Rank {rank}: Successfully loaded dataset in {load_elapsed:.2f}s. Length: {len(dataset_for_trainer)}")
    except Exception as e:
        logging.error(f"Rank {rank}: CRITICAL - Failed to load dataset from {path_to_load}: {e}")
        raise
elif is_main_process and not need_tokenization:
    # Rank 0 loaded existing, copied to RAM disk, and path_to_load points there
    # It still needs to load it for the trainer
    logging.info(f"Rank {rank}: Loading dataset from RAM disk copy {path_to_load}...")
    try:
        dataset_for_trainer = load_from_disk(path_to_load, keep_in_memory=False)
        logging.info(f"Rank {rank}: Successfully loaded dataset from RAM disk copy.")
    except Exception as e:
        logging.error(f"Rank {rank}: CRITICAL - Failed to load from RAM disk copy {path_to_load}: {e}")
        raise
elif is_main_process and need_tokenization:
    # Rank 0 just tokenized, 'dataset' variable already holds the result in memory
    logging.info(f"Rank {rank}: Using in-memory dataset from successful tokenization.")
    dataset_for_trainer = dataset  # Use the object directly
else:
    # Should not happen
    logging.error(f"Rank {rank}: Dataset path logic error. path_to_load='{path_to_load}', need_tokenization={need_tokenization}")
    raise RuntimeError("Dataset preparation failed - logic error.")

# At this point, 'dataset_for_trainer' on all ranks should hold the ready-to-use data.

# Synchronize processes after dataset is ready on all ranks
if distributed_mode:
    try:
        logging.info(f"Rank {rank}: Synchronizing after dataset preparation...")
        dist.barrier()
        logging.info(f"Rank {rank}: Synchronization complete.")
    except Exception as sync_e:
        logging.error(f"Rank {rank}: Synchronization after dataset prep failed: {sync_e}")
        raise


# --- Helper Function for Memory Check ---
def check_gpu_memory_utilization():
    """Check and report GPU memory utilization"""
    if not torch.cuda.is_available():
        logging.info("CUDA not available, skipping GPU memory check.")
        return 0  # Return 0 utilization if no GPU

    logging.info("==== GPU MEMORY UTILIZATION CHECK ====")
    total_allocated_gb = 0
    total_reserved_gb = 0
    total_capacity_gb = 0
    try:
        for i in range(torch.cuda.device_count()):
            free_mem, total_mem = torch.cuda.mem_get_info(i)
            allocated = torch.cuda.memory_allocated(i)
            reserved = torch.cuda.memory_reserved(i)
            free_gb = free_mem / (1024**3)
            total_gb = total_mem / (1024**3)
            allocated_gb = allocated / (1024**3)
            reserved_gb = reserved / (1024**3)
            utilized_pct = (1 - free_mem / total_mem) * 100 if total_mem > 0 else 0
            total_allocated_gb += allocated_gb
            total_reserved_gb += reserved_gb
            total_capacity_gb += total_gb
            logging.info(f"GPU {i}: Allocated {allocated_gb:.1f}GB, Reserved {reserved_gb:.1f}GB, "
                         f"Free {free_gb:.1f}GB, Total {total_gb:.1f}GB, "
                         f"Utilization: {utilized_pct:.1f}%")
        avg_utilization = (total_allocated_gb / total_capacity_gb) * 100 if total_capacity_gb > 0 else 0
        logging.info(f"OVERALL: Using {total_allocated_gb:.1f}GB / {total_capacity_gb:.1f}GB ({avg_utilization:.1f}% allocated)")
        logging.info("========================================")
        return avg_utilization
    except Exception as e:
        logging.error(f"Error checking GPU memory: {e}")
        return 0  # Return 0 on error


# --- Model Loading & Preparation (Runs on ALL ranks) ---
logging.info(f"Rank {rank}: Loading model: {MODEL_ID}...")

# 1. Load Model Configuration
config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
logging.info("Enabling YaRN scaling in model configuration.")
config.rope_scaling = {
    "type": "yarn",
    "factor": 4.0,
    "original_max_position_embeddings": 32768,
}

# Determine torch dtype
torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# Set device_map based on distributed mode
# When using FSDP, device_map should typically be None or "auto", FSDP handles placement.
if USE_FSDP:
    device_map = None
    logging.info("FSDP enabled: Setting device_map=None")
elif distributed_mode:
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    device_map = {"": local_rank}
    logging.info(f"Rank {rank}: DDP mode: Loading model on device {local_rank}")
else:
    device_map = "auto"
    logging.info(f"Rank {rank}: Single process mode: Using automatic device mapping")

# Configure Flash Attention and other optimizations
use_flash_attn = USE_FLASH_ATTN and FLASH_ATTN_AVAILABLE
attn_implementation = "flash_attention_2" if use_flash_attn else None

# Configure Quantization if enabled
# quantization_config = None
# if USE_4BIT_QUANTIZATION:
#     logging.info("Configuring 4-bit quantization (QLoRA)...")
#     compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)
#     quantization_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
#         bnb_4bit_compute_dtype=compute_dtype,
#         bnb_4bit_use_double_quant=True,  # Qwen models often benefit from double quant
#     )
#     # Override torch_dtype when using quantization as recommended
#     # torch_dtype = None
#     logging.info(f"4-bit quantization config created: type={BNB_4BIT_QUANT_TYPE}, compute={BNB_4BIT_COMPUTE_DTYPE}")

# Configure model loading kwargs
model_load_kwargs = {
    "config": config,
    "device_map": device_map,
    "low_cpu_mem_usage": True,
    "trust_remote_code": True,
}
if use_flash_attn:
    model_load_kwargs["attn_implementation"] = "flash_attention_2"
# if quantization_config:
#     model_load_kwargs["quantization_config"] = quantization_config

# Always set torch_dtype when not using quantization
model_load_kwargs["torch_dtype"] = torch_dtype

# Log memory before loading
# ... (memory logging logic - keep as is) ...
# Load the model
model = None  # Initialize model variable
try:
    logging.info(f"Rank {rank}: Calling AutoModelForCausalLM.from_pretrained...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        **model_load_kwargs
    )
    logging.info(f"Rank {rank}: Base model loaded successfully on device: {model.device if device_map is None else 'CPU/Multi'}")

    # Ensure consistent dtype before FSDP wrapping (which happens in trainer.train)
    if torch_dtype == torch.bfloat16:
        logging.info("Explicitly casting model to bfloat16...")
        model = model.to(torch.bfloat16)

    # Apply Better Transformers optimization
    use_better_transformers_flag = USE_BETTER_TRANSFORMERS and BETTER_TRANSFORMERS_AVAILABLE
    if use_better_transformers_flag:
        try:
            logging.info("Applying BetterTransformer optimizations...")
            model = BetterTransformer.transform(model)
            logging.info("BetterTransformer optimizations applied successfully")
        except Exception as bt_e:
            logging.warning(f"Could not apply BetterTransformer optimizations: {str(bt_e)}")

    # Apply activation checkpointing
    if USE_ACTIVATION_CHECKPOINTING:
        try:
            logging.info("Enabling activation checkpointing...")
            model.gradient_checkpointing_enable()
            logging.info("Activation checkpointing enabled.")
        except Exception as ac_e:
            logging.warning(f"Could not enable activation checkpointing: {str(ac_e)}")

    # Log model config and check memory utilization
    logging.info(f"Rank {rank}: Model setup complete.")
    check_gpu_memory_utilization()

except Exception as model_load_e:
    logging.error(f"Rank {rank}: Failed during model loading or preparation: {model_load_e}")
    import traceback
    logging.error(traceback.format_exc())
    # Attempt to clean up distributed env before raising
    if distributed_mode and dist.is_initialized():
        try:
            dist.destroy_process_group()
        except:
            pass
    raise  # Re-raise error

# --- LoRA Configuration ---
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
    bias="none",
    task_type="CAUSAL_LM",
)

# --- Synchronize AFTER model loading & PEFT config ---
if distributed_mode:
    try:
        logging.info(f"Rank {rank}: Synchronizing after model loading...")
        dist.barrier()
        logging.info(f"Rank {rank}: Synchronization after model loading complete.")
    except Exception as sync_e:
        logging.error(f"Rank {rank}: Synchronization after model loading failed: {sync_e}")
        raise

# --- Define Training Arguments ---
# Determine the WandB run name (if any) to pass through to the training arguments
determined_run_name = None
if REPORT_TO_WANDB and is_main_process:
    try:
        import wandb
        if wandb.run is not None:
            determined_run_name = wandb.run.name
    except Exception:
        pass  # Ignore errors here, handled by report_to

base_training_args = {
    "output_dir": OUTPUT_DIR,
    "per_device_train_batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
    "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
    "optim": OPTIMIZER,
    "save_steps": SAVE_STEPS,
    "logging_steps": LOGGING_STEPS,
    "learning_rate": LEARNING_RATE,
    "num_train_epochs": EPOCHS,
    "max_steps": -1,
    "fp16": False,
    "bf16": torch_dtype == torch.bfloat16,  # Use previously determined dtype
    "max_grad_norm": 0.3,
    "warmup_ratio": WARMUP_RATIO,
    "group_by_length": False,  # Explicitly disable to prevent pre-computation hang
    "lr_scheduler_type": LR_SCHEDULER_TYPE,
    "report_to": report_to,
    "save_total_limit": 3,
    "logging_first_step": True,
    **({"run_name": determined_run_name} if determined_run_name is not None else {}),
    "fsdp": "full_shard" if USE_FSDP else "",  # Pass FSDP strategy string (removed offload)
    "fsdp_config": FSDP_CONFIG if USE_FSDP else {},  # Pass FSDP config dict
    "dataloader_num_workers": DATALOADER_NUM_WORKERS,
    "resume_from_checkpoint": "auto",
    "save_strategy": "steps",
    "load_best_model_at_end": False,
    "metric_for_best_model": None,
    "dataset_text_field": "text",
    "packing": False,
    "max_seq_length": MAX_SEQ_LENGTH,
    # Memory/Performance Optimizations
    "gradient_checkpointing_kwargs": {"use_reentrant": False},  # More stable checkpointing for FSDP activation checkpointing
    "ddp_find_unused_parameters": False,  # Should be False for FSDP
    "tf32": True,  # Enable TF32 for faster compute on compatible GPUs
}

training_arguments = SFTConfig(**base_training_args)
logging.info(f"Rank {rank}: Training arguments (SFTConfig) created.")


# --- Define Callbacks ---
# Create memory monitoring callback
class MemoryMonitorCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 10 == 0:  # Log every 10 steps
            if torch.cuda.is_available():
                gc.collect()
                torch.cuda.empty_cache()
                rank = int(os.environ.get("RANK", 0))
                local_rank = int(os.environ.get("LOCAL_RANK", 0))
                try:
                    free_mem, total_mem = torch.cuda.mem_get_info(local_rank)
                    free_gb = free_mem / (1024**3)
                    used_gb = (total_mem - free_mem) / (1024**3)
                    total_gb = total_mem / (1024**3)
                    reserved = torch.cuda.memory_reserved(local_rank) / (1024**3)
                    allocated = torch.cuda.memory_allocated(local_rank) / (1024**3)
                    logging.info(f"Rank {rank}: Memory at step {state.global_step}: "
                                 f"{free_gb:.1f}GB free, {used_gb:.1f}GB used, {total_gb:.1f}GB total, "
                                 f"{reserved:.1f}GB reserved, {allocated:.1f}GB allocated")
                except Exception as mem_e:
                    logging.warning(f"Rank {rank}: Could not get memory info: {mem_e}")
        return control


memory_monitor = MemoryMonitorCallback()


# Create a special first step callback with WandB support
class FirstStepCallback(TrainerCallback):
    def __init__(self):
        self.first_step_start_time = None
        self.progress_indicators = 0
        self.update_interval = 60  # Check every minute
        self.last_update_time = time.time()

    def on_step_begin(self, args, state, control, **kwargs):
        if state.global_step == 0:
            self.first_step_start_time = time.time()
            logging.info(f"FIRST STEP STARTING at {datetime.now().strftime('%H:%M:%S')}")
            if REPORT_TO_WANDB and 'wandb' in sys.modules:
                try:
                    import wandb  # Import locally
                    if wandb.run:
                        wandb.log({"training_status": "first_step_started"})
                except Exception as log_e:
                    logging.warning(f"Wandb log error: {log_e}")
        return control

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step == 0:
            if self.first_step_start_time is None:
                # Should not happen, but safeguard
                logging.warning("First step ended but start time was not recorded.")
                return control
            duration = time.time() - self.first_step_start_time
            logging.info(f"FIRST STEP COMPLETED at {datetime.now().strftime('%H:%M:%S')} (took {duration:.2f} seconds)")
            if REPORT_TO_WANDB and 'wandb' in sys.modules:
                try:
                    import wandb  # Import locally
                    if wandb.run:
                        wandb.log({
                            "training_status": "first_step_completed",
                            "first_step_duration": duration
                        })
                except Exception as log_e:
                    logging.warning(f"Wandb log error: {log_e}")
        return control

    def on_substep_end(self, args, state, control, **kwargs):
        # This tracks progress within a step (during gradient accumulation)
        current_time = time.time()
        # Only report for the first step/substep and only from rank 0
        if (self.first_step_start_time is not None
                and state.global_step == 0
                and current_time - self.last_update_time >= self.update_interval
                and (not distributed_mode or int(os.environ.get("LOCAL_RANK", 0)) == 0)):
            self.progress_indicators += 1
            elapsed = current_time - self.first_step_start_time
            logging.info(f"First step still in progress... ({elapsed:.1f}s elapsed, progress indicator {self.progress_indicators})")
            if REPORT_TO_WANDB and 'wandb' in sys.modules:
                try:
                    import wandb  # Import locally
                    if wandb.run:
                        wandb.log({
                            "training_status": "first_step_in_progress",
                            "first_step_elapsed": elapsed,
                            "progress_indicator": self.progress_indicators
                        })
                except Exception as log_e:
                    logging.warning(f"Wandb log error: {log_e}")
            self.last_update_time = current_time
        return control


first_step_callback = FirstStepCallback()

# Add WandB logging callback if WandB is enabled
wandb_callback = None  # Initialize
if REPORT_TO_WANDB and 'wandb' in sys.modules and (not distributed_mode or int(os.environ.get("LOCAL_RANK", 0)) == 0):
    try:
        # **** FULL WandBLoggingCallback Class Definition ****
        class WandBLoggingCallback(TrainerCallback):
            """Logs comprehensive training metrics and progress to Weights & Biases"""

            def __init__(self):
                self.training_start_time = None
                self.last_log_time = None
                self.total_steps = None
                self.samples_seen = 0
                self.tokens_seen = 0
                self.current_epoch = 0
                self.epoch_start_time = None
                self.step_history = []  # For tracking steps/second
                self.global_tokens_per_second = 0
                self.progress_table = None  # Initialize table to None

            def on_train_begin(self, args, state, control, **kwargs):
                """Log hyperparameters and initialize tracking at the start of training"""
                if not (REPORT_TO_WANDB and 'wandb' in sys.modules):
                    return  # Check if WandB should be used
                try:
                    import wandb  # Import locally
                    if not wandb.run:
                        logging.warning("WandBCallback: Wandb not initialized in on_train_begin.")
                        return
                except ImportError:
                    logging.warning("WandBCallback: wandb not imported, cannot log on_train_begin")
                    return

                self.training_start_time = time.time()
                self.epoch_start_time = time.time()
                self.last_log_time = time.time()

                # Calculate total expected steps
                if args.max_steps > 0:
                    self.total_steps = args.max_steps
                else:
                    # Use trainer passed in kwargs if available (prioritize 'trainer' key)
                    trainer_instance = kwargs.get('trainer', None)
                    if trainer_instance is None:
                        trainer_instance = kwargs.get('model', None)  # Fallback to 'model' key
                    dataset_length = 0
                    if trainer_instance and hasattr(trainer_instance, 'train_dataset') and trainer_instance.train_dataset is not None:
                        try:
                            dataset_length = len(trainer_instance.train_dataset)
                        except Exception as len_e:
                            logging.warning(f"WandBCallback: Error getting dataset length: {len_e}")
                    else:
                        logging.warning("WandBCallback: Could not access train_dataset length during on_train_begin.")
                    batch_size = args.per_device_train_batch_size
                    accumulation = args.gradient_accumulation_steps
                    world_size = int(os.environ.get("WORLD_SIZE", 1))
                    global_batch_denom = (batch_size * world_size * accumulation)
                    if dataset_length > 0 and global_batch_denom > 0:
                        self.total_steps = (dataset_length // global_batch_denom) * args.num_train_epochs
                    else:
                        self.total_steps = -1  # Indicate unknown total steps

                # Log key hyperparameters
                config = {
                    "model_name": MODEL_ID,
                    "lora_r": LORA_R,
                    "lora_alpha": LORA_ALPHA,
                    "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
                    "grad_accum": GRADIENT_ACCUMULATION_STEPS,
                    "effective_batch": PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
                    "global_batch": PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS * world_size,
                    "learning_rate": LEARNING_RATE,
                    "seq_length": MAX_SEQ_LENGTH,
                    "epochs": EPOCHS,
                    "total_steps_estimated": self.total_steps,
                    "optimizer": OPTIMIZER,
                    "warmup_ratio": WARMUP_RATIO,
                    "scheduler": LR_SCHEDULER_TYPE,
                }
                wandb.config.update(config)

                # Initialize training progress table
                columns = ["step", "epoch", "loss", "lr", "tokens/sec", "eta", "elapsed_hrs"]
                self.progress_table = wandb.Table(columns=columns)

                # Log training start
                wandb.log({"training_status": "started"})
                logging.info(f"Training started - total estimated steps: {self.total_steps}")

            def on_log(self, args, state, control, logs=None, **kwargs):
                """Log detailed metrics and progress after each logging step"""
                if not (REPORT_TO_WANDB and 'wandb' in sys.modules):
                    return  # Check if WandB should be used
                try:
                    import wandb  # Import locally
                    if not wandb.run:
                        logging.warning("WandBCallback: Wandb run not active during on_log.")
                        return
                except ImportError:
                    logging.warning("WandBCallback: wandb not imported, cannot log on_log")
                    return

                if not logs:
                    return

                # Format metrics for logging
                metrics = {}
                for k, v in logs.items():
                    if isinstance(v, (int, float)):
                        metrics[k] = v
                    elif hasattr(v, "item"):  # Handle tensors
                        try:
                            metrics[k] = v.item()
                        except:
                            pass
                if not metrics:
                    return

                # Calculate time-based metrics
                current_time = time.time()
                if self.training_start_time is None:
                    self.training_start_time = current_time  # Safeguard
                elapsed_time = current_time - self.training_start_time
                elapsed_hrs = elapsed_time / 3600

                # Estimate tokens processed
                batch_size = args.per_device_train_batch_size
                grad_accum = args.gradient_accumulation_steps
                world_size = int(os.environ.get("WORLD_SIZE", 1))
                global_batch_size = batch_size * grad_accum * world_size
                tokens_per_step = global_batch_size * MAX_SEQ_LENGTH  # Use MAX_SEQ_LENGTH from outer scope

                # Update tokens seen
                steps_since_last = state.global_step - (self.step_history[-1][0] if self.step_history else -1)
                if steps_since_last <= 0:
                    steps_since_last = 1  # Avoid issues on first log
                new_tokens = tokens_per_step * steps_since_last
                self.tokens_seen += new_tokens

                # Calculate throughput
                time_since_last = current_time - (self.last_log_time if self.last_log_time else current_time)
                if time_since_last <= 0:
                    time_since_last = 1.0  # Avoid division by zero
                tokens_per_second = new_tokens / time_since_last

                # Update rolling average of tokens/sec
                alpha = 0.1
                self.global_tokens_per_second = alpha * tokens_per_second + (1 - alpha) * self.global_tokens_per_second

                # Track epoch progress
                if "epoch" in metrics:
                    new_epoch = int(metrics["epoch"])
                    if new_epoch > self.current_epoch:
                        epoch_time = current_time - (self.epoch_start_time if self.epoch_start_time else current_time)
                        self.epoch_start_time = current_time
                        self.current_epoch = new_epoch
                        wandb.log({"epoch/duration_sec": epoch_time}, step=state.global_step)
                        logging.info(f"Epoch {self.current_epoch-1} completed in {epoch_time:.2f} seconds")
                    epoch_float = metrics["epoch"]
                    epoch_progress = epoch_float - int(epoch_float)
                    metrics["epoch_progress"] = epoch_progress * 100

                # Estimate time remaining
                eta_hours = float('nan')
                if self.total_steps and self.total_steps > 0 and state.global_step > 0:
                    progress_fraction = state.global_step / self.total_steps
                    if progress_fraction > 1e-6:  # Avoid division by zero early on
                        eta_seconds = elapsed_time / progress_fraction - elapsed_time
                        eta_hours = eta_seconds / 3600
                        metrics["eta_hours"] = eta_hours

                # Add additional calculated metrics
                metrics.update({
                    "progress/elapsed_hours": elapsed_hrs,
                    "progress/tokens_total": self.tokens_seen,
                    "performance/tokens_per_second": tokens_per_second,
                    "performance/tokens_per_second_avg": self.global_tokens_per_second,
                    "performance/global_batch_size": global_batch_size,
                })

                # Add GPU utilization if available
                if torch.cuda.is_available():
                    try:
                        local_rank = int(os.environ.get("LOCAL_RANK", 0))
                        # Note: torch.cuda.utilization might not be available/reliable
                        # metrics["gpu/utilization"] = torch.cuda.utilization(local_rank)
                        metrics["gpu/memory_allocated_gb"] = torch.cuda.memory_allocated(local_rank) / 1e9
                        metrics["gpu/memory_reserved_gb"] = torch.cuda.memory_reserved(local_rank) / 1e9
                    except Exception as gpu_e:
                        logging.debug(f"Could not log GPU metrics: {gpu_e}")

                # Log all metrics to wandb
                wandb.log(metrics, step=state.global_step)

                # Values used for both the progress table and the console summary
                loss_val = metrics.get("loss", float("nan"))
                lr_val = metrics.get("learning_rate", float("nan"))
                epoch_val = metrics.get("epoch", 0)
                tokens_sec = metrics.get("performance/tokens_per_second_avg", 0)

                # Add row to progress table
                if self.progress_table is not None:
                    self.progress_table.add_data(
                        state.global_step,
                        f"{epoch_val:.2f}",
                        f"{loss_val:.4f}",
                        f"{lr_val:.2e}",
                        f"{tokens_sec:.1f}",
                        f"{eta_hours:.1f} hrs",
                        f"{elapsed_hrs:.1f} hrs"
                    )
                    # Log the updated progress table (might be verbose, consider less frequent logging)
                    # wandb.log({"training_progress": self.progress_table}, step=state.global_step)

                # Print concise metrics to console
                log_info = (
                    f"Step {state.global_step}"
                    + (f"/{self.total_steps} ({100 * state.global_step / self.total_steps:.1f}%)" if self.total_steps and self.total_steps > 0 else "")
                    + f" | Loss: {loss_val:.4f} | LR: {lr_val:.2e} | Epoch: {epoch_val:.2f}"
                    + f" | Tokens/sec: {tokens_sec:.1f}"
                    + (f" | ETA: {eta_hours:.1f}h" if not math.isnan(eta_hours) else "")
                )
                logging.info(log_info)

                # Update time tracking
                self.last_log_time = current_time
                self.step_history.append((state.global_step, current_time))
                if len(self.step_history) > 100:  # Keep only recent history
                    self.step_history = self.step_history[-100:]

            def on_train_end(self, args, state, control, **kwargs):
                """Log final statistics at the end of training"""
                if not (REPORT_TO_WANDB and 'wandb' in sys.modules):
                    return  # Check if WandB should be used
                try:
                    import wandb  # Import locally
                    if not wandb.run:
                        logging.warning("WandBCallback: Wandb run not active during on_train_end.")
                        return
                except ImportError:
                    logging.warning("WandBCallback: wandb not imported, cannot log on_train_end")
                    return

                total_time = time.time() - (self.training_start_time if self.training_start_time else time.time())
                hours = total_time / 3600
                final_stats = {
                    "training_status": "completed",
                    "total_steps_completed": state.global_step,
                    "total_epochs_completed": self.current_epoch,
                    "total_training_time_hours": hours,
                    "total_tokens_processed": self.tokens_seen,
                    "average_tokens_per_second": self.tokens_seen / total_time if total_time > 0 else 0
                }
                wandb.log(final_stats, step=state.global_step)  # Log at final step
                wandb.run.summary.update({
                    "training_duration_hours": hours,
                    "total_steps": state.global_step,
                    "total_epochs": self.current_epoch,
                    "total_tokens": self.tokens_seen
                })
                logging.info(f"Training complete - {hours:.2f} hours, {state.global_step} steps, {self.tokens_seen:,} tokens processed")
        # **** End of WandBLoggingCallback Definition ****

        # Create callback instance
        wandb_callback = WandBLoggingCallback()
        logging.info("Enhanced WandB logging callback created")
    except Exception as e:
        logging.error(f"Error creating WandB callback: {str(e)}")
        wandb_callback = None

# Create the list of callbacks
trainer_callbacks = [memory_monitor, first_step_callback]  # Use the instance names
if wandb_callback:
    trainer_callbacks.append(wandb_callback)
    logging.info("Added WandB callback to trainer")
# trainer_callbacks = []  # Temporarily disable all callbacks

# --- Initialize Trainer ---
logging.info(f"Rank {rank}: Initializing SFTTrainer...")
trainer = None
try:
    trainer = SFTTrainer(
        model=model,
        # Using processing_class as per user confirmation
        processing_class=tokenizer,
        args=training_arguments,
        train_dataset=dataset_for_trainer,
        peft_config=peft_config,
        # Ensure this matches whether the collator is defined/needed
        preprocess_logits_for_metrics=None,
        callbacks=trainer_callbacks,  # Pass the list here
    )
    logging.info(f"Rank {rank}: SFTTrainer initialized successfully.")
except Exception as e:
    logging.error(f"Rank {rank}: Error initializing SFTTrainer: {e}")
    import traceback
    logging.error(traceback.format_exc())
    if distributed_mode and dist.is_initialized():
        try:
            dist.destroy_process_group()
        except:
            pass
    raise

# --- Train ---
if trainer is not None:
    logging.info(f"Beginning trainer.train() call at {datetime.now().strftime('%H:%M:%S')}")
    try:
        trainer.train()
        logging.info(f"Training finished successfully at {datetime.now().strftime('%H:%M:%S')}")
    except Exception as e:
        logging.error(f"Exception during training: {e}")
        import traceback
        logging.error(traceback.format_exc())
        if distributed_mode and dist.is_initialized():
            try:
                dist.destroy_process_group()
                logging.info("Destroyed process group after training error")
            except:
                pass
        raise

# --- Merge Model and Save Full Model ---
logging.info("Merging adapter weights into base model...")

# Clear some memory first if needed (especially if not using massive GPUs)
# del model
# del trainer
# torch.cuda.empty_cache()

# Reload the base model (consider lower precision to save VRAM during merge)
logging.info(f"Reloading base model ({MODEL_ID}) for merging...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    config=config,  # Ensure YaRN config is used if applied during training
    torch_dtype=torch.bfloat16,  # Or torch.float16, adjust as needed
    low_cpu_mem_usage=True,  # Helps with large models
    trust_remote_code=True,
    device_map=None,  # Load onto CPU first to potentially save GPU VRAM if needed
    attn_implementation="flash_attention_2"
)

# Load the PEFT model with adapters
logging.info(f"Loading PEFT model from {OUTPUT_DIR}...")
merged_model = PeftModel.from_pretrained(
    base_model,
    OUTPUT_DIR,
    device_map=None,  # Load onto CPU first
)

# Merge the adapter weights
logging.info("Merging LoRA weights...")
merged_model = merged_model.merge_and_unload()
logging.info("LoRA weights merged.")

# Define path for the full model save
full_model_save_path = os.path.join(OUTPUT_DIR, "final_merged_model")

# Save the merged model
logging.info(f"Saving merged model to {full_model_save_path}...")
merged_model.save_pretrained(full_model_save_path)
logging.info("Merged model saved.")

# Save the tokenizer associated with the merged model
logging.info(f"Saving tokenizer to {full_model_save_path}...")
tokenizer.save_pretrained(full_model_save_path)
logging.info("Tokenizer saved.")

logging.info(f"Fine-tuning and merging process complete. Full model saved to {full_model_save_path}")

# --- Notes on Inference and Resuming Training ---
logging.info("Training Checkpoint Notes:")
logging.info(f"  • Checkpoints saved to: {OUTPUT_DIR}")
logging.info(f"  • To resume training from the latest checkpoint, just rerun this script")
logging.info(f"    (resume_from_checkpoint='auto' will automatically find the latest checkpoint)")
logging.info(f"  • To resume from a specific checkpoint, set resume_from_checkpoint='path/to/checkpoint'")

# --- Notes on Inference ---
# To use the trained adapters:
#   from peft import PeftModel
#   base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, ...)
#   model = PeftModel.from_pretrained(base_model, final_adapter_path)
#   model = model.merge_and_unload()  # Optional: merge adapters for faster inference
# Then use model and tokenizer for generation.