File size: 5,176 Bytes
4864926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1191811
4864926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36b1a23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4864926
 
 
 
 
 
 
 
36b1a23
 
 
4864926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Initialize the leaderboard with specific models and compute their p-values.

This module ensures only the specified models are included in the leaderboard
and their model trace p-values are computed.
"""

import os
import json
import sys
from src.evaluation.model_trace_eval import compute_model_trace_p_value
from src.envs import EVAL_RESULTS_PATH

# The specific models we want to include
# HuggingFace repo identifiers; membership checks elsewhere in this module
# compare against these strings exactly (case- and punctuation-sensitive).
ALLOWED_MODELS = [
    "lmsys/vicuna-7b-v1.5",
    "ibm-granite/granite-7b-base", 
    "EleutherAI/llemma_7b"
]

def create_model_result_file(model_name, precision="float16"):
    """
    Create a skeleton result file for a model (p-values filled in later).

    The file is never overwritten: if it already exists, its path is
    returned unchanged.

    Args:
        model_name: HuggingFace model identifier (e.g. "org/model").
        precision: Model precision, used in the filename and config.

    Returns:
        The path to the result file, or None if creation failed.
    """
    sys.stderr.write(f"\n🔧 CREATING RESULT FILE FOR: {model_name}\n")
    sys.stderr.flush()

    # Create the results directory if it doesn't exist
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

    # Generate a filesystem-safe filename from the model identifier.
    # NOTE(review): mapping both "/" and "-" to "_" could collide for
    # models whose names differ only in those characters — acceptable for
    # the small fixed allow-list this module manages.
    safe_name = model_name.replace("/", "_").replace("-", "_")
    result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")

    sys.stderr.write(f"📁 Result file path: {result_file}\n")
    sys.stderr.flush()

    # Check if file already exists — existing results are kept as-is
    if os.path.exists(result_file):
        sys.stderr.write(f"✅ Result file already exists: {result_file}\n")
        sys.stderr.flush()
        return result_file

    # Create basic result structure
    result_data = {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": "main"
        },
        "results": {
            # No perplexity - we only care about p-values
        }
    }

    # Save the result file atomically: dump to a temp file in the same
    # directory, then rename over the target, so a crash mid-write never
    # leaves a truncated JSON file for readers to choke on.
    try:
        tmp_file = result_file + ".tmp"
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, indent=2)
        os.replace(tmp_file, result_file)

        sys.stderr.write(f"✅ Created result file: {result_file}\n")
        sys.stderr.flush()
        return result_file

    except Exception as e:
        sys.stderr.write(f"❌ Failed to create result file: {e}\n")
        sys.stderr.flush()
        return None

def clean_non_allowed_results():
    """
    Delete result files for models outside the allowed list.

    Walks EVAL_RESULTS_PATH recursively; every JSON file whose
    config.model_name fails is_model_allowed() is removed. Files that are
    unreadable, malformed, or missing a model_name are reported and left
    in place (best-effort cleanup, never raises).
    """
    sys.stderr.write("\n🧹 CLEANING NON-ALLOWED RESULT FILES\n")
    sys.stderr.flush()

    # Nothing to do when the results directory was never created
    if not os.path.exists(EVAL_RESULTS_PATH):
        sys.stderr.write("📁 Results directory doesn't exist, nothing to clean\n")
        sys.stderr.flush()
        return

    removed_count = 0

    for dirpath, _subdirs, filenames in os.walk(EVAL_RESULTS_PATH):
        # Only JSON result files are candidates for removal
        candidates = (os.path.join(dirpath, name)
                      for name in filenames
                      if name.endswith('.json'))
        for path in candidates:
            try:
                with open(path, 'r') as handle:
                    payload = json.load(handle)

                name_in_file = payload.get("config", {}).get("model_name", "")

                if not name_in_file:
                    sys.stderr.write(f"⚠️ Skipping file with no model_name: {path}\n")
                elif not is_model_allowed(name_in_file):
                    sys.stderr.write(f"🗑️ Removing non-allowed model result: {path} (model: {name_in_file})\n")
                    os.remove(path)
                    removed_count += 1

            except Exception as exc:
                sys.stderr.write(f"⚠️ Error processing file {path}: {exc}\n")

    sys.stderr.write(f"✅ Removed {removed_count} non-allowed result files\n")
    sys.stderr.flush()

def initialize_allowed_models():
    """
    Set up result files for every model in ALLOWED_MODELS.

    First purges result files for non-allowed models, then creates (or
    reuses) a result file per allowed model. A failure on one model is
    logged and does not stop the others.

    Returns:
        list: Paths of the result files successfully created or found.
    """
    sys.stderr.write("\n🚀 INITIALIZING ALLOWED MODELS\n")
    sys.stderr.write(f"📋 Models to initialize: {ALLOWED_MODELS}\n")
    sys.stderr.flush()

    # First, clean up any existing non-allowed results
    clean_non_allowed_results()

    created_files = []

    for name in ALLOWED_MODELS:
        try:
            path = create_model_result_file(name)
        except Exception as exc:
            sys.stderr.write(f"❌ Failed to initialize {name}: {exc}\n")
            sys.stderr.flush()
        else:
            # create_model_result_file returns None on failure
            if path:
                created_files.append(path)

    sys.stderr.write(f"✅ Initialized {len(created_files)} model result files\n")
    sys.stderr.flush()

    return created_files

def is_model_allowed(model_name):
    """
    Check whether a model is part of the allowed list.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        bool: True if the model appears in ALLOWED_MODELS
    """
    return any(model_name == allowed for allowed in ALLOWED_MODELS)

def get_allowed_models():
    """
    Return the allowed models as a new list.

    A shallow copy is returned so callers can mutate the result without
    affecting ALLOWED_MODELS.

    Returns:
        list: List of allowed model names
    """
    return list(ALLOWED_MODELS)