# model_trace/src/evaluation/initialize_models.py
# Ahmed Ahmed
# try again
# 1191811
"""
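Initialize the leaderboard with a fixed allow-list of models.

This module ensures that only the models listed in ALLOWED_MODELS have result
files: result files that name any other model are deleted, and a placeholder
result file (with an empty "results" section) is created for each allowed model.

NOTE: compute_model_trace_p_value is imported here but is not called by the
functions in this module; the placeholder files do not contain p-values.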
"""
import os
import json
import sys
from src.evaluation.model_trace_eval import compute_model_trace_p_value
from src.envs import EVAL_RESULTS_PATH
# Allow-list of HuggingFace model identifiers. Result files that name any
# other model are deleted by clean_non_allowed_results().
ALLOWED_MODELS = [
    "lmsys/vicuna-7b-v1.5",
    "ibm-granite/granite-7b-base",
    "EleutherAI/llemma_7b",
]
def create_model_result_file(model_name, precision="float16"):
    """
    Create a placeholder result file for a model, unless one already exists.

    The file stores the model's config (dtype, name and the revision "main")
    together with an empty "results" section. No p-value or other metric is
    computed or written by this function.

    Args:
        model_name: HuggingFace model identifier, e.g. "org/model-name".
        precision: Model precision; used in the file name and stored in the
            config as "torch.<precision>".

    Returns:
        The path of the result file (either pre-existing or newly created),
        or None if the file could not be written.

    Raises:
        OSError: If the results directory cannot be created.
    """
    sys.stderr.write(f"\nπŸ”§ CREATING RESULT FILE FOR: {model_name}\n")
    sys.stderr.flush()

    # Create the results directory if it doesn't exist
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

    # Generate a safe filename: "/" and "-" are both mapped to "_"
    safe_name = model_name.replace("/", "_").replace("-", "_")
    result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")

    sys.stderr.write(f"πŸ“ Result file path: {result_file}\n")
    sys.stderr.flush()

    # An existing file is never overwritten
    if os.path.exists(result_file):
        sys.stderr.write(f"βœ… Result file already exists: {result_file}\n")
        sys.stderr.flush()
        return result_file

    # Create basic result structure
    result_data = {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": "main",
        },
        "results": {
            # No perplexity - we only care about p-values
        },
    }

    # Write to a sibling temp file and move it into place afterwards, so a
    # failed write can never leave a truncated JSON file that a later call
    # would mistake for a valid, already-existing result file.
    tmp_file = f"{result_file}.tmp"
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(result_data, f, indent=2)
        os.replace(tmp_file, result_file)
    except (OSError, TypeError, ValueError) as e:
        # Best-effort cleanup of the partial temp file; it may not exist.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        sys.stderr.write(f"❌ Failed to create result file: {e}\n")
        sys.stderr.flush()
        return None

    sys.stderr.write(f"βœ… Created result file: {result_file}\n")
    sys.stderr.flush()
    return result_file
def clean_non_allowed_results():
    """
    Delete result files whose model is not in the allowed list.

    Walks EVAL_RESULTS_PATH recursively and inspects every ``.json`` file.
    A file is removed when its ``config.model_name`` is non-empty and
    ``is_model_allowed`` rejects it. Files without a model name (including
    JSON whose top level or ``config`` entry is not an object) are skipped,
    and files that cannot be read, parsed or removed are reported on stderr
    and left in place. Does nothing if the results directory is missing.

    Returns:
        None
    """
    sys.stderr.write("\n🧹 CLEANING NON-ALLOWED RESULT FILES\n")
    sys.stderr.flush()

    if not os.path.exists(EVAL_RESULTS_PATH):
        sys.stderr.write("πŸ“ Results directory doesn't exist, nothing to clean\n")
        sys.stderr.flush()
        return

    removed_count = 0

    # Walk through all files in the results directory
    for root, _dirs, files in os.walk(EVAL_RESULTS_PATH):
        for filename in files:
            if not filename.endswith(".json"):
                continue

            file_path = os.path.join(root, filename)
            try:
                # JSON is UTF-8; don't depend on the platform's locale encoding.
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                # Guard against payloads that are not JSON objects so they are
                # skipped explicitly instead of via an AttributeError.
                config = data.get("config", {}) if isinstance(data, dict) else {}
                model_name = (
                    config.get("model_name", "") if isinstance(config, dict) else ""
                )

                if model_name and not is_model_allowed(model_name):
                    sys.stderr.write(f"πŸ—‘οΈ Removing non-allowed model result: {file_path} (model: {model_name})\n")
                    # The file handle is already closed at this point.
                    os.remove(file_path)
                    removed_count += 1
                elif not model_name:
                    sys.stderr.write(f"⚠️ Skipping file with no model_name: {file_path}\n")
            except Exception as e:
                # Deliberate best-effort boundary: one bad file must not stop
                # the cleanup of the remaining files.
                sys.stderr.write(f"⚠️ Error processing file {file_path}: {e}\n")
            sys.stderr.flush()

    sys.stderr.write(f"βœ… Removed {removed_count} non-allowed result files\n")
    sys.stderr.flush()
def initialize_allowed_models():
    """
    Set up result files for every model in the allowed list.

    Result files belonging to non-allowed models are purged first. Then a
    result file is requested for each entry of ALLOWED_MODELS; a failure for
    one model is reported on stderr and does not stop the remaining ones.

    Returns:
        list: The truthy paths returned by create_model_result_file, in the
        order of ALLOWED_MODELS.
    """
    sys.stderr.write("\nπŸš€ INITIALIZING ALLOWED MODELS\n")
    sys.stderr.write(f"πŸ“‹ Models to initialize: {ALLOWED_MODELS}\n")
    sys.stderr.flush()

    # Purge stale results of non-allowed models before creating anything
    clean_non_allowed_results()

    ready_paths = []
    for allowed_name in ALLOWED_MODELS:
        try:
            path = create_model_result_file(allowed_name)
        except Exception as err:
            sys.stderr.write(f"❌ Failed to initialize {allowed_name}: {err}\n")
            sys.stderr.flush()
        else:
            # A falsy return value signals that no file is available
            if path:
                ready_paths.append(path)

    sys.stderr.write(f"βœ… Initialized {len(ready_paths)} model result files\n")
    sys.stderr.flush()
    return ready_paths
def is_model_allowed(model_name):
    """
    Tell whether a model belongs to the allow-list.

    The check is an exact membership test against ALLOWED_MODELS.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        bool: True when model_name is one of the allowed models, else False
    """
    if model_name in ALLOWED_MODELS:
        return True
    return False
def get_allowed_models():
    """
    Return the allowed model names as a new list.

    The caller receives a shallow copy, so mutating the returned list does
    not alter ALLOWED_MODELS.

    Returns:
        list: Copy of the allowed model names
    """
    snapshot = list(ALLOWED_MODELS)
    return snapshot