Spaces:
Runtime error
Runtime error
File size: 5,176 Bytes
4864926 1191811 4864926 36b1a23 4864926 36b1a23 4864926 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
"""
Initialize the leaderboard with specific models and compute their p-values.
This module ensures only the specified models are included in the leaderboard
and their model trace p-values are computed.
"""
import os
import json
import sys
from src.evaluation.model_trace_eval import compute_model_trace_p_value
from src.envs import EVAL_RESULTS_PATH
# Allow-list of HuggingFace model identifiers for the leaderboard.
# initialize_allowed_models() creates a result file for each entry, and
# clean_non_allowed_results() deletes result files whose config.model_name
# is not in this list (membership is an exact, case-sensitive match).
ALLOWED_MODELS = [
    "lmsys/vicuna-7b-v1.5",
    "ibm-granite/granite-7b-base",
    "EleutherAI/llemma_7b"
]
def create_model_result_file(model_name, precision="float16", *, results_dir=None):
    """
    Create a placeholder result file for a model if one does not exist yet.

    The file is named ``<model>_<precision>.json``, where ``/`` and ``-`` in
    the model name are replaced by ``_``. It stores the model's config and an
    empty ``results`` mapping. An existing file is left untouched.

    Args:
        model_name: HuggingFace model identifier
        precision: Model precision; stored as ``torch.<precision>``
        results_dir: Directory to write into; defaults to EVAL_RESULTS_PATH

    Returns:
        Path of the existing or newly written file, or None if writing failed.

    Raises:
        OSError: If the results directory cannot be created.
    """
    if results_dir is None:
        results_dir = EVAL_RESULTS_PATH

    # NOTE(review): the leading glyphs in these log messages look like UTF-8
    # emoji that were mis-decoded; they are kept as found - confirm against
    # the upstream file before restoring them.
    sys.stderr.write(f"\nπ§ CREATING RESULT FILE FOR: {model_name}\n")
    sys.stderr.flush()

    # Create the results directory if it doesn't exist
    os.makedirs(results_dir, exist_ok=True)

    # Generate a safe filename
    safe_name = model_name.replace("/", "_").replace("-", "_")
    result_file = os.path.join(results_dir, f"{safe_name}_{precision}.json")
    sys.stderr.write(f"π Result file path: {result_file}\n")
    sys.stderr.flush()

    # Never overwrite an existing file: it may already hold real results
    if os.path.exists(result_file):
        sys.stderr.write(f"β Result file already exists: {result_file}\n")
        sys.stderr.flush()
        return result_file

    # Basic result structure; "results" stays empty (no perplexity - only
    # p-values matter for this leaderboard)
    result_data = {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": "main",
        },
        "results": {},
    }

    # Best effort: a failed write is reported and signalled with None
    # instead of propagating to the caller
    try:
        with open(result_file, "w", encoding="utf-8") as f:
            json.dump(result_data, f, indent=2)
    except Exception as e:
        sys.stderr.write(f"β Failed to create result file: {e}\n")
        sys.stderr.flush()
        return None

    sys.stderr.write(f"β Created result file: {result_file}\n")
    sys.stderr.flush()
    return result_file
def clean_non_allowed_results(*, results_dir=None, allowed_models=None):
    """
    Remove result files for models that are not in the allowed list.

    Every ``.json`` file under the results directory (searched recursively)
    is parsed and its ``config.model_name`` is checked. Files naming a model
    that is not allowed are deleted. Files without a model name, and files
    that cannot be read or parsed, are reported and left in place.

    Args:
        results_dir: Directory to scan; defaults to EVAL_RESULTS_PATH
        allowed_models: Iterable of model identifiers to keep; when omitted,
            is_model_allowed() decides
    """
    if results_dir is None:
        results_dir = EVAL_RESULTS_PATH

    if allowed_models is None:
        is_allowed = is_model_allowed
    else:
        # A tuple (not a set) so an unhashable model_name read from a
        # malformed file is simply "not allowed" rather than a TypeError
        permitted = tuple(allowed_models)

        def is_allowed(name):
            return name in permitted

    # NOTE(review): the leading glyphs in these log messages look like UTF-8
    # emoji that were mis-decoded; they are kept as found - confirm against
    # the upstream file before restoring them.
    sys.stderr.write("\nπ§Ή CLEANING NON-ALLOWED RESULT FILES\n")
    sys.stderr.flush()

    if not os.path.exists(results_dir):
        sys.stderr.write("π Results directory doesn't exist, nothing to clean\n")
        sys.stderr.flush()
        return

    removed_count = 0

    # Walk through all files in the results directory
    for root, _dirs, files in os.walk(results_dir):
        for file_name in files:
            if not file_name.endswith(".json"):
                continue

            file_path = os.path.join(root, file_name)
            try:
                # The model name is read from the file's own config section,
                # not inferred from the file name
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                model_name = data.get("config", {}).get("model_name", "")

                if not model_name:
                    sys.stderr.write(f"β οΈ Skipping file with no model_name: {file_path}\n")
                elif not is_allowed(model_name):
                    sys.stderr.write(f"ποΈ Removing non-allowed model result: {file_path} (model: {model_name})\n")
                    os.remove(file_path)
                    removed_count += 1
            except Exception as e:
                # Best effort: one unreadable or malformed file must not
                # abort the cleanup of the remaining files
                sys.stderr.write(f"β οΈ Error processing file {file_path}: {e}\n")
                continue

    sys.stderr.write(f"β Removed {removed_count} non-allowed result files\n")
    sys.stderr.flush()
def initialize_allowed_models():
    """
    Initialize result files for all allowed models.

    Result files belonging to models outside ALLOWED_MODELS are removed
    first; then a result file is created for each allowed model that does
    not have one yet. A failure for one model is logged and does not stop
    the remaining models from being processed.

    Returns:
        list: Paths of the result files that exist after initialization
        (both pre-existing and newly created ones).
    """
    # NOTE(review): the leading glyphs in these log messages look like UTF-8
    # emoji that were mis-decoded; they are kept as found - confirm against
    # the upstream file before restoring them.
    sys.stderr.write("\nπ INITIALIZING ALLOWED MODELS\n")
    sys.stderr.write(f"π Models to initialize: {ALLOWED_MODELS}\n")
    sys.stderr.flush()

    # First, clean up any existing non-allowed results
    clean_non_allowed_results()

    created_files = []
    for model_name in ALLOWED_MODELS:
        try:
            result_file = create_model_result_file(model_name)
        except Exception as e:
            # Best effort: keep going so one bad model cannot block the rest
            sys.stderr.write(f"β Failed to initialize {model_name}: {e}\n")
            sys.stderr.flush()
            continue
        # create_model_result_file returns None when the write failed
        if result_file:
            created_files.append(result_file)

    sys.stderr.write(f"β Initialized {len(created_files)} model result files\n")
    sys.stderr.flush()
    return created_files
def is_model_allowed(model_name):
    """
    Check if a model is in the allowed list.

    The comparison is an exact match against the entries of ALLOWED_MODELS.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        bool: True if model is allowed
    """
    return model_name in ALLOWED_MODELS
def get_allowed_models():
    """
    Get the list of allowed models.

    A new list is returned on every call, so callers may modify the result
    without affecting ALLOWED_MODELS.

    Returns:
        list: List of allowed model names
    """
    return list(ALLOWED_MODELS)