Spaces:
Runtime error
Runtime error
""" | |
Initialize the leaderboard with specific models and compute their p-values.
This module ensures only the specified models are included in the leaderboard
and their model trace p-values are computed.
""" | |
import os | |
import json | |
import sys | |
from src.evaluation.model_trace_eval import compute_model_trace_p_value | |
from src.envs import EVAL_RESULTS_PATH | |
# Allow-list of HuggingFace model identifiers for the leaderboard. Result files
# whose config names any other model are removed by clean_non_allowed_results().
# Kept as a list because get_allowed_models() returns a list copy of it.
ALLOWED_MODELS = [
    "lmsys/vicuna-7b-v1.5",
    "ibm-granite/granite-7b-base",
    "EleutherAI/llemma_7b",
]
def create_model_result_file(model_name, precision="float16", results_dir=None):
    """
    Create a placeholder result file for a model.

    The file holds the model's config (dtype, name, revision "main") and an
    empty ``results`` mapping; no p-value or other metric is computed here.
    If the file already exists it is left untouched and its path is returned.

    Args:
        model_name: HuggingFace model identifier (e.g. "org/model").
        precision: Model precision; used in the filename and stored in the
            config as ``torch.<precision>``.
        results_dir: Directory to write into. Defaults to EVAL_RESULTS_PATH.

    Returns:
        Path of the existing or newly created result file, or None if the
        file could not be written.
    """
    if results_dir is None:
        results_dir = EVAL_RESULTS_PATH

    sys.stderr.write(f"\nπ§ CREATING RESULT FILE FOR: {model_name}\n")
    sys.stderr.flush()

    # Create the results directory if it doesn't exist
    os.makedirs(results_dir, exist_ok=True)

    # Generate a safe filename ("/" and "-" are both mapped to "_")
    safe_name = model_name.replace("/", "_").replace("-", "_")
    result_file = os.path.join(results_dir, f"{safe_name}_{precision}.json")

    sys.stderr.write(f"π Result file path: {result_file}\n")
    sys.stderr.flush()

    # Check if file already exists
    if os.path.exists(result_file):
        sys.stderr.write(f"β Result file already exists: {result_file}\n")
        sys.stderr.flush()
        return result_file

    # Create basic result structure
    result_data = {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": "main",
        },
        "results": {
            # No perplexity - we only care about p-values
        },
    }

    # Write to a sibling temp file and rename it into place. Writing directly
    # to result_file could leave a truncated file behind on failure, which the
    # "already exists" check above would then accept on every later call.
    # The ".json.tmp" suffix also keeps the temp file out of any "*.json" scan.
    tmp_file = f"{result_file}.tmp"
    try:
        with open(tmp_file, 'w', encoding="utf-8") as f:
            json.dump(result_data, f, indent=2)
        os.replace(tmp_file, result_file)
    except Exception as e:
        # Best-effort cleanup of the partial temp file; the original error is
        # the one worth reporting, so a failed cleanup is ignored.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        sys.stderr.write(f"β Failed to create result file: {e}\n")
        sys.stderr.flush()
        return None

    sys.stderr.write(f"β Created result file: {result_file}\n")
    sys.stderr.flush()
    return result_file
def clean_non_allowed_results():
    """
    Delete result files that belong to models outside the allowed list.

    Every ``.json`` file under EVAL_RESULTS_PATH (searched recursively) is
    parsed and its ``config.model_name`` is checked with is_model_allowed().
    Files naming a non-allowed model are removed. Files with no model name are
    reported and kept. Files that cannot be read or parsed are reported and
    kept as well.
    """
    sys.stderr.write("\nπ§Ή CLEANING NON-ALLOWED RESULT FILES\n")
    sys.stderr.flush()

    if not os.path.exists(EVAL_RESULTS_PATH):
        sys.stderr.write("π Results directory doesn't exist, nothing to clean\n")
        sys.stderr.flush()
        return

    removed_count = 0

    # Visit every JSON file below the results directory
    for current_dir, _subdirs, names in os.walk(EVAL_RESULTS_PATH):
        for name in names:
            if not name.endswith('.json'):
                continue

            json_path = os.path.join(current_dir, name)
            try:
                # The owning model is recorded under config.model_name
                with open(json_path, 'r') as handle:
                    payload = json.load(handle)
                owner = payload.get("config", {}).get("model_name", "")

                if not owner:
                    sys.stderr.write(f"β οΈ Skipping file with no model_name: {json_path}\n")
                    continue
                if is_model_allowed(owner):
                    continue

                sys.stderr.write(f"ποΈ Removing non-allowed model result: {json_path} (model: {owner})\n")
                os.remove(json_path)
                removed_count += 1
            except Exception as err:
                # Best effort: one unreadable file must not abort the sweep
                sys.stderr.write(f"β οΈ Error processing file {json_path}: {err}\n")

    sys.stderr.write(f"β Removed {removed_count} non-allowed result files\n")
    sys.stderr.flush()
def initialize_allowed_models():
    """
    Ensure a result file exists for every model in ALLOWED_MODELS.

    Result files of non-allowed models are removed first via
    clean_non_allowed_results(). Then create_model_result_file() is called
    once per allowed model. A failure for one model is logged and does not
    stop the remaining models from being processed.

    Returns:
        list: Paths of the result files that exist after initialization
        (newly created or already present).
    """
    sys.stderr.write("\nπ INITIALIZING ALLOWED MODELS\n")
    sys.stderr.write(f"π Models to initialize: {ALLOWED_MODELS}\n")
    sys.stderr.flush()

    # Purge stale results before (re)creating the allowed ones
    clean_non_allowed_results()

    ready_paths = []
    for allowed_name in ALLOWED_MODELS:
        try:
            path = create_model_result_file(allowed_name)
        except Exception as err:
            sys.stderr.write(f"β Failed to initialize {allowed_name}: {err}\n")
            sys.stderr.flush()
        else:
            # A falsy return means the file could not be written
            if path:
                ready_paths.append(path)

    sys.stderr.write(f"β Initialized {len(ready_paths)} model result files\n")
    sys.stderr.flush()
    return ready_paths
def is_model_allowed(model_name):
    """
    Tell whether a model may appear on the leaderboard.

    The check is an exact membership test against ALLOWED_MODELS.

    Args:
        model_name: HuggingFace model identifier
    Returns:
        bool: True if model_name is one of the entries in ALLOWED_MODELS
    """
    allowed = model_name in ALLOWED_MODELS
    return allowed
def get_allowed_models():
    """
    Return the allowed model names.

    A new list is returned on every call, so callers can modify the result
    without affecting ALLOWED_MODELS.

    Returns:
        list: Shallow copy of ALLOWED_MODELS
    """
    return list(ALLOWED_MODELS)