File size: 5,176 Bytes
4864926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1191811
4864926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36b1a23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4864926
 
 
 
 
 
 
 
36b1a23
 
 
4864926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Initialize the leaderboard with specific models and compute their p-values.

This module ensures only the specified models are included in the leaderboard
and their model trace p-values are computed.
"""

import os
import json
import sys
from src.evaluation.model_trace_eval import compute_model_trace_p_value
from src.envs import EVAL_RESULTS_PATH

# The specific models we want to include
# HuggingFace repo identifiers; membership checks elsewhere in this module
# compare against these strings exactly (case- and punctuation-sensitive).
ALLOWED_MODELS = [
    "lmsys/vicuna-7b-v1.5",
    "ibm-granite/granite-7b-base", 
    "EleutherAI/llemma_7b"
]

def create_model_result_file(model_name, precision="float16"):
    """
    Create a skeleton result file for a model (p-values filled in later).

    The file is never overwritten: if it already exists, its path is
    returned unchanged.

    Args:
        model_name: HuggingFace model identifier (e.g. "org/model").
        precision: Model precision, used in the filename and config.

    Returns:
        The path to the result file, or None if creation failed.
    """
    sys.stderr.write(f"\n🔧 CREATING RESULT FILE FOR: {model_name}\n")
    sys.stderr.flush()

    # Create the results directory if it doesn't exist
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

    # Generate a filesystem-safe filename from the model identifier.
    # NOTE(review): mapping both "/" and "-" to "_" could collide for
    # models whose names differ only in those characters — acceptable for
    # the small fixed allow-list this module manages.
    safe_name = model_name.replace("/", "_").replace("-", "_")
    result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")

    sys.stderr.write(f"📁 Result file path: {result_file}\n")
    sys.stderr.flush()

    # Check if file already exists — existing results are kept as-is
    if os.path.exists(result_file):
        sys.stderr.write(f"✅ Result file already exists: {result_file}\n")
        sys.stderr.flush()
        return result_file

    # Create basic result structure
    result_data = {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": "main"
        },
        "results": {
            # No perplexity - we only care about p-values
        }
    }

    # Save the result file atomically: dump to a temp file in the same
    # directory, then rename over the target, so a crash mid-write never
    # leaves a truncated JSON file for readers to choke on.
    try:
        tmp_file = result_file + ".tmp"
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, indent=2)
        os.replace(tmp_file, result_file)

        sys.stderr.write(f"✅ Created result file: {result_file}\n")
        sys.stderr.flush()
        return result_file

    except Exception as e:
        sys.stderr.write(f"❌ Failed to create result file: {e}\n")
        sys.stderr.flush()
        return None

def clean_non_allowed_results():
    """
    Delete result files for models outside the allowed list.

    Walks EVAL_RESULTS_PATH recursively; every JSON file whose
    config.model_name fails is_model_allowed() is removed. Files that are
    unreadable, malformed, or missing a model_name are reported and left
    in place (best-effort cleanup, never raises).
    """
    sys.stderr.write("\n🧹 CLEANING NON-ALLOWED RESULT FILES\n")
    sys.stderr.flush()

    # Nothing to do when the results directory was never created
    if not os.path.exists(EVAL_RESULTS_PATH):
        sys.stderr.write("📁 Results directory doesn't exist, nothing to clean\n")
        sys.stderr.flush()
        return

    removed_count = 0

    for dirpath, _subdirs, filenames in os.walk(EVAL_RESULTS_PATH):
        # Only JSON result files are candidates for removal
        candidates = (os.path.join(dirpath, name)
                      for name in filenames
                      if name.endswith('.json'))
        for path in candidates:
            try:
                with open(path, 'r') as handle:
                    payload = json.load(handle)

                name_in_file = payload.get("config", {}).get("model_name", "")

                if not name_in_file:
                    sys.stderr.write(f"⚠️ Skipping file with no model_name: {path}\n")
                elif not is_model_allowed(name_in_file):
                    sys.stderr.write(f"🗑️ Removing non-allowed model result: {path} (model: {name_in_file})\n")
                    os.remove(path)
                    removed_count += 1

            except Exception as exc:
                sys.stderr.write(f"⚠️ Error processing file {path}: {exc}\n")

    sys.stderr.write(f"✅ Removed {removed_count} non-allowed result files\n")
    sys.stderr.flush()

def initialize_allowed_models():
    """
    Set up result files for every model in ALLOWED_MODELS.

    First purges result files for non-allowed models, then creates (or
    reuses) a result file per allowed model. A failure on one model is
    logged and does not stop the others.

    Returns:
        list: Paths of the result files successfully created or found.
    """
    sys.stderr.write("\n🚀 INITIALIZING ALLOWED MODELS\n")
    sys.stderr.write(f"📋 Models to initialize: {ALLOWED_MODELS}\n")
    sys.stderr.flush()

    # First, clean up any existing non-allowed results
    clean_non_allowed_results()

    created_files = []

    for name in ALLOWED_MODELS:
        try:
            path = create_model_result_file(name)
        except Exception as exc:
            sys.stderr.write(f"❌ Failed to initialize {name}: {exc}\n")
            sys.stderr.flush()
        else:
            # create_model_result_file returns None on failure
            if path:
                created_files.append(path)

    sys.stderr.write(f"✅ Initialized {len(created_files)} model result files\n")
    sys.stderr.flush()

    return created_files

def is_model_allowed(model_name):
    """
    Check whether a model is part of the allowed list.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        bool: True if the model appears in ALLOWED_MODELS
    """
    return any(model_name == allowed for allowed in ALLOWED_MODELS)

def get_allowed_models():
    """
    Return the allowed models as a new list.

    A shallow copy is returned so callers can mutate the result without
    affecting ALLOWED_MODELS.

    Returns:
        list: List of allowed model names
    """
    return list(ALLOWED_MODELS)