Ahmed Ahmed committed · Commit 4864926 · 1 Parent(s): de071e9

ok

Browse files:
- app.py +27 -8
- src/about.py +29 -14
- src/evaluation/initialize_models.py +121 -0
- src/evaluation/model_trace_eval.py +142 -187
- src/leaderboard/read_evals.py +8 -0
- test_model_trace.py +0 -43
app.py
CHANGED
@@ -89,9 +89,13 @@ def run_perplexity_test(model_name, revision, precision):
     import sys
     import traceback
     import gradio as gr
+    from src.evaluation.initialize_models import is_model_allowed
 
     if not model_name:
-        return "Please
+        return "Please select a model.", gr.update(), gr.update()
+
+    if not is_model_allowed(model_name):
+        return f"❌ Model '{model_name}' is not in the allowed list. Please select from the dropdown.", gr.update(), gr.update()
 
     try:
         # Use stderr for more reliable logging in HF Spaces
@@ -125,7 +129,7 @@ def run_perplexity_test(model_name, revision, precision):
 
     π **Results have been saved and both tables have been updated!**
 
-    Note
+    ⏰ **Note**: Model trace p-value computation runs a full model comparison analysis and may take 10-30 minutes per model. Progress will appear in the logs."""
 
         return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
     else:
@@ -167,9 +171,17 @@ except Exception as e:
     # Ensure local directory exists even if repo operations fail
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
 
-#
+# Initialize allowed models
 import sys
+from src.evaluation.initialize_models import initialize_allowed_models, get_allowed_models
+
 sys.stderr.write("\nπ STARTING GRADIO APP INITIALIZATION\n")
+sys.stderr.write("π Initializing allowed models...\n")
+sys.stderr.flush()
+
+# Initialize the allowed models
+initialize_allowed_models()
+
 sys.stderr.write("π Creating initial results DataFrame...\n")
 sys.stderr.flush()
 
@@ -202,11 +214,17 @@ with demo:
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🧪 Test Model", elem_id="test-model-tab", id=2):
-            gr.Markdown("## Run Perplexity Test\n\nTest
+            gr.Markdown("## Run Perplexity Test\n\nTest one of the supported models for perplexity evaluation.")
+
+            allowed_models = get_allowed_models()
 
             with gr.Row():
                 with gr.Column():
-                    model_name = gr.
+                    model_name = gr.Dropdown(
+                        choices=allowed_models,
+                        label="Model name",
+                        value=allowed_models[0] if allowed_models else None
+                    )
                     revision = gr.Textbox(label="Revision", placeholder="main", value="main")
                     precision = gr.Dropdown(
                         choices=["float16", "bfloat16"],
@@ -231,13 +249,14 @@ with demo:
                 ### Tips:
                 - **Check stderr logs** in HF Spaces for detailed debugging information
                 - **Results will update automatically** in the table above after evaluation completes
-                - **
+                - **Available models**: Vicuna 7B v1.5, IBM Granite 7B Base, LLeMa 7B
                 - **Lower perplexity scores = better performance** (better at predicting text)
+                - **Model trace p-values are computed automatically** (may take 10-30 minutes)
 
                 ### How it works:
-                1.
+                1. Select a model from the dropdown
                 2. Click "Run Perplexity Test"
-                3. Wait for evaluation to complete (may take a few minutes for
+                3. Wait for evaluation to complete (may take a few minutes for perplexity + longer for p-value)
                 4. Results will appear automatically in the table above!
                 """)
 
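For context, a minimal sketch of the validation gate that the updated run_perplexity_test now applies before doing any evaluation work. The function name validate_request and the inline ALLOWED set are illustrative stand-ins for src.evaluation.initialize_models, not part of this commit:

# Illustrative sketch only: the early-return validation pattern used above,
# stripped of Gradio so it can be run standalone.
ALLOWED = {"lmsys/vicuna-7b-v1.5", "ibm-granite/granite-7b-base", "EleutherAI/llemma_7b"}

def validate_request(model_name: str) -> tuple[bool, str]:
    """Return (ok, message) before any expensive evaluation work starts."""
    if not model_name:
        return False, "Please select a model."
    if model_name not in ALLOWED:
        return False, f"Model '{model_name}' is not in the allowed list."
    return True, "ok"

print(validate_request(""))                       # (False, 'Please select a model.')
print(validate_request("openai-community/gpt2"))  # (False, "Model 'openai-community/gpt2' is not in the allowed list.")
print(validate_request("lmsys/vicuna-7b-v1.5"))   # (True, 'ok')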
src/about.py
CHANGED
@@ -17,37 +17,48 @@ NUM_FEWSHOT = 0 # Not used for perplexity
 # ---------------------------------------------------
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Model
+TITLE = """<h1 align="center" id="space-title">Model Tracing Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard evaluates language models based on their perplexity scores
-structural similarity to
+This leaderboard evaluates specific language models based on their perplexity scores and
+structural similarity to Llama-2-7B using model tracing analysis.
 
+**Models Evaluated:**
+- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
+- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
+- `EleutherAI/llemma_7b` - LLeMa 7B
+
+**Metrics:**
 - **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
-- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to
+- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
 """
 
 # Which evaluations are you running?
 LLM_BENCHMARKS_TEXT = """
 ## How it works
 
-The evaluation runs two types of analysis on language models:
+The evaluation runs two types of analysis on the supported language models:
+
+### Supported Models
+- **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
+- **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
+- **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model
 
 ### 1. Perplexity Evaluation
 Perplexity tests using a fixed test passage about artificial intelligence.
 Perplexity measures how well a model predicts text - lower scores mean better predictions.
 
 ### 2. Model Tracing Analysis
-Compares each model's internal structure to
-- **Base Model**:
-- **Comparison**:
+Compares each model's internal structure to Llama-2-7B using the "match" statistic:
+- **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
+- **Comparison Models**: The 3 supported models listed above
 - **Method**: Neuron matching analysis across transformer layers
 - **Alignment**: Models are aligned before comparison using the Hungarian algorithm
-- **Output**: P-value indicating structural similarity (lower = more similar to
+- **Output**: P-value indicating structural similarity (lower = more similar to Llama-2-7B)
 
 The match statistic tests whether neurons in corresponding layers maintain similar functional roles
-between the base model and
+between the base model and the comparison models.
 
 ## Test Text
 
@@ -62,11 +73,15 @@ with these important social considerations.
 """
 
 EVALUATION_QUEUE_TEXT = """
-##
-
-2. The model should be loadable with AutoModelForCausalLM
-3. The model should support text generation tasks
+## Testing Models
+
+This leaderboard focuses on comparing specific models:
+
+1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
+2. **IBM Granite 7B Base** - IBM's foundational language model
+3. **LLeMa 7B** - EleutherAI's mathematical language model
 
+Use the "Test Model" tab to run perplexity evaluation on any of these models.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
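The alignment step described in LLM_BENCHMARKS_TEXT can be illustrated with a toy example. The sketch below only shows the Hungarian-algorithm matching idea via scipy.optimize.linear_sum_assignment; it is not the model-tracing repository's implementation, which operates on real transformer layers and produces per-layer p-values:

# Toy illustration (assumed example, not from the commit): recover a neuron
# permutation between a "base" layer and a noisily permuted "fine-tuned" layer.
import numpy as np
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
base = rng.normal(size=(8, 16))                     # 8 "neurons" (rows) of a base layer
perm = rng.permutation(8)
ft = base[perm] + 0.01 * rng.normal(size=(8, 16))   # fine-tuned layer: permuted + small noise

cost = -base @ ft.T                                 # cost = negative similarity per neuron pair
row_ind, col_ind = linear_sum_assignment(cost)      # Hungarian algorithm
print(np.array_equal(col_ind, np.argsort(perm)))    # should recover the permutation -> True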
src/evaluation/initialize_models.py
ADDED
@@ -0,0 +1,121 @@
+"""
+Initialize the leaderboard with specific models and compute their p-values.
+
+This module ensures only the specified models are included in the leaderboard
+and their model trace p-values are computed.
+"""
+
+import os
+import json
+import sys
+from src.evaluation.model_trace_eval import compute_model_trace_p_value
+from src.envs import EVAL_RESULTS_PATH
+
+# The specific models we want to include
+ALLOWED_MODELS = [
+    "lmsys/vicuna-7b-v1.5",
+    "ibm-granite/granite-7b-base",
+    "EleutherAI/llemma_7b"
+]
+
+def create_model_result_file(model_name, precision="float16"):
+    """
+    Create a result file for a model with computed p-value.
+
+    Args:
+        model_name: HuggingFace model identifier
+        precision: Model precision
+    """
+    sys.stderr.write(f"\nπ§ CREATING RESULT FILE FOR: {model_name}\n")
+    sys.stderr.flush()
+
+    # Create the results directory if it doesn't exist
+    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+
+    # Generate a safe filename
+    safe_name = model_name.replace("/", "_").replace("-", "_")
+    result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")
+
+    sys.stderr.write(f"π Result file path: {result_file}\n")
+    sys.stderr.flush()
+
+    # Check if file already exists
+    if os.path.exists(result_file):
+        sys.stderr.write(f"✅ Result file already exists: {result_file}\n")
+        sys.stderr.flush()
+        return result_file
+
+    # Create basic result structure
+    result_data = {
+        "config": {
+            "model_dtype": f"torch.{precision}",
+            "model_name": model_name,
+            "model_sha": "main"
+        },
+        "results": {
+            "perplexity": {
+                "perplexity": None  # Will be populated when user tests
+            }
+        }
+    }
+
+    # Save the result file
+    try:
+        with open(result_file, 'w') as f:
+            json.dump(result_data, f, indent=2)
+
+        sys.stderr.write(f"✅ Created result file: {result_file}\n")
+        sys.stderr.flush()
+        return result_file
+
+    except Exception as e:
+        sys.stderr.write(f"❌ Failed to create result file: {e}\n")
+        sys.stderr.flush()
+        return None
+
+def initialize_allowed_models():
+    """
+    Initialize result files for all allowed models.
+    """
+    sys.stderr.write(f"\nπ INITIALIZING ALLOWED MODELS\n")
+    sys.stderr.write(f"π Models to initialize: {ALLOWED_MODELS}\n")
+    sys.stderr.flush()
+
+    created_files = []
+
+    for model_name in ALLOWED_MODELS:
+        try:
+            result_file = create_model_result_file(model_name)
+            if result_file:
+                created_files.append(result_file)
+
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to initialize {model_name}: {e}\n")
+            sys.stderr.flush()
+            continue
+
+    sys.stderr.write(f"✅ Initialized {len(created_files)} model result files\n")
+    sys.stderr.flush()
+
+    return created_files
+
+def is_model_allowed(model_name):
+    """
+    Check if a model is in the allowed list.
+
+    Args:
+        model_name: HuggingFace model identifier
+
+    Returns:
+        bool: True if model is allowed
+    """
+    return model_name in ALLOWED_MODELS
+
+def get_allowed_models():
+    """
+    Get the list of allowed models.
+
+    Returns:
+        list: List of allowed model names
+    """
+    return ALLOWED_MODELS.copy()
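As wired up elsewhere in this commit (app.py calls initialize_allowed_models() at startup and get_allowed_models() to populate the dropdown), the new module is consumed roughly as below; the printed values follow from ALLOWED_MODELS above, and the calling context is an illustrative sketch rather than code from the repository:

# Usage sketch for the new module, mirroring how app.py consumes it.
from src.evaluation.initialize_models import (
    get_allowed_models,
    initialize_allowed_models,
    is_model_allowed,
)

created = initialize_allowed_models()   # writes one placeholder *_float16.json per allowed model
print(len(created))                     # normally 3; fewer only if a file could not be written
print(get_allowed_models())             # ['lmsys/vicuna-7b-v1.5', 'ibm-granite/granite-7b-base', 'EleutherAI/llemma_7b']
print(is_model_allowed("lmsys/vicuna-7b-v1.5"))   # True
print(is_model_allowed("openai-community/gpt2"))  # False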
src/evaluation/model_trace_eval.py
CHANGED
@@ -1,8 +1,8 @@
 """
 Model tracing evaluation for computing p-values from neuron matching statistics.
 
-This module runs the model-tracing comparison
-
+This module runs the model-tracing comparison using the main.py script from model-tracing
+to determine structural similarity via p-value analysis.
 """
 
 import os
@@ -10,49 +10,26 @@ import sys
 import subprocess
 import tempfile
 import pickle
-import
-from transformers import AutoTokenizer, AutoModelForCausalLM
+import statistics
 
-#
+# Check if model-tracing directory exists
 model_tracing_path = os.path.join(os.path.dirname(__file__), '../../model-tracing')
-
-sys.path.append(model_tracing_path)
+MODEL_TRACING_AVAILABLE = os.path.exists(model_tracing_path) and os.path.exists(os.path.join(model_tracing_path, 'main.py'))
 
-sys.stderr.write("π§
-sys.stderr.
-
-    sys.stderr.write(" - Importing tracing.utils.llama.model...\n")
-    from tracing.utils.llama.model import permute_model, rotate_model
-
-    sys.stderr.write(" - Importing tracing.utils.llama.matching...\n")
-    from tracing.utils.llama.matching import align_model
-
-    sys.stderr.write(" - Importing tracing.utils.evaluate...\n")
-    from tracing.utils.evaluate import prepare_hf_dataset, prepare_hf_dataloader
-
-    sys.stderr.write(" - Importing tracing.utils.utils...\n")
-    from tracing.utils.utils import manual_seed
-
-    sys.stderr.write(" - Importing tracing.statistics.match...\n")
-    from tracing.statistics.match import statistic as match_stat
-
-    MODEL_TRACING_AVAILABLE = True
-    sys.stderr.write("✅ ALL MODEL TRACING IMPORTS SUCCESSFUL\n")
-
-except ImportError as e:
-    sys.stderr.write(f"❌ MODEL TRACING IMPORTS FAILED: {e}\n")
-    import traceback
-    sys.stderr.write(f"Full import traceback:\n{traceback.format_exc()}\n")
-    MODEL_TRACING_AVAILABLE = False
-
+sys.stderr.write("π§ CHECKING MODEL TRACING AVAILABILITY...\n")
+sys.stderr.write(f" - Model tracing path: {model_tracing_path}\n")
+sys.stderr.write(f" - Path exists: {os.path.exists(model_tracing_path)}\n")
+sys.stderr.write(f" - main.py exists: {os.path.exists(os.path.join(model_tracing_path, 'main.py'))}\n")
 sys.stderr.write(f"π― Final MODEL_TRACING_AVAILABLE = {MODEL_TRACING_AVAILABLE}\n")
 sys.stderr.flush()
 
 
 def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
     """
-    Run model tracing analysis
+    Run model tracing analysis using the main.py script from model-tracing directory.
+
+    Runs the exact command:
+    python main.py --base_model_id meta-llama/Llama-2-7b-hf --ft_model_id <ft_model_name> --stat match --align
 
     Args:
         ft_model_name: HuggingFace model identifier for the fine-tuned model
@@ -61,197 +38,175 @@ def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
 
     Returns:
         tuple: (success: bool, result: float or error_message)
-        If success, result is the aggregate p-value
+        If success, result is the aggregate p-value from aligned test stat
         If failure, result is error message
     """
 
     if not MODEL_TRACING_AVAILABLE:
-        return False, "Model tracing
+        return False, "Model tracing main.py script not available"
 
     try:
-        sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS ===\n")
-        sys.stderr.write(f"Base model:
+        sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS VIA SUBPROCESS ===\n")
+        sys.stderr.write(f"Base model: meta-llama/Llama-2-7b-hf\n")
         sys.stderr.write(f"Fine-tuned model: {ft_model_name}\n")
         sys.stderr.write(f"Revision: {revision}\n")
         sys.stderr.write(f"Precision: {precision}\n")
         sys.stderr.flush()
 
-        # Determine dtype
-        if precision == "bfloat16":
-            dtype = torch.bfloat16
-        else:
-            dtype = torch.float16
-
-        # Load base model (gpt2)
-        base_model_id = "openai-community/gpt2"
-        sys.stderr.write(f"π€ Loading base model: {base_model_id}\n")
-        sys.stderr.write(f" - dtype: {dtype}\n")
-        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
-        sys.stderr.flush()
-
-        try:
-            base_model = AutoModelForCausalLM.from_pretrained(
-                base_model_id,
-                torch_dtype=dtype,
-                low_cpu_mem_usage=True
-            )
-            sys.stderr.write("✅ Base model loaded successfully\n")
-        except Exception as e:
-            sys.stderr.write(f"❌ Failed to load base model: {e}\n")
-            raise
-
-        try:
-            base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)
-            sys.stderr.write("✅ Base tokenizer loaded successfully\n")
-        except Exception as e:
-            sys.stderr.write(f"❌ Failed to load base tokenizer: {e}\n")
-            raise
-
-        # Load fine-tuned model
-        sys.stderr.write(f"π€ Loading fine-tuned model: {ft_model_name}\n")
-        sys.stderr.write(f" - revision: {revision}\n")
-        sys.stderr.write(f" - dtype: {dtype}\n")
-        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
-        sys.stderr.flush()
-
-        try:
-            ft_model = AutoModelForCausalLM.from_pretrained(
-                ft_model_name,
-                revision=revision,
-                torch_dtype=dtype,
-                low_cpu_mem_usage=True
-            )
-            sys.stderr.write("✅ Fine-tuned model loaded successfully\n")
-        except Exception as e:
-            sys.stderr.write(f"❌ Failed to load fine-tuned model: {e}\n")
-            raise
-
-        try:
-            ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name, revision=revision, use_fast=False)
-            sys.stderr.write("✅ Fine-tuned tokenizer loaded successfully\n")
-        except Exception as e:
-            sys.stderr.write(f"❌ Failed to load fine-tuned tokenizer: {e}\n")
-            raise
-
-        sys.stderr.write("π― ALL MODELS AND TOKENIZERS LOADED SUCCESSFULLY\n")
-
-        # Show memory info if available
-        if torch.cuda.is_available():
-            memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
-            memory_reserved = torch.cuda.memory_reserved() / 1024**3  # GB
-            sys.stderr.write(f"πΎ GPU Memory - Allocated: {memory_allocated:.2f}GB, Reserved: {memory_reserved:.2f}GB\n")
-        sys.stderr.flush()
-
-        sys.stderr.write("Running model alignment...\n")
-        sys.stderr.flush()
-        try:
-            sys.stderr.write(f"Model alignment failed: {e}\n")
-            sys.stderr.write("Continuing without alignment...\n")
-            sys.stderr.flush()
-
-        # Run match statistic
-        sys.stderr.write("Computing match statistic...\n")
-        sys.stderr.flush()
-
-        # Get number of layers for the models
-        if hasattr(base_model, 'transformer') and hasattr(base_model.transformer, 'h'):
-            # GPT-2 style
-            n_blocks = len(base_model.transformer.h)
-        elif hasattr(base_model, 'model') and hasattr(base_model.model, 'layers'):
-            # LLaMA style
-            n_blocks = len(base_model.model.layers)
-        else:
-            # Default fallback
-            n_blocks = 12  # GPT-2 base has 12 layers
-
-            n_blocks = min(n_blocks, ft_n_blocks)
-
-            p_values = match_stat(base_model, ft_model, dataloader, n_blocks=n_blocks)
-        except Exception as e:
-            sys.stderr.write(f"Match statistic computation failed: {e}\n")
-            sys.stderr.flush()
-
-            sys.stderr.write("No valid p-values found, returning default\n")
-            sys.stderr.flush()
-            return True, 1.0
-
-        from tracing.utils.utils import fisher
-        try:
-
+        # Create a temporary file for results
+        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as tmp_file:
+            tmp_results_path = tmp_file.name
+
+        sys.stderr.write(f"π Temporary results file: {tmp_results_path}\n")
+        sys.stderr.flush()
+
+        # Build the command exactly as user specified
+        base_model_id = "meta-llama/Llama-2-7b-hf"
+
+        # Build the command
+        cmd = [
+            "python", "main.py",
+            "--base_model_id", base_model_id,
+            "--ft_model_id", ft_model_name,
+            "--stat", "match",
+            "--save", tmp_results_path
+        ]
+
+        # Add revision if not main/default
+        if revision and revision != "main":
+            # Note: main.py doesn't seem to have a revision flag, but we log it for reference
+            sys.stderr.write(f"⚠️ Note: Revision '{revision}' specified but main.py doesn't support --revision flag\n")
+            sys.stderr.flush()
+
+        sys.stderr.write(f"π Running command: {' '.join(cmd)}\n")
+        sys.stderr.flush()
+
+        # Change to model-tracing directory and run the command
+        original_cwd = os.getcwd()
+        try:
+            os.chdir(model_tracing_path)
+            sys.stderr.write(f"π Changed to directory: {model_tracing_path}\n")
+            sys.stderr.flush()
+
+            # Run the subprocess
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=3600  # 1 hour timeout
+            )
+
+            sys.stderr.write(f"π Subprocess completed with return code: {result.returncode}\n")
+
+            # Log stdout and stderr from the subprocess
+            if result.stdout:
+                sys.stderr.write(f"π STDOUT from model tracing:\n{result.stdout}\n")
+            if result.stderr:
+                sys.stderr.write(f"⚠️ STDERR from model tracing:\n{result.stderr}\n")
+            sys.stderr.flush()
+
+            if result.returncode != 0:
+                error_msg = f"Model tracing script failed with return code {result.returncode}"
+                if result.stderr:
+                    error_msg += f"\nSTDERR: {result.stderr}"
+                return False, error_msg
+
+        finally:
+            os.chdir(original_cwd)
+            sys.stderr.write(f"π Changed back to directory: {original_cwd}\n")
+            sys.stderr.flush()
+
+        # Load and parse the results
+        try:
+            sys.stderr.write(f"π Loading results from: {tmp_results_path}\n")
+            sys.stderr.flush()
+
+            with open(tmp_results_path, 'rb') as f:
+                results = pickle.load(f)
+
+            sys.stderr.write(f"✅ Results loaded successfully\n")
+            sys.stderr.write(f"π Available result keys: {list(results.keys())}\n")
+            sys.stderr.flush()
+
+            # Get the aligned test stat (this is what we want with --align flag)
+            if "aligned test stat" in results:
+                aligned_stat = results["aligned test stat"]
+                sys.stderr.write(f"π Aligned test stat: {aligned_stat}\n")
+                sys.stderr.write(f"π Type: {type(aligned_stat)}\n")
+
+                # The match statistic returns a list of p-values per layer
+                if isinstance(aligned_stat, list):
+                    sys.stderr.write(f"π List of {len(aligned_stat)} p-values: {aligned_stat}\n")
+
+                    # Filter valid p-values
+                    valid_p_values = [p for p in aligned_stat if p is not None and isinstance(p, (int, float)) and 0 <= p <= 1]
+                    sys.stderr.write(f"π Valid p-values: {len(valid_p_values)}/{len(aligned_stat)}\n")
+
+                    if valid_p_values:
+                        # Use median as the representative p-value
+                        aggregate_p_value = statistics.median(valid_p_values)
+                        sys.stderr.write(f"π Using median p-value: {aggregate_p_value}\n")
+                    else:
+                        sys.stderr.write("⚠️ No valid p-values found, using default\n")
+                        aggregate_p_value = 1.0
+
+                elif isinstance(aligned_stat, (int, float)):
+                    aggregate_p_value = float(aligned_stat)
+                    sys.stderr.write(f"π Using single p-value: {aggregate_p_value}\n")
+                else:
+                    sys.stderr.write(f"⚠️ Unexpected aligned_stat type: {type(aligned_stat)}, using default\n")
+                    aggregate_p_value = 1.0
+
+            else:
+                sys.stderr.write("⚠️ No 'aligned test stat' found in results, checking non-aligned\n")
+                if "non-aligned test stat" in results:
+                    non_aligned_stat = results["non-aligned test stat"]
+                    sys.stderr.write(f"π Using non-aligned test stat: {non_aligned_stat}\n")
+
+                    if isinstance(non_aligned_stat, list):
+                        valid_p_values = [p for p in non_aligned_stat if p is not None and isinstance(p, (int, float)) and 0 <= p <= 1]
+                        if valid_p_values:
+                            aggregate_p_value = statistics.median(valid_p_values)
+                        else:
+                            aggregate_p_value = 1.0
+                    else:
+                        aggregate_p_value = float(non_aligned_stat) if isinstance(non_aligned_stat, (int, float)) else 1.0
+                else:
+                    sys.stderr.write("❌ No test stat found in results\n")
+                    return False, "No test statistic found in results"
+
+            sys.stderr.flush()
+
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load results: {e}\n")
+            sys.stderr.flush()
+            return False, f"Failed to load results: {e}"
+
+        finally:
+            # Clean up temporary file
+            try:
+                os.unlink(tmp_results_path)
+                sys.stderr.write(f"ποΈ Cleaned up temporary file: {tmp_results_path}\n")
+            except:
+                pass
+
-        sys.stderr.write(f"
+        sys.stderr.write(f"✅ Final aggregate p-value: {aggregate_p_value}\n")
         sys.stderr.write("=== MODEL TRACE ANALYSIS COMPLETED ===\n")
         sys.stderr.flush()
 
-        # Clean up memory
-        del base_model
-        del ft_model
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
         return True, aggregate_p_value
 
+    except subprocess.TimeoutExpired:
+        sys.stderr.write("❌ Model tracing analysis timed out after 1 hour\n")
+        sys.stderr.flush()
+        return False, "Analysis timed out"
+
     except Exception as e:
         error_msg = str(e)
-        sys.stderr.write(f"Error in model trace analysis: {error_msg}\n")
+        sys.stderr.write(f"π₯ Error in model trace analysis: {error_msg}\n")
         import traceback
         sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
         sys.stderr.flush()
-
-        # Clean up memory even on error
-        try:
-            torch.cuda.empty_cache() if torch.cuda.is_available() else None
-        except:
-            pass
-
         return False, error_msg
 
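A short usage sketch of the rewritten entry point; the (success, result) contract and the subprocess command come from the code above, while the calling context and the chosen model are illustrative:

from src.evaluation.model_trace_eval import run_model_trace_analysis

# Runs `python main.py --base_model_id meta-llama/Llama-2-7b-hf --ft_model_id lmsys/vicuna-7b-v1.5 --stat match --save <tmp>.pkl`
# inside the model-tracing checkout; may take 10-30+ minutes and needs the model weights to be downloadable.
success, result = run_model_trace_analysis("lmsys/vicuna-7b-v1.5", revision="main", precision="float16")
if success:
    print(f"Aggregate match p-value vs Llama-2-7B: {result}")   # median of the per-layer p-values
else:
    print(f"Model tracing failed: {result}")                    # error message string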
src/leaderboard/read_evals.py
CHANGED
@@ -8,6 +8,7 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 from src.evaluation.model_trace_eval import compute_model_trace_p_value
+from src.evaluation.initialize_models import is_model_allowed
 
 @dataclass
 class EvalResult:
@@ -236,6 +237,13 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
         try:
             sys.stderr.write(f"\nConverting result to dict for: {v.full_model}\n")
             sys.stderr.flush()
+
+            # Filter to only allowed models
+            if not is_model_allowed(v.full_model):
+                sys.stderr.write(f"βοΈ Skipping non-allowed model: {v.full_model}\n")
+                sys.stderr.flush()
+                continue
+
             v.to_dict()  # we test if the dict version is complete
             results.append(v)
             sys.stderr.write("Successfully converted and added result\n")
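The effect of the new filter, sketched with plain strings standing in for EvalResult objects: any result file whose model is not in ALLOWED_MODELS is skipped before it reaches the leaderboard table.

from src.evaluation.initialize_models import is_model_allowed

found = ["lmsys/vicuna-7b-v1.5", "openai-community/gpt2", "EleutherAI/llemma_7b"]
kept = [m for m in found if is_model_allowed(m)]
print(kept)  # ['lmsys/vicuna-7b-v1.5', 'EleutherAI/llemma_7b']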
test_model_trace.py
DELETED
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test script for model tracing integration.
-Tests the p-value computation for a simple model comparison.
-"""
-
-import sys
-import os
-
-# Add src to path
-sys.path.append('src')
-
-from evaluation.model_trace_eval import compute_model_trace_p_value
-
-def test_model_trace():
-    """Test the model trace p-value computation with a simple example."""
-
-    print("Testing model trace p-value computation...")
-
-    # Test with a simple model (should be fast)
-    test_model = "openai-community/gpt2"
-
-    print(f"Computing p-value for {test_model} vs GPT-2...")
-
-    try:
-        p_value = compute_model_trace_p_value(test_model, "main", "float16")
-
-        if p_value is not None:
-            print(f"✅ Success! P-value: {p_value}")
-            if 0 <= p_value <= 1:
-                print("✅ P-value is in valid range [0, 1]")
-            else:
-                print(f"⚠️ Warning: P-value {p_value} is outside expected range [0, 1]")
-        else:
-            print("❌ Failed: P-value is None")
-
-    except Exception as e:
-        print(f"❌ Error: {e}")
-        import traceback
-        traceback.print_exc()
-
-if __name__ == "__main__":
-    test_model_trace()