Spaces: Runtime error

Ahmed Ahmed committed · Commit 1dd4b6a · Parent(s): 86c1853
"lets see"

Files changed:
- app.py +82 -10
- model-tracing +1 -0
- requirements.txt +12 -1
- src/about.py +20 -3
- src/display/utils.py +2 -0
- src/evaluation/model_trace_eval.py +310 -0
- src/leaderboard/read_evals.py +29 -0
- test_model_trace.py +43 -0
app.py CHANGED
@@ -22,23 +22,66 @@ from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
 
 def create_results_dataframe():
     """Create and return the results DataFrame for display"""
+    import sys
+
+    sys.stderr.write("\nCREATE_RESULTS_DATAFRAME CALLED\n")
+    sys.stderr.flush()
+
     df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+
+    sys.stderr.write(f"Retrieved leaderboard df: {df.shape if df is not None else 'None'}\n")
+    sys.stderr.flush()
+
     if df is None or df.empty:
+        sys.stderr.write("DataFrame is None or empty, returning empty DataFrame\n")
+        sys.stderr.flush()
         # Return empty DataFrame with proper columns
-        return pd.DataFrame(columns=["Model", "Perplexity", "Average Score", "Type", "Precision"])
+        return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])
 
-
-
+    sys.stderr.write(f"Original DataFrame columns: {list(df.columns)}\n")
+    sys.stderr.flush()
+
+    # Check if required columns exist
+    required_cols = [
         AutoEvalColumn.model.name,
-        "Perplexity",
+        "Perplexity",
+        AutoEvalColumn.model_trace_p_value.name,
         AutoEvalColumn.average.name,
         AutoEvalColumn.model_type.name,
         AutoEvalColumn.precision.name,
-    ]
+    ]
+
+    missing_cols = [col for col in required_cols if col not in df.columns]
+    if missing_cols:
+        sys.stderr.write(f"Missing columns in DataFrame: {missing_cols}\n")
+        sys.stderr.flush()
+        # Add missing columns with default values
+        for col in missing_cols:
+            if col == AutoEvalColumn.model_trace_p_value.name:
+                df[col] = None
+                sys.stderr.write(f"Added {col} column with None values\n")
+
+    # Select and rename columns for display
+    try:
+        display_df = df[required_cols].copy()
+        sys.stderr.write(f"Selected columns successfully: {list(display_df.columns)}\n")
+    except Exception as e:
+        sys.stderr.write(f"Error selecting columns: {e}\n")
+        sys.stderr.flush()
+        return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])
 
     # Rename columns for better display
-    display_df.columns = ["Model", "Perplexity", "Average Score", "Type", "Precision"]
+    display_df.columns = ["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"]
+
+    sys.stderr.write(f"Final display DataFrame shape: {display_df.shape}\n")
+    sys.stderr.write(f"Final columns: {list(display_df.columns)}\n")
+
+    # Check p-value column
+    if "Match P-Value" in display_df.columns:
+        p_value_stats = display_df["Match P-Value"].describe()
+        sys.stderr.write(f"P-Value column stats:\n{p_value_stats}\n")
 
+    sys.stderr.flush()
     return display_df
 
 def run_perplexity_test(model_name, revision, precision):
@@ -66,15 +109,23 @@ def run_perplexity_test(model_name, revision, precision):
             sys.stderr.write("Evaluation succeeded - updating both results tables\n")
             sys.stderr.flush()
 
-            # Get updated results
+            # Get updated results (this will trigger model trace p-value computation for the new model)
+            sys.stderr.write("Creating updated results DataFrame (may compute model trace p-values)...\n")
+            sys.stderr.flush()
+
             updated_df = create_results_dataframe()
 
+            sys.stderr.write("Updated DataFrame created successfully\n")
+            sys.stderr.flush()
+
             success_msg = f"""**Perplexity evaluation completed successfully!**
 
 **Model**: {model_name}
 **Perplexity Score**: {result:.4f}
 
-**Results have been saved and both tables have been updated!**"""
+**Results have been saved and both tables have been updated!**
+
+Note: Model trace p-value computation may take additional time and will appear in the logs."""
 
             return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
         else:
@@ -117,9 +168,21 @@ except Exception as e:
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
 
 # Get initial results data
+import sys
+sys.stderr.write("\nSTARTING GRADIO APP INITIALIZATION\n")
+sys.stderr.write("Creating initial results DataFrame...\n")
+sys.stderr.flush()
+
 RESULTS_DF = create_results_dataframe()
 
+sys.stderr.write(f"Initial DataFrame created with shape: {RESULTS_DF.shape}\n")
+sys.stderr.write(f"Columns: {list(RESULTS_DF.columns)}\n")
+sys.stderr.flush()
+
 # Create the Gradio interface
+sys.stderr.write("Creating Gradio interface...\n")
+sys.stderr.flush()
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -130,7 +193,7 @@ with demo:
         gr.Markdown("## Model Evaluation Results")
         results_table = gr.DataFrame(
             value=RESULTS_DF,
-            headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+            headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
            interactive=False,
             wrap=False
         )
@@ -159,7 +222,7 @@
         gr.Markdown("## Live Results")
         live_results_table = gr.DataFrame(
             value=RESULTS_DF,
-            headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+            headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
             interactive=False,
             wrap=False
         )
@@ -184,4 +247,13 @@ with demo:
         [result, live_results_table, results_table]
     )
 
+sys.stderr.write("GRADIO INTERFACE SETUP COMPLETE\n")
+sys.stderr.write("LAUNCHING GRADIO APP WITH MODEL TRACING INTEGRATION\n")
+sys.stderr.write("Features enabled:\n")
+sys.stderr.write(" - Perplexity evaluation\n")
+sys.stderr.write(" - Model trace p-value computation (vs GPT-2 base)\n")
+sys.stderr.write(" - Match statistic with alignment\n")
+sys.stderr.write("Ready to accept requests!\n")
+sys.stderr.flush()
+
 demo.queue(default_concurrency_limit=5).launch()
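For reference, the click-handler pattern the hunks above rely on — one callback returning gr.update(value=...) for each output so both DataFrames refresh together — can be sketched in isolation. This is a minimal illustrative sketch, not the Space's actual app.py; the dummy refresh() function and its sample row stand in for create_results_dataframe() and the real leaderboard data.

import gradio as gr
import pandas as pd

def refresh():
    # Stand-in for create_results_dataframe(); the row below is dummy data.
    df = pd.DataFrame({"Model": ["gpt2"], "Perplexity": [29.4], "Match P-Value": [0.01]})
    # One gr.update per output component refreshes both tables from a single callback.
    return gr.update(value=df), gr.update(value=df)

with gr.Blocks() as demo:
    results_table = gr.DataFrame(interactive=False)
    live_results_table = gr.DataFrame(interactive=False)
    refresh_btn = gr.Button("Refresh")
    refresh_btn.click(refresh, inputs=None, outputs=[results_table, live_results_table])

if __name__ == "__main__":
    demo.launch()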
model-tracing ADDED
@@ -0,0 +1 @@
+Subproject commit 9eb3b67655be2a3576348a6d482e69c62f72fc3e
requirements.txt CHANGED
@@ -15,4 +15,15 @@ transformers>=4.30.0
 tokenizers>=0.15.0
 sentencepiece
 torch>=2.0.0
-accelerate>=0.20.0
+accelerate>=0.20.0
+# Model tracing dependencies
+PyYAML==6.0.1
+scipy==1.13.1
+protobuf==5.27.1
+zstandard==0.22.0
+ipdb==0.13.13
+# Development dependencies for model tracing
+ruff==0.1.8
+pre-commit==3.5.0
+nbqa==1.7.1
+ipykernel==6.29.0
src/about.py CHANGED
@@ -21,17 +21,34 @@ TITLE = """<h1 align="center" id="space-title">Model Perplexity Leaderboard</h1>
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard evaluates language models based on their perplexity scores on a fixed test passage
-
+This leaderboard evaluates language models based on their perplexity scores on a fixed test passage and
+structural similarity to GPT-2 using model tracing analysis.
+
+- **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
+- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to GPT-2 after fine-tuning (neuron organization is maintained).
 """
 
 # Which evaluations are you running?
 LLM_BENCHMARKS_TEXT = """
 ## How it works
 
-The evaluation runs
+The evaluation runs two types of analysis on language models:
+
+### 1. Perplexity Evaluation
+Perplexity tests using a fixed test passage about artificial intelligence.
 Perplexity measures how well a model predicts text - lower scores mean better predictions.
 
+### 2. Model Tracing Analysis
+Compares each model's internal structure to GPT-2 using the "match" statistic with alignment:
+- **Base Model**: GPT-2 (`openai-community/gpt2`)
+- **Comparison**: Each model on the leaderboard
+- **Method**: Neuron matching analysis across transformer layers
+- **Alignment**: Models are aligned before comparison using the Hungarian algorithm
+- **Output**: P-value indicating structural similarity (lower = more similar to GPT-2)
+
+The match statistic tests whether neurons in corresponding layers maintain similar functional roles
+between the base model and fine-tuned variants.
+
 ## Test Text
 
 The evaluation uses the following passage:
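For context on the perplexity metric described in INTRODUCTION_TEXT: perplexity is the exponential of the average next-token cross-entropy of the passage under the model. A minimal illustrative sketch of such a computation with transformers follows; it is not the Space's actual evaluation code (which presumably lives behind run_dynamic_perplexity_eval in src/evaluation/dynamic_eval.py), and the function name and example passage are invented for illustration.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def passage_perplexity(model_name: str, passage: str) -> float:
    """Return exp(mean next-token NLL) of `passage` under `model_name` (illustrative sketch)."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()

    inputs = tokenizer(passage, return_tensors="pt")
    with torch.no_grad():
        # With labels=input_ids, the model returns the mean cross-entropy loss
        # over next-token predictions; perplexity is its exponential.
        outputs = model(**inputs, labels=inputs["input_ids"])
    return torch.exp(outputs.loss).item()

# Example usage: lower values mean the model predicts the passage better.
# print(passage_perplexity("openai-community/gpt2", "Artificial intelligence is ..."))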
src/display/utils.py CHANGED
@@ -34,6 +34,8 @@ for task in Tasks:
     sys.stderr.write(f"Adding task column: {task.name} -> column name: {task_col_name}\n")
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task_col_name, "number", True)])
     sys.stderr.flush()
+# Model tracing p-value column
+auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/evaluation/model_trace_eval.py ADDED
@@ -0,0 +1,310 @@
+"""
+Model tracing evaluation for computing p-values from neuron matching statistics.
+
+This module runs the model-tracing comparison between a base model (gpt2) and
+fine-tuned models to determine structural similarity via p-value analysis.
+"""
+
+import os
+import sys
+import subprocess
+import tempfile
+import pickle
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# Add model-tracing to path
+model_tracing_path = os.path.join(os.path.dirname(__file__), '../../model-tracing')
+if model_tracing_path not in sys.path:
+    sys.path.append(model_tracing_path)
+
+sys.stderr.write("ATTEMPTING TO IMPORT MODEL TRACING DEPENDENCIES...\n")
+sys.stderr.flush()
+
+try:
+    sys.stderr.write(" - Importing tracing.utils.llama.model...\n")
+    from tracing.utils.llama.model import permute_model, rotate_model
+
+    sys.stderr.write(" - Importing tracing.utils.llama.matching...\n")
+    from tracing.utils.llama.matching import align_model
+
+    sys.stderr.write(" - Importing tracing.utils.evaluate...\n")
+    from tracing.utils.evaluate import prepare_hf_dataset, prepare_hf_dataloader
+
+    sys.stderr.write(" - Importing tracing.utils.utils...\n")
+    from tracing.utils.utils import manual_seed
+
+    sys.stderr.write(" - Importing tracing.statistics.match...\n")
+    from tracing.statistics.match import statistic as match_stat
+
+    MODEL_TRACING_AVAILABLE = True
+    sys.stderr.write("ALL MODEL TRACING IMPORTS SUCCESSFUL\n")
+
+except ImportError as e:
+    sys.stderr.write(f"MODEL TRACING IMPORTS FAILED: {e}\n")
+    import traceback
+    sys.stderr.write(f"Full import traceback:\n{traceback.format_exc()}\n")
+    MODEL_TRACING_AVAILABLE = False
+
+sys.stderr.write(f"Final MODEL_TRACING_AVAILABLE = {MODEL_TRACING_AVAILABLE}\n")
+sys.stderr.flush()
+
+
+def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
+    """
+    Run model tracing analysis comparing ft_model against gpt2 base.
+
+    Args:
+        ft_model_name: HuggingFace model identifier for the fine-tuned model
+        revision: Model revision/commit hash
+        precision: Model precision (float16, bfloat16)
+
+    Returns:
+        tuple: (success: bool, result: float or error_message)
+               If success, result is the aggregate p-value
+               If failure, result is error message
+    """
+
+    if not MODEL_TRACING_AVAILABLE:
+        return False, "Model tracing dependencies not available"
+
+    try:
+        sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS ===\n")
+        sys.stderr.write(f"Base model: openai-community/gpt2\n")
+        sys.stderr.write(f"Fine-tuned model: {ft_model_name}\n")
+        sys.stderr.write(f"Revision: {revision}\n")
+        sys.stderr.write(f"Precision: {precision}\n")
+        sys.stderr.flush()
+
+        # Set random seed for reproducibility
+        manual_seed(0)
+
+        # Determine dtype
+        if precision == "bfloat16":
+            dtype = torch.bfloat16
+        else:
+            dtype = torch.float16
+
+        # Load base model (gpt2)
+        base_model_id = "openai-community/gpt2"
+        sys.stderr.write(f"Loading base model: {base_model_id}\n")
+        sys.stderr.write(f" - dtype: {dtype}\n")
+        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
+        sys.stderr.flush()
+
+        try:
+            base_model = AutoModelForCausalLM.from_pretrained(
+                base_model_id,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+            sys.stderr.write("Base model loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"Failed to load base model: {e}\n")
+            raise
+
+        try:
+            base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)
+            sys.stderr.write("Base tokenizer loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"Failed to load base tokenizer: {e}\n")
+            raise
+
+        # Load fine-tuned model
+        sys.stderr.write(f"Loading fine-tuned model: {ft_model_name}\n")
+        sys.stderr.write(f" - revision: {revision}\n")
+        sys.stderr.write(f" - dtype: {dtype}\n")
+        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
+        sys.stderr.flush()
+
+        try:
+            ft_model = AutoModelForCausalLM.from_pretrained(
+                ft_model_name,
+                revision=revision,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+            sys.stderr.write("Fine-tuned model loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"Failed to load fine-tuned model: {e}\n")
+            raise
+
+        try:
+            ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name, revision=revision, use_fast=False)
+            sys.stderr.write("Fine-tuned tokenizer loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"Failed to load fine-tuned tokenizer: {e}\n")
+            raise
+
+        sys.stderr.write("ALL MODELS AND TOKENIZERS LOADED SUCCESSFULLY\n")
+
+        # Show memory info if available
+        if torch.cuda.is_available():
+            memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
+            memory_reserved = torch.cuda.memory_reserved() / 1024**3  # GB
+            sys.stderr.write(f"GPU Memory - Allocated: {memory_allocated:.2f}GB, Reserved: {memory_reserved:.2f}GB\n")
+
+        sys.stderr.flush()
+
+        # Prepare dataset (using wikitext like in the original)
+        sys.stderr.write("Preparing dataset...\n")
+        sys.stderr.flush()
+
+        block_size = 512
+        batch_size = 1
+        dataset = prepare_hf_dataset("dlwh/wikitext_103_detokenized", block_size, base_tokenizer)
+        dataloader = prepare_hf_dataloader(dataset, batch_size)
+
+        sys.stderr.write("Dataset prepared\n")
+        sys.stderr.flush()
+
+        # Run alignment (--align flag)
+        sys.stderr.write("Running model alignment...\n")
+        sys.stderr.flush()
+
+        try:
+            align_model(base_model, ft_model, ft_model)
+            sys.stderr.write("Model alignment completed\n")
+        except Exception as e:
+            sys.stderr.write(f"Model alignment failed: {e}\n")
+            sys.stderr.write("Continuing without alignment...\n")
+            sys.stderr.flush()
+
+        # Run match statistic
+        sys.stderr.write("Computing match statistic...\n")
+        sys.stderr.flush()
+
+        # Get number of layers for the models
+        if hasattr(base_model, 'transformer') and hasattr(base_model.transformer, 'h'):
+            # GPT-2 style
+            n_blocks = len(base_model.transformer.h)
+        elif hasattr(base_model, 'model') and hasattr(base_model.model, 'layers'):
+            # LLaMA style
+            n_blocks = len(base_model.model.layers)
+        else:
+            # Default fallback
+            n_blocks = 12  # GPT-2 base has 12 layers
+
+        # Check if fine-tuned model has compatible architecture
+        ft_n_blocks = n_blocks
+        if hasattr(ft_model, 'transformer') and hasattr(ft_model.transformer, 'h'):
+            ft_n_blocks = len(ft_model.transformer.h)
+        elif hasattr(ft_model, 'model') and hasattr(ft_model.model, 'layers'):
+            ft_n_blocks = len(ft_model.model.layers)
+
+        # Use minimum number of blocks to avoid index errors
+        n_blocks = min(n_blocks, ft_n_blocks)
+
+        sys.stderr.write(f"Using {n_blocks} blocks for analysis\n")
+        sys.stderr.flush()
+
+        # Run the match statistic - returns list of p-values per layer
+        try:
+            p_values = match_stat(base_model, ft_model, dataloader, n_blocks=n_blocks)
+        except Exception as e:
+            sys.stderr.write(f"Match statistic computation failed: {e}\n")
+            sys.stderr.flush()
+            # Return a default high p-value indicating no similarity
+            return True, 1.0
+
+        sys.stderr.write(f"Match statistic computed: {len(p_values)} p-values\n")
+        sys.stderr.flush()
+
+        # Filter out None/NaN values
+        valid_p_values = [p for p in p_values if p is not None and not (isinstance(p, float) and (p != p or p < 0 or p > 1))]
+
+        if not valid_p_values:
+            sys.stderr.write("No valid p-values found, returning default\n")
+            sys.stderr.flush()
+            return True, 1.0
+
+        # Calculate aggregate p-value using Fisher's method
+        from tracing.utils.utils import fisher
+        try:
+            aggregate_p_value = fisher(valid_p_values)
+        except Exception as e:
+            sys.stderr.write(f"Fisher's method failed: {e}\n")
+            sys.stderr.flush()
+            # Use the mean of valid p-values as fallback
+            aggregate_p_value = sum(valid_p_values) / len(valid_p_values)
+
+        sys.stderr.write(f"Aggregate p-value: {aggregate_p_value}\n")
+        sys.stderr.write("=== MODEL TRACE ANALYSIS COMPLETED ===\n")
+        sys.stderr.flush()
+
+        # Clean up memory
+        del base_model
+        del ft_model
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        return True, aggregate_p_value
+
+    except Exception as e:
+        error_msg = str(e)
+        sys.stderr.write(f"Error in model trace analysis: {error_msg}\n")
+        import traceback
+        sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
+        sys.stderr.flush()
+
+        # Clean up memory even on error
+        try:
+            torch.cuda.empty_cache() if torch.cuda.is_available() else None
+        except:
+            pass
+
+        return False, error_msg
+
+
+def compute_model_trace_p_value(model_name, revision="main", precision="float16"):
+    """
+    Wrapper function to compute model trace p-value for a single model.
+
+    Args:
+        model_name: HuggingFace model identifier
+        revision: Model revision
+        precision: Model precision
+
+    Returns:
+        float or None: P-value if successful, None if failed
+    """
+    sys.stderr.write(f"\n{'='*60}\n")
+    sys.stderr.write(f"COMPUTE_MODEL_TRACE_P_VALUE CALLED\n")
+    sys.stderr.write(f"Model: {model_name}\n")
+    sys.stderr.write(f"Revision: {revision}\n")
+    sys.stderr.write(f"Precision: {precision}\n")
+    sys.stderr.write(f"Model tracing available: {MODEL_TRACING_AVAILABLE}\n")
+    sys.stderr.write(f"{'='*60}\n")
+    sys.stderr.flush()
+
+    if not MODEL_TRACING_AVAILABLE:
+        sys.stderr.write("MODEL TRACING NOT AVAILABLE - returning None\n")
+        sys.stderr.flush()
+        return None
+
+    try:
+        sys.stderr.write("Starting model trace analysis...\n")
+        sys.stderr.flush()
+
+        success, result = run_model_trace_analysis(model_name, revision, precision)
+
+        sys.stderr.write(f"Analysis completed - Success: {success}, Result: {result}\n")
+        sys.stderr.flush()
+
+        if success:
+            sys.stderr.write(f"SUCCESS: Returning p-value {result}\n")
+            sys.stderr.flush()
+            return result
+        else:
+            sys.stderr.write(f"FAILED: {result}\n")
+            sys.stderr.write("Returning None as fallback\n")
+            sys.stderr.flush()
+            return None
+
+    except Exception as e:
+        sys.stderr.write(f"CRITICAL ERROR in compute_model_trace_p_value for {model_name}:\n")
+        sys.stderr.write(f"Exception: {e}\n")
+        import traceback
+        sys.stderr.write(f"Full traceback:\n{traceback.format_exc()}\n")
+        sys.stderr.write("Returning None as fallback\n")
+        sys.stderr.flush()
+        return None
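The module above aggregates the per-layer match p-values with the repo's fisher helper and falls back to their mean. Below is a minimal illustrative sketch of the same aggregation using scipy.stats.combine_pvalues (scipy is already pinned in requirements.txt); the helper name and structure here are assumptions for illustration, not the repo's tracing.utils.utils.fisher.

from scipy.stats import combine_pvalues

def aggregate_match_p_values(p_values):
    """Combine per-layer p-values into one score (sketch; mirrors the module's fallback logic)."""
    # Keep only well-formed p-values in [0, 1], as the module does.
    valid = [p for p in p_values if p is not None and 0.0 <= p <= 1.0]
    if not valid:
        return 1.0  # default: no evidence of structural similarity
    try:
        # Fisher's method: statistic = -2 * sum(log p_i), chi-square with 2k degrees of freedom.
        _, combined = combine_pvalues(valid, method="fisher")
        return combined
    except Exception:
        # Mean of the valid p-values as a crude fallback, matching the module above.
        return sum(valid) / len(valid)

# Example usage: a few strongly significant layers dominate the combined value.
# print(aggregate_match_p_values([0.001, 0.2, 0.5]))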
src/leaderboard/read_evals.py CHANGED
@@ -7,6 +7,7 @@ from dataclasses import dataclass
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
+from src.evaluation.model_trace_eval import compute_model_trace_p_value
 
 @dataclass
 class EvalResult:
@@ -131,6 +132,34 @@ class EvalResult:
         data_dict[AutoEvalColumn.params.name] = 0
         data_dict[AutoEvalColumn.likes.name] = 0
 
+        # Compute model trace p-value
+        sys.stderr.write(f"\nCOMPUTING MODEL TRACE P-VALUE FOR: {self.full_model}\n")
+        sys.stderr.write(f" - Revision: {self.revision if self.revision else 'main'}\n")
+        sys.stderr.write(f" - Precision: {self.precision.value.name.lower()}\n")
+        sys.stderr.flush()
+
+        try:
+            model_trace_p_value = compute_model_trace_p_value(
+                self.full_model,
+                self.revision if self.revision else "main",
+                self.precision.value.name.lower()
+            )
+
+            if model_trace_p_value is not None:
+                sys.stderr.write(f"Model trace p-value computed successfully: {model_trace_p_value}\n")
+            else:
+                sys.stderr.write(f"Model trace p-value is None (computation failed or not available)\n")
+
+        except Exception as e:
+            sys.stderr.write(f"Exception during model trace p-value computation: {e}\n")
+            import traceback
+            sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
+            model_trace_p_value = None
+
+        data_dict[AutoEvalColumn.model_trace_p_value.name] = model_trace_p_value
+        sys.stderr.write(f"Added to data_dict: {AutoEvalColumn.model_trace_p_value.name} = {model_trace_p_value}\n")
+        sys.stderr.flush()
+
         sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
         sys.stderr.flush()
 
test_model_trace.py ADDED
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Test script for model tracing integration.
+Tests the p-value computation for a simple model comparison.
+"""
+
+import sys
+import os
+
+# Add src to path
+sys.path.append('src')
+
+from evaluation.model_trace_eval import compute_model_trace_p_value
+
+def test_model_trace():
+    """Test the model trace p-value computation with a simple example."""
+
+    print("Testing model trace p-value computation...")
+
+    # Test with a simple model (should be fast)
+    test_model = "openai-community/gpt2"
+
+    print(f"Computing p-value for {test_model} vs GPT-2...")
+
+    try:
+        p_value = compute_model_trace_p_value(test_model, "main", "float16")
+
+        if p_value is not None:
+            print(f"Success! P-value: {p_value}")
+            if 0 <= p_value <= 1:
+                print("P-value is in valid range [0, 1]")
+            else:
+                print(f"Warning: P-value {p_value} is outside expected range [0, 1]")
+        else:
+            print("Failed: P-value is None")
+
+    except Exception as e:
+        print(f"Error: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    test_model_trace()