Commit 1191811 · Parent(s): 1bac1ed
Author: Ahmed Ahmed
Message: try again

Files changed:
- app.py +27 -118
- src/about.py +9 -25
- src/display/utils.py +2 -10
- src/evaluation/initialize_models.py +1 -3
- src/leaderboard/read_evals.py +7 -37
app.py CHANGED

@@ -18,7 +18,6 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
 from src.populate import get_leaderboard_df
-from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval

 def create_results_dataframe():
     """Create and return the results DataFrame for display"""
@@ -36,17 +35,15 @@ def create_results_dataframe():
         sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
         sys.stderr.flush()
         # Return empty DataFrame with proper columns
-        return pd.DataFrame(columns=["Model", "
+        return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])

     sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
     sys.stderr.flush()

-    # Check if required columns exist
+    # Check if required columns exist - only p-values matter
     required_cols = [
         AutoEvalColumn.model.name,
-        "Perplexity",
         AutoEvalColumn.model_trace_p_value.name,
-        AutoEvalColumn.average.name,
         AutoEvalColumn.model_type.name,
         AutoEvalColumn.precision.name,
     ]
@@ -68,10 +65,10 @@ def create_results_dataframe():
     except Exception as e:
         sys.stderr.write(f"💥 Error selecting columns: {e}\n")
         sys.stderr.flush()
-        return pd.DataFrame(columns=["Model", "
+        return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])

     # Rename columns for better display
-    display_df.columns = ["Model", "
+    display_df.columns = ["Model", "Match P-Value", "Type", "Precision"]

     sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
     sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")
@@ -84,64 +81,7 @@ def create_results_dataframe():
     sys.stderr.flush()
     return display_df

-
-    """Run perplexity evaluation on demand."""
-    import sys
-    import traceback
-    import gradio as gr
-    from src.evaluation.initialize_models import is_model_allowed
-
-    if not model_name:
-        return "Please select a model.", gr.update(), gr.update()
-
-    if not is_model_allowed(model_name):
-        return f"❌ Model '{model_name}' is not in the allowed list. Please select from the dropdown.", gr.update(), gr.update()
-
-    try:
-        # Use stderr for more reliable logging in HF Spaces
-        sys.stderr.write(f"\n=== RUNNING PERPLEXITY TEST ===\n")
-        sys.stderr.write(f"Model: {model_name}\n")
-        sys.stderr.write(f"Revision: {revision}\n")
-        sys.stderr.write(f"Precision: {precision}\n")
-        sys.stderr.flush()
-
-        success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
-        sys.stderr.write(f"Evaluation result - Success: {success}, Result: {result}\n")
-        sys.stderr.flush()
-
-        if success:
-            sys.stderr.write("Evaluation succeeded - updating both results tables\n")
-            sys.stderr.flush()
-
-            # Get updated results (this will trigger model trace p-value computation for the new model)
-            sys.stderr.write("🔄 Creating updated results DataFrame (may compute model trace p-values)...\n")
-            sys.stderr.flush()
-
-            updated_df = create_results_dataframe()
-
-            sys.stderr.write("✅ Updated DataFrame created successfully\n")
-            sys.stderr.flush()
-
-            success_msg = f"""✅ **Perplexity evaluation completed successfully!**
-
-**Model**: {model_name}
-**Perplexity Score**: {result:.4f}
-
-🎉 **Results have been saved and both tables have been updated!**
-
-⏰ **Note**: Model trace p-value computation runs a full model comparison analysis and may take 10-30 minutes per model. Progress will appear in the logs."""
-
-            return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
-        else:
-            return f"❌ **Evaluation failed**: {result}", gr.update(), gr.update()
-
-    except Exception as e:
-        error_msg = str(e)
-        traceback_str = traceback.format_exc()
-        sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
-        sys.stderr.write(f"Traceback: {traceback_str}\n")
-        sys.stderr.flush()
-        return f"❌ **Critical error**: {error_msg}", gr.update(), gr.update()
+# Perplexity testing removed - we only focus on p-values now

 # Initialize results repository and directory
 try:
@@ -173,7 +113,7 @@ except Exception as e:

 # Initialize allowed models
 import sys
-from src.evaluation.initialize_models import initialize_allowed_models
+from src.evaluation.initialize_models import initialize_allowed_models

 sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
 sys.stderr.write("📊 Initializing allowed models...\n")
@@ -205,7 +145,7 @@ with demo:
             gr.Markdown("## Model Evaluation Results")
             results_table = gr.DataFrame(
                 value=RESULTS_DF,
-                headers=["Model", "
+                headers=["Model", "Match P-Value", "Type", "Precision"],
                 interactive=False,
                 wrap=False
             )
@@ -213,66 +153,35 @@ with demo:
         with gr.TabItem("📝 About", elem_id="about-tab", id=1):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("
-            gr.Markdown("## 
-
-            allowed_models = get_allowed_models()
-
-            with gr.Row():
-                with gr.Column():
-                    model_name = gr.Dropdown(
-                        choices=allowed_models,
-                        label="Model name",
-                        value=allowed_models[0] if allowed_models else None
-                    )
-                    revision = gr.Textbox(label="Revision", placeholder="main", value="main")
-                    precision = gr.Dropdown(
-                        choices=["float16", "bfloat16"],
-                        label="Precision",
-                        value="float16"
-                    )
-                    debug_mode = gr.Checkbox(label="Enable debug mode (more verbose logging)", value=True)
-
-                with gr.Column():
-                    test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
-                    result = gr.Markdown()
-
-            gr.Markdown("## Live Results")
-            live_results_table = gr.DataFrame(
-                value=RESULTS_DF,
-                headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
-                interactive=False,
-                wrap=False
-            )
+        with gr.TabItem("🔬 Analysis", elem_id="analysis-tab", id=2):
+            gr.Markdown("## Model Tracing Analysis\n\nP-values are computed automatically for all supported models.")

             gr.Markdown("""
-            ###
-            - **
-            - **
-            - **
-            - **
-
+            ### Current Analysis Status:
+            - **P-values are computed automatically** using the model tracing pipeline
+            - **Lower p-values indicate higher structural similarity** to Llama-2-7B
+            - **Analysis compares neuron organization** across transformer layers
+            - **Results appear in the main table** once computation is complete
+
+            ### Supported Models:
+            - `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
+            - `ibm-granite/granite-7b-base` - IBM Granite 7B Base
+            - `EleutherAI/llemma_7b` - LLeMa 7B

             ### How it works:
-            1.
-            2.
-            3.
-            4. Results
+            1. Models are automatically analyzed against Llama-2-7B base
+            2. Match statistic with alignment is computed
+            3. P-values indicate structural similarity preservation
+            4. Results appear in the main Results tab
             """)
-
-            test_button.click(
-                run_perplexity_test,
-                [model_name, revision, precision],
-                [result, live_results_table, results_table]
-            )

     sys.stderr.write("🎯 GRADIO INTERFACE SETUP COMPLETE\n")
-    sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING 
+    sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING ANALYSIS\n")
     sys.stderr.write("📊 Features enabled:\n")
-    sys.stderr.write(" - 
-    sys.stderr.write(" - Model trace p-value computation (vs GPT-2 base)\n")
+    sys.stderr.write(" - Model trace p-value computation (vs Llama-2-7B base)\n")
     sys.stderr.write(" - Match statistic with alignment\n")
-    sys.stderr.write("
+    sys.stderr.write(" - Structural similarity analysis\n")
+    sys.stderr.write("🎉 Ready to display p-values!\n")
     sys.stderr.flush()

 demo.queue(default_concurrency_limit=5).launch()
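Below is a minimal sketch of the column handling that `create_results_dataframe` is reduced to by this commit. The display labels ("Model", "Match P-Value", "Type", "Precision") and the "Match P-Value ⬇️" header come from the diff context; the standalone helper, its name, and the error handling are illustrative assumptions rather than the app's actual code.

```python
import sys

import pandas as pd


def select_p_value_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the p-value-centric columns and rename them for display.

    Hypothetical helper mirroring the trimmed create_results_dataframe;
    the source column labels are assumed to match the AutoEvalColumn
    definitions shown in src/display/utils.py.
    """
    source_cols = ["Model", "Match P-Value ⬇️", "Type", "Precision"]
    display_cols = ["Model", "Match P-Value", "Type", "Precision"]
    try:
        display_df = df[source_cols].copy()
    except KeyError as e:
        # Fall back to an empty frame with the display columns, as the app does
        sys.stderr.write(f"Missing expected column: {e}\n")
        return pd.DataFrame(columns=display_cols)
    display_df.columns = display_cols
    return display_df
```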
src/about.py CHANGED

@@ -10,10 +10,10 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    #
-
+    # No tasks - we only care about p-values
+    pass

-NUM_FEWSHOT = 0 # Not used
+NUM_FEWSHOT = 0 # Not used
 # ---------------------------------------------------

 # Your leaderboard name
@@ -29,8 +29,7 @@ structural similarity to Llama-2-7B using model tracing analysis.
 - `ibm-granite/granite-7b-base` - IBM Granite 7B Base
 - `EleutherAI/llemma_7b` - LLeMa 7B

-**
-- **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
+**Metric:**
 - **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
 """

@@ -38,18 +37,14 @@ structural similarity to Llama-2-7B using model tracing analysis.
 LLM_BENCHMARKS_TEXT = """
 ## How it works

-The evaluation runs
+The evaluation runs model tracing analysis on the supported language models:

 ### Supported Models
 - **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
 - **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
 - **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model

-###
-Perplexity tests using a fixed test passage about artificial intelligence.
-Perplexity measures how well a model predicts text - lower scores mean better predictions.
-
-### 2. Model Tracing Analysis
+### Model Tracing Analysis
 Compares each model's internal structure to Llama-2-7B using the "match" statistic:
 - **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
 - **Comparison Models**: The 3 supported models listed above
@@ -59,29 +54,18 @@ Compares each model's internal structure to Llama-2-7B using the "match" statist

 The match statistic tests whether neurons in corresponding layers maintain similar functional roles
 between the base model and the comparison models.
-
-## Test Text
-
-The evaluation uses the following passage:
-```
-Artificial intelligence has transformed the way we live and work, bringing both opportunities and challenges.
-From autonomous vehicles to language models that can engage in human-like conversation, AI technologies are becoming increasingly
-sophisticated. However, with this advancement comes the responsibility to ensure these systems are developed and deployed ethically,
-with careful consideration for privacy, fairness, and transparency. The future of AI will likely depend on how well we balance innovation
-with these important social considerations.
-```
 """

 EVALUATION_QUEUE_TEXT = """
-##
+## Model Analysis

-This leaderboard
+This leaderboard analyzes structural similarity between specific models and Llama-2-7B:

 1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
 2. **IBM Granite 7B Base** - IBM's foundational language model
 3. **LLeMa 7B** - EleutherAI's mathematical language model

-
+The p-values are computed automatically using the model tracing analysis.
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
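The about text above describes the match statistic only in prose. As a rough illustration of the underlying idea (not the model-tracing implementation this Space actually runs), a permutation-test sketch over per-layer neuron activations might look like the following; the array shapes, the correlation-based matching rule, and the permutation count are all assumptions.

```python
import numpy as np


def match_p_value(base_acts: np.ndarray, ft_acts: np.ndarray,
                  n_perm: int = 1000, seed: int = 0) -> float:
    """Toy permutation test for neuron alignment between two models.

    base_acts, ft_acts: (n_samples, n_neurons) activations from corresponding
    layers of the base and fine-tuned models. The observed statistic counts
    neurons whose activations correlate most strongly with the same neuron
    index in the other model; permuting the neuron order gives the null.
    """
    rng = np.random.default_rng(seed)
    n = base_acts.shape[1]
    # Cross-correlation block between base neurons (rows) and ft neurons (cols)
    corr = np.corrcoef(base_acts.T, ft_acts.T)[:n, n:]
    observed = np.sum(np.argmax(np.abs(corr), axis=1) == np.arange(n))
    null = [
        np.sum(np.argmax(np.abs(corr[:, rng.permutation(n)]), axis=1) == np.arange(n))
        for _ in range(n_perm)
    ]
    # Lower p-value -> more neurons stay matched than expected by chance
    return float((1 + np.sum(np.array(null) >= observed)) / (1 + n_perm))


# With unrelated random activations the p-value is typically not small.
rng = np.random.default_rng(1)
print(match_p_value(rng.normal(size=(256, 64)), rng.normal(size=(256, 64))))
```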
src/display/utils.py CHANGED

@@ -26,15 +26,7 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    # Use exact column name from Tasks
-    task_col_name = task.value.col_name
-    sys.stderr.write(f"Adding task column: {task.name} -> column name: {task_col_name}\n")
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task_col_name, "number", True)])
-    sys.stderr.flush()
-# Model tracing p-value column
+# Only p-value column - no other scores
 auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
@@ -122,7 +114,7 @@ sys.stderr.write(f"COLS: {COLS}\n")
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

-BENCHMARK_COLS = [
+BENCHMARK_COLS = [] # No benchmark columns - only p-values
 sys.stderr.write(f"BENCHMARK_COLS: {BENCHMARK_COLS}\n")
 sys.stderr.write(f"=== END COLUMN SETUP ===\n")
 sys.stderr.flush()
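For context on how entries like `["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)]` are typically consumed: in the standard Hugging Face leaderboard template this list is folded into a frozen `AutoEvalColumn` dataclass, roughly as sketched below. The `ColumnContent` field names and the `make_dataclass` call are assumptions based on that template, not something this diff shows.

```python
from dataclasses import dataclass, field, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool = True
    never_hidden: bool = False


auto_eval_column_dict = [
    ["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)],
]

# Fold the list into a frozen dataclass so columns are addressed as attributes,
# e.g. AutoEvalColumn.model_trace_p_value.name == "Match P-Value ⬇️".
AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [(attr, col_type, field(default=default)) for attr, col_type, default in auto_eval_column_dict],
    frozen=True,
)

print(AutoEvalColumn.model_trace_p_value.name)  # Match P-Value ⬇️
```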
src/evaluation/initialize_models.py CHANGED

@@ -53,9 +53,7 @@ def create_model_result_file(model_name, precision="float16"):
             "model_sha": "main"
         },
         "results": {
-
-            "perplexity": None # Will be populated when user tests
-        }
+            # No perplexity - we only care about p-values
         }
     }

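For reference, the shape of the per-model result payload after this change, sketched as a plain dict. Only `"model_sha": "main"` and the now-empty `"results"` block are visible in the hunk; the other config keys, the helper name, and how the file is written are assumptions.

```python
import json


def build_result_payload(model_name: str, precision: str = "float16") -> dict:
    """Hypothetical reconstruction of the result-file payload."""
    return {
        "config": {
            "model_name": model_name,             # assumed key
            "model_dtype": f"torch.{precision}",  # assumed key
            "model_sha": "main",
        },
        # No perplexity entry any more; the match p-value is filled in later
        # by the model tracing pipeline.
        "results": {},
    }


print(json.dumps(build_result_payload("lmsys/vicuna-7b-v1.5"), indent=2))
```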
src/leaderboard/read_evals.py CHANGED

@@ -59,10 +59,8 @@ class EvalResult:
         if architectures:
             architecture = ";".join(architectures)

-        #
+        # No perplexity extraction - we only care about p-values
         results = {}
-        if "perplexity" in data["results"]:
-            results["perplexity"] = data["results"]["perplexity"]["perplexity"]

         return self(
             eval_name=result_key,
@@ -88,29 +86,9 @@ class EvalResult:
         sys.stderr.write(f"Weight type: {self.weight_type}\n")
         sys.stderr.flush()

-        #
-
-
-        sys.stderr.write(f"Available tasks: {[task.name for task in Tasks]}\n")
-
-        for task in Tasks:
-            sys.stderr.write(f"Looking for task: {task.value.benchmark} in results\n")
-            if task.value.benchmark in self.results:
-                score = self.results[task.value.benchmark]
-                perplexity_score = score # Save the raw score
-                sys.stderr.write(f"Found score for {task.value.benchmark}: {score}\n")
-                # Convert perplexity to a 0-100 scale where lower perplexity = higher score
-                # Using a log scale since perplexity can vary widely
-                # Cap at 100 for very low perplexity and 0 for very high perplexity
-                score = max(0, min(100, 100 * (1 - math.log(score) / 10)))
-                scores.append(score)
-                sys.stderr.write(f"Converted score: {score}\n")
-            else:
-                sys.stderr.write(f"Task {task.value.benchmark} not found in results\n")
-            sys.stderr.flush()
-
-        average = sum(scores) / len(scores) if scores else 0
-        sys.stderr.write(f"Calculated average score: {average}\n")
+        # No task-based scoring - we only care about p-values
+        average = 0 # Default average since we don't have tasks
+        sys.stderr.write(f"No task-based scoring, using default average: {average}\n")
         sys.stderr.flush()

         # Create data dictionary with comprehensive debugging
@@ -164,17 +142,9 @@ class EvalResult:
         sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
         sys.stderr.flush()

-        #
-
-
-        if task.value.benchmark in self.results:
-            task_score = self.results[task.value.benchmark]
-            data_dict[task_col_name] = task_score
-            sys.stderr.write(f"Added task score: {task_col_name} = {task_score}\n")
-        else:
-            data_dict[task_col_name] = None
-            sys.stderr.write(f"Added None for missing task: {task_col_name}\n")
-        sys.stderr.flush()
+        # No task-specific scores - we only have p-values
+        sys.stderr.write("No task-specific scores to add\n")
+        sys.stderr.flush()

         sys.stderr.write(f"Final data dict has {len(data_dict)} columns: {list(data_dict.keys())}\n")
         sys.stderr.write(f"=== END PROCESSING RESULT TO_DICT ===\n")
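With the task loop gone, the row that `EvalResult.to_dict` contributes to the leaderboard is essentially the identifying columns plus the match p-value. A stripped-down sketch follows; the column labels come from the diff context, while the helper itself and the omission of any extra bookkeeping columns are assumptions.

```python
from typing import Optional


def to_display_row(model: str, match_p_value: Optional[float],
                   model_type: str, precision: str) -> dict:
    """Hypothetical minimal version of the per-model leaderboard row."""
    return {
        "Model": model,
        "Match P-Value ⬇️": match_p_value,  # lower = more structural similarity to the base model
        "Type": model_type,
        "Precision": precision,
    }


row = to_display_row("lmsys/vicuna-7b-v1.5", 0.01, "fine-tuned", "float16")
print(row)
```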