Ahmed Ahmed committed
Commit 1191811 · 1 Parent(s): 1bac1ed
app.py CHANGED
@@ -18,7 +18,6 @@ from src.display.utils import (
18
  )
19
  from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
20
  from src.populate import get_leaderboard_df
21
- from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
22
 
23
  def create_results_dataframe():
24
  """Create and return the results DataFrame for display"""
@@ -36,17 +35,15 @@ def create_results_dataframe():
36
  sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
37
  sys.stderr.flush()
38
  # Return empty DataFrame with proper columns
39
- return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])
40
 
41
  sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
42
  sys.stderr.flush()
43
 
44
- # Check if required columns exist
45
  required_cols = [
46
  AutoEvalColumn.model.name,
47
- "Perplexity",
48
  AutoEvalColumn.model_trace_p_value.name,
49
- AutoEvalColumn.average.name,
50
  AutoEvalColumn.model_type.name,
51
  AutoEvalColumn.precision.name,
52
  ]
@@ -68,10 +65,10 @@ def create_results_dataframe():
68
  except Exception as e:
69
  sys.stderr.write(f"💥 Error selecting columns: {e}\n")
70
  sys.stderr.flush()
71
- return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])
72
 
73
  # Rename columns for better display
74
- display_df.columns = ["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"]
75
 
76
  sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
77
  sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")
@@ -84,64 +81,7 @@ def create_results_dataframe():
84
  sys.stderr.flush()
85
  return display_df
86
 
87
- def run_perplexity_test(model_name, revision, precision):
88
- """Run perplexity evaluation on demand."""
89
- import sys
90
- import traceback
91
- import gradio as gr
92
- from src.evaluation.initialize_models import is_model_allowed
93
-
94
- if not model_name:
95
- return "Please select a model.", gr.update(), gr.update()
96
-
97
- if not is_model_allowed(model_name):
98
- return f"❌ Model '{model_name}' is not in the allowed list. Please select from the dropdown.", gr.update(), gr.update()
99
-
100
- try:
101
- # Use stderr for more reliable logging in HF Spaces
102
- sys.stderr.write(f"\n=== RUNNING PERPLEXITY TEST ===\n")
103
- sys.stderr.write(f"Model: {model_name}\n")
104
- sys.stderr.write(f"Revision: {revision}\n")
105
- sys.stderr.write(f"Precision: {precision}\n")
106
- sys.stderr.flush()
107
-
108
- success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
109
- sys.stderr.write(f"Evaluation result - Success: {success}, Result: {result}\n")
110
- sys.stderr.flush()
111
-
112
- if success:
113
- sys.stderr.write("Evaluation succeeded - updating both results tables\n")
114
- sys.stderr.flush()
115
-
116
- # Get updated results (this will trigger model trace p-value computation for the new model)
117
- sys.stderr.write("🔄 Creating updated results DataFrame (may compute model trace p-values)...\n")
118
- sys.stderr.flush()
119
-
120
- updated_df = create_results_dataframe()
121
-
122
- sys.stderr.write("✅ Updated DataFrame created successfully\n")
123
- sys.stderr.flush()
124
-
125
- success_msg = f"""✅ **Perplexity evaluation completed successfully!**
126
-
127
- **Model**: {model_name}
128
- **Perplexity Score**: {result:.4f}
129
-
130
- 🎉 **Results have been saved and both tables have been updated!**
131
-
132
- ⏰ **Note**: Model trace p-value computation runs a full model comparison analysis and may take 10-30 minutes per model. Progress will appear in the logs."""
133
-
134
- return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
135
- else:
136
- return f"❌ **Evaluation failed**: {result}", gr.update(), gr.update()
137
-
138
- except Exception as e:
139
- error_msg = str(e)
140
- traceback_str = traceback.format_exc()
141
- sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
142
- sys.stderr.write(f"Traceback: {traceback_str}\n")
143
- sys.stderr.flush()
144
- return f"❌ **Critical error**: {error_msg}", gr.update(), gr.update()
145
 
146
  # Initialize results repository and directory
147
  try:
@@ -173,7 +113,7 @@ except Exception as e:
173
 
174
  # Initialize allowed models
175
  import sys
176
- from src.evaluation.initialize_models import initialize_allowed_models, get_allowed_models
177
 
178
  sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
179
  sys.stderr.write("📊 Initializing allowed models...\n")
@@ -205,7 +145,7 @@ with demo:
205
  gr.Markdown("## Model Evaluation Results")
206
  results_table = gr.DataFrame(
207
  value=RESULTS_DF,
208
- headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
209
  interactive=False,
210
  wrap=False
211
  )
@@ -213,66 +153,35 @@ with demo:
213
  with gr.TabItem("📝 About", elem_id="about-tab", id=1):
214
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
215
 
216
- with gr.TabItem("🧪 Test Model", elem_id="test-model-tab", id=2):
217
- gr.Markdown("## Run Perplexity Test\n\nTest one of the supported models for perplexity evaluation.")
218
-
219
- allowed_models = get_allowed_models()
220
-
221
- with gr.Row():
222
- with gr.Column():
223
- model_name = gr.Dropdown(
224
- choices=allowed_models,
225
- label="Model name",
226
- value=allowed_models[0] if allowed_models else None
227
- )
228
- revision = gr.Textbox(label="Revision", placeholder="main", value="main")
229
- precision = gr.Dropdown(
230
- choices=["float16", "bfloat16"],
231
- label="Precision",
232
- value="float16"
233
- )
234
- debug_mode = gr.Checkbox(label="Enable debug mode (more verbose logging)", value=True)
235
-
236
- with gr.Column():
237
- test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
238
- result = gr.Markdown()
239
-
240
- gr.Markdown("## Live Results")
241
- live_results_table = gr.DataFrame(
242
- value=RESULTS_DF,
243
- headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
244
- interactive=False,
245
- wrap=False
246
- )
247
 
248
  gr.Markdown("""
249
- ### Tips:
250
- - **Check stderr logs** in HF Spaces for detailed debugging information
251
- - **Results will update automatically** in the table above after evaluation completes
252
- - **Available models**: Vicuna 7B v1.5, IBM Granite 7B Base, LLeMa 7B
253
- - **Lower perplexity scores = better performance** (better at predicting text)
254
- - **Model trace p-values are computed automatically** (may take 10-30 minutes)
255
 
256
  ### How it works:
257
- 1. Select a model from the dropdown
258
- 2. Click "Run Perplexity Test"
259
- 3. Wait for evaluation to complete (may take a few minutes for perplexity + longer for p-value)
260
- 4. Results will appear automatically in the table above!
261
  """)
262
-
263
- test_button.click(
264
- run_perplexity_test,
265
- [model_name, revision, precision],
266
- [result, live_results_table, results_table]
267
- )
268
 
269
  sys.stderr.write("🎯 GRADIO INTERFACE SETUP COMPLETE\n")
270
- sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING INTEGRATION\n")
271
  sys.stderr.write("📊 Features enabled:\n")
272
- sys.stderr.write(" - Perplexity evaluation\n")
273
- sys.stderr.write(" - Model trace p-value computation (vs GPT-2 base)\n")
274
  sys.stderr.write(" - Match statistic with alignment\n")
275
- sys.stderr.write("🎉 Ready to accept requests!\n")
 
276
  sys.stderr.flush()
277
 
278
  demo.queue(default_concurrency_limit=5).launch()
 
18
  )
19
  from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
20
  from src.populate import get_leaderboard_df
 
21
 
22
  def create_results_dataframe():
23
  """Create and return the results DataFrame for display"""
 
35
  sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
36
  sys.stderr.flush()
37
  # Return empty DataFrame with proper columns
38
+ return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])
39
 
40
  sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
41
  sys.stderr.flush()
42
 
43
+ # Check if required columns exist - only p-values matter
44
  required_cols = [
45
  AutoEvalColumn.model.name,
 
46
  AutoEvalColumn.model_trace_p_value.name,
 
47
  AutoEvalColumn.model_type.name,
48
  AutoEvalColumn.precision.name,
49
  ]
 
65
  except Exception as e:
66
  sys.stderr.write(f"💥 Error selecting columns: {e}\n")
67
  sys.stderr.flush()
68
+ return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])
69
 
70
  # Rename columns for better display
71
+ display_df.columns = ["Model", "Match P-Value", "Type", "Precision"]
72
 
73
  sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
74
  sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")
 
81
  sys.stderr.flush()
82
  return display_df
83
 
84
+ # Perplexity testing removed - we only focus on p-values now
85
 
86
  # Initialize results repository and directory
87
  try:
 
113
 
114
  # Initialize allowed models
115
  import sys
116
+ from src.evaluation.initialize_models import initialize_allowed_models
117
 
118
  sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
119
  sys.stderr.write("📊 Initializing allowed models...\n")
 
145
  gr.Markdown("## Model Evaluation Results")
146
  results_table = gr.DataFrame(
147
  value=RESULTS_DF,
148
+ headers=["Model", "Match P-Value", "Type", "Precision"],
149
  interactive=False,
150
  wrap=False
151
  )
 
153
  with gr.TabItem("📝 About", elem_id="about-tab", id=1):
154
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
155
 
156
+ with gr.TabItem("🔬 Analysis", elem_id="analysis-tab", id=2):
157
+ gr.Markdown("## Model Tracing Analysis\n\nP-values are computed automatically for all supported models.")
158
 
159
  gr.Markdown("""
160
+ ### Current Analysis Status:
161
+ - **P-values are computed automatically** using the model tracing pipeline
162
+ - **Lower p-values indicate higher structural similarity** to Llama-2-7B
163
+ - **Analysis compares neuron organization** across transformer layers
164
+ - **Results appear in the main table** once computation is complete
165
+
166
+ ### Supported Models:
167
+ - `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
168
+ - `ibm-granite/granite-7b-base` - IBM Granite 7B Base
169
+ - `EleutherAI/llemma_7b` - LLeMa 7B
170
 
171
  ### How it works:
172
+ 1. Models are automatically analyzed against Llama-2-7B base
173
+ 2. Match statistic with alignment is computed
174
+ 3. P-values indicate structural similarity preservation
175
+ 4. Results appear in the main Results tab
176
  """)
177
 
178
  sys.stderr.write("🎯 GRADIO INTERFACE SETUP COMPLETE\n")
179
+ sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING ANALYSIS\n")
180
  sys.stderr.write("📊 Features enabled:\n")
181
+ sys.stderr.write(" - Model trace p-value computation (vs Llama-2-7B base)\n")
 
182
  sys.stderr.write(" - Match statistic with alignment\n")
183
+ sys.stderr.write(" - Structural similarity analysis\n")
184
+ sys.stderr.write("🎉 Ready to display p-values!\n")
185
  sys.stderr.flush()
186
 
187
  demo.queue(default_concurrency_limit=5).launch()
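Taken together, the surviving `app.py` flow is small: build a four-column results frame and render it read-only in Gradio. The sketch below condenses that flow under stated assumptions: `SOURCE_COLS` stands in for the `AutoEvalColumn.*.name` values, `build_results_df` is a simplified stand-in for `create_results_dataframe`, and the real app populates `raw_df` via `get_leaderboard_df` from the results repo.

```python
# Condensed sketch only; column names other than the four display columns are
# hypothetical stand-ins for the AutoEvalColumn attributes used in the diff.
import pandas as pd
import gradio as gr

DISPLAY_COLS = ["Model", "Match P-Value", "Type", "Precision"]
SOURCE_COLS = ["model", "model_trace_p_value", "model_type", "precision"]  # assumed

def build_results_df(raw_df):
    """Select and rename the p-value columns, falling back to an empty frame."""
    if raw_df is None or raw_df.empty:
        return pd.DataFrame(columns=DISPLAY_COLS)
    try:
        display_df = raw_df[SOURCE_COLS].copy()
    except KeyError:
        return pd.DataFrame(columns=DISPLAY_COLS)
    display_df.columns = DISPLAY_COLS
    return display_df

with gr.Blocks() as demo:
    gr.Markdown("## Model Evaluation Results")
    gr.DataFrame(value=build_results_df(None), headers=DISPLAY_COLS,
                 interactive=False, wrap=False)

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=5).launch()
```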
src/about.py CHANGED
@@ -10,10 +10,10 @@ class Task:
10
  # Select your tasks here
11
  # ---------------------------------------------------
12
  class Tasks(Enum):
13
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
14
- task0 = Task("perplexity", "perplexity", "Perplexity")
15
 
16
- NUM_FEWSHOT = 0 # Not used for perplexity
17
  # ---------------------------------------------------
18
 
19
  # Your leaderboard name
@@ -29,8 +29,7 @@ structural similarity to Llama-2-7B using model tracing analysis.
29
  - `ibm-granite/granite-7b-base` - IBM Granite 7B Base
30
  - `EleutherAI/llemma_7b` - LLeMa 7B
31
 
32
- **Metrics:**
33
- - **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
34
  - **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
35
  """
36
 
@@ -38,18 +37,14 @@ structural similarity to Llama-2-7B using model tracing analysis.
38
  LLM_BENCHMARKS_TEXT = """
39
  ## How it works
40
 
41
- The evaluation runs two types of analysis on the supported language models:
42
 
43
  ### Supported Models
44
  - **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
45
  - **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
46
  - **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model
47
 
48
- ### 1. Perplexity Evaluation
49
- Perplexity tests using a fixed test passage about artificial intelligence.
50
- Perplexity measures how well a model predicts text - lower scores mean better predictions.
51
-
52
- ### 2. Model Tracing Analysis
53
  Compares each model's internal structure to Llama-2-7B using the "match" statistic:
54
  - **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
55
  - **Comparison Models**: The 3 supported models listed above
@@ -59,29 +54,18 @@ Compares each model's internal structure to Llama-2-7B using the "match" statist
59
 
60
  The match statistic tests whether neurons in corresponding layers maintain similar functional roles
61
  between the base model and the comparison models.
62
-
63
- ## Test Text
64
-
65
- The evaluation uses the following passage:
66
- ```
67
- Artificial intelligence has transformed the way we live and work, bringing both opportunities and challenges.
68
- From autonomous vehicles to language models that can engage in human-like conversation, AI technologies are becoming increasingly
69
- sophisticated. However, with this advancement comes the responsibility to ensure these systems are developed and deployed ethically,
70
- with careful consideration for privacy, fairness, and transparency. The future of AI will likely depend on how well we balance innovation
71
- with these important social considerations.
72
- ```
73
  """
74
 
75
  EVALUATION_QUEUE_TEXT = """
76
- ## Testing Models
77
 
78
- This leaderboard focuses on comparing specific models:
79
 
80
  1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
81
  2. **IBM Granite 7B Base** - IBM's foundational language model
82
  3. **LLeMa 7B** - EleutherAI's mathematical language model
83
 
84
- Use the "Test Model" tab to run perplexity evaluation on any of these models.
85
  """
86
 
87
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
10
  # Select your tasks here
11
  # ---------------------------------------------------
12
  class Tasks(Enum):
13
+ # No tasks - we only care about p-values
14
+ pass
15
 
16
+ NUM_FEWSHOT = 0 # Not used
17
  # ---------------------------------------------------
18
 
19
  # Your leaderboard name
 
29
  - `ibm-granite/granite-7b-base` - IBM Granite 7B Base
30
  - `EleutherAI/llemma_7b` - LLeMa 7B
31
 
32
+ **Metric:**
 
33
  - **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
34
  """
35
 
 
37
  LLM_BENCHMARKS_TEXT = """
38
  ## How it works
39
 
40
+ The evaluation runs model tracing analysis on the supported language models:
41
 
42
  ### Supported Models
43
  - **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
44
  - **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
45
  - **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model
46
 
47
+ ### Model Tracing Analysis
48
  Compares each model's internal structure to Llama-2-7B using the "match" statistic:
49
  - **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
50
  - **Comparison Models**: The 3 supported models listed above
 
54
 
55
  The match statistic tests whether neurons in corresponding layers maintain similar functional roles
56
  between the base model and the comparison models.
57
  """
58
 
59
  EVALUATION_QUEUE_TEXT = """
60
+ ## Model Analysis
61
 
62
+ This leaderboard analyzes structural similarity between specific models and Llama-2-7B:
63
 
64
  1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
65
  2. **IBM Granite 7B Base** - IBM's foundational language model
66
  3. **LLeMa 7B** - EleutherAI's mathematical language model
67
 
68
+ The p-values are computed automatically using the model tracing analysis.
69
  """
70
 
71
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
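The about text leans on the model-tracing "match" statistic without showing it; that code lives in the tracing pipeline, not in this commit. As intuition only, the sketch below frames the idea as a permutation test on one layer: does the identity neuron correspondence between base and fine-tuned weights beat random neuron shuffles? Random matrices stand in for real layer weights, and the actual statistic and alignment procedure are more involved.

```python
# Illustrative permutation-test sketch, not the repository's match statistic.
import numpy as np

def match_p_value(base_layer, ft_layer, n_perm=1000, seed=0):
    """base_layer, ft_layer: (n_neurons, d) weight matrices for one layer."""
    rng = np.random.default_rng(seed)

    def alignment(a, b):
        # Mean cosine similarity between neuron i of a and neuron i of b.
        a = a / np.linalg.norm(a, axis=1, keepdims=True)
        b = b / np.linalg.norm(b, axis=1, keepdims=True)
        return float(np.mean(np.sum(a * b, axis=1)))

    observed = alignment(base_layer, ft_layer)
    null = [alignment(base_layer, ft_layer[rng.permutation(len(ft_layer))])
            for _ in range(n_perm)]
    # Low p-value: the identity correspondence is far better aligned than
    # random shuffles, i.e. neuron organization is preserved after fine-tuning.
    return (1 + sum(s >= observed for s in null)) / (n_perm + 1)

# Toy usage: a "fine-tuned" layer that is a lightly perturbed copy of the base.
base = np.random.default_rng(1).normal(size=(64, 32))
fine_tuned = base + 0.1 * np.random.default_rng(2).normal(size=(64, 32))
print(round(match_p_value(base, fine_tuned), 4))
```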
src/display/utils.py CHANGED
@@ -26,15 +26,7 @@ auto_eval_column_dict = []
26
  # Init
27
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
29
- #Scores
30
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
31
- for task in Tasks:
32
- # Use exact column name from Tasks
33
- task_col_name = task.value.col_name
34
- sys.stderr.write(f"Adding task column: {task.name} -> column name: {task_col_name}\n")
35
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task_col_name, "number", True)])
36
- sys.stderr.flush()
37
- # Model tracing p-value column
38
  auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
39
  # Model information
40
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
@@ -122,7 +114,7 @@ sys.stderr.write(f"COLS: {COLS}\n")
122
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
123
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
124
 
125
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
126
  sys.stderr.write(f"BENCHMARK_COLS: {BENCHMARK_COLS}\n")
127
  sys.stderr.write(f"=== END COLUMN SETUP ===\n")
128
  sys.stderr.flush()
 
26
  # Init
27
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
29
+ # Only p-value column - no other scores
30
  auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
31
  # Model information
32
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 
114
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
115
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
116
 
117
+ BENCHMARK_COLS = [] # No benchmark columns - only p-values
118
  sys.stderr.write(f"BENCHMARK_COLS: {BENCHMARK_COLS}\n")
119
  sys.stderr.write(f"=== END COLUMN SETUP ===\n")
120
  sys.stderr.flush()
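For context, `src/display/utils.py` registers display columns as `ColumnContent` entries and, after this commit, exposes no benchmark columns at all. The sketch below is a simplified stand-in for that registry (a plain dict rather than the module's `auto_eval_column_dict` / dataclass machinery); the `ColumnContent` field names are inferred from how it is called in the diff.

```python
# Simplified column registry; field names are assumptions based on calls like
# ColumnContent("Match P-Value ⬇️", "number", True) in the diff.
from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str               # display name in the table
    type: str               # "str", "number", or "markdown"
    displayed_by_default: bool
    never_hidden: bool = False

AUTO_EVAL_COLUMNS = {
    "model_type_symbol": ColumnContent("T", "str", True, never_hidden=True),
    "model": ColumnContent("Model", "markdown", True, never_hidden=True),
    # Only the p-value score column remains after this commit.
    "model_trace_p_value": ColumnContent("Match P-Value ⬇️", "number", True),
    "model_type": ColumnContent("Type", "str", False),
    "precision": ColumnContent("Precision", "str", False),
}

COLS = [c.name for c in AUTO_EVAL_COLUMNS.values()]
BENCHMARK_COLS = []  # no benchmark columns - only p-values
print(COLS, BENCHMARK_COLS)
```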
src/evaluation/initialize_models.py CHANGED
@@ -53,9 +53,7 @@ def create_model_result_file(model_name, precision="float16"):
53
  "model_sha": "main"
54
  },
55
  "results": {
56
- "perplexity": {
57
- "perplexity": None # Will be populated when user tests
58
- }
59
  }
60
  }
61
 
 
53
  "model_sha": "main"
54
  },
55
  "results": {
56
+ # No perplexity - we only care about p-values
57
  }
58
  }
59
 
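`create_model_result_file` now records only config metadata and an empty `results` section; the p-value is attached later by the tracing pipeline. A minimal sketch of that shape is below; the output directory layout and file naming are assumptions for the example, not the repository's actual paths.

```python
# Minimal sketch of the placeholder result file (paths are illustrative).
import json
import os

def create_model_result_file(model_name, precision="float16", results_dir="eval-results"):
    entry = {
        "config": {
            "model_name": model_name,
            "model_dtype": precision,
            "model_sha": "main",
        },
        "results": {},  # no perplexity - only p-values are reported later
    }
    out_dir = os.path.join(results_dir, model_name)
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f"results_{precision}.json")
    with open(path, "w") as f:
        json.dump(entry, f, indent=2)
    return path

print(create_model_result_file("lmsys/vicuna-7b-v1.5"))
```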
src/leaderboard/read_evals.py CHANGED
@@ -59,10 +59,8 @@ class EvalResult:
59
  if architectures:
60
  architecture = ";".join(architectures)
61
 
62
- # Extract perplexity result
63
  results = {}
64
- if "perplexity" in data["results"]:
65
- results["perplexity"] = data["results"]["perplexity"]["perplexity"]
66
 
67
  return self(
68
  eval_name=result_key,
@@ -88,29 +86,9 @@ class EvalResult:
88
  sys.stderr.write(f"Weight type: {self.weight_type}\n")
89
  sys.stderr.flush()
90
 
91
- # Calculate average, handling perplexity (lower is better)
92
- scores = []
93
- perplexity_score = None
94
- sys.stderr.write(f"Available tasks: {[task.name for task in Tasks]}\n")
95
-
96
- for task in Tasks:
97
- sys.stderr.write(f"Looking for task: {task.value.benchmark} in results\n")
98
- if task.value.benchmark in self.results:
99
- score = self.results[task.value.benchmark]
100
- perplexity_score = score # Save the raw score
101
- sys.stderr.write(f"Found score for {task.value.benchmark}: {score}\n")
102
- # Convert perplexity to a 0-100 scale where lower perplexity = higher score
103
- # Using a log scale since perplexity can vary widely
104
- # Cap at 100 for very low perplexity and 0 for very high perplexity
105
- score = max(0, min(100, 100 * (1 - math.log(score) / 10)))
106
- scores.append(score)
107
- sys.stderr.write(f"Converted score: {score}\n")
108
- else:
109
- sys.stderr.write(f"Task {task.value.benchmark} not found in results\n")
110
- sys.stderr.flush()
111
-
112
- average = sum(scores) / len(scores) if scores else 0
113
- sys.stderr.write(f"Calculated average score: {average}\n")
114
  sys.stderr.flush()
115
 
116
  # Create data dictionary with comprehensive debugging
@@ -164,17 +142,9 @@ class EvalResult:
164
  sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
165
  sys.stderr.flush()
166
 
167
- # Add task-specific scores
168
- for task in Tasks:
169
- task_col_name = task.value.col_name
170
- if task.value.benchmark in self.results:
171
- task_score = self.results[task.value.benchmark]
172
- data_dict[task_col_name] = task_score
173
- sys.stderr.write(f"Added task score: {task_col_name} = {task_score}\n")
174
- else:
175
- data_dict[task_col_name] = None
176
- sys.stderr.write(f"Added None for missing task: {task_col_name}\n")
177
- sys.stderr.flush()
178
 
179
  sys.stderr.write(f"Final data dict has {len(data_dict)} columns: {list(data_dict.keys())}\n")
180
  sys.stderr.write(f"=== END PROCESSING RESULT TO_DICT ===\n")
 
59
  if architectures:
60
  architecture = ";".join(architectures)
61
 
62
+ # No perplexity extraction - we only care about p-values
63
  results = {}
64
 
65
  return self(
66
  eval_name=result_key,
 
86
  sys.stderr.write(f"Weight type: {self.weight_type}\n")
87
  sys.stderr.flush()
88
 
89
+ # No task-based scoring - we only care about p-values
90
+ average = 0 # Default average since we don't have tasks
91
+ sys.stderr.write(f"No task-based scoring, using default average: {average}\n")
92
  sys.stderr.flush()
93
 
94
  # Create data dictionary with comprehensive debugging
 
142
  sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
143
  sys.stderr.flush()
144
 
145
+ # No task-specific scores - we only have p-values
146
+ sys.stderr.write("No task-specific scores to add\n")
147
+ sys.stderr.flush()
148
 
149
  sys.stderr.write(f"Final data dict has {len(data_dict)} columns: {list(data_dict.keys())}\n")
150
  sys.stderr.write(f"=== END PROCESSING RESULT TO_DICT ===\n")
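After this commit, `EvalResult` no longer derives task scores or an average; each result file contributes metadata plus a single Match P-Value. The sketch below shows that reduced row shape under assumptions: the JSON keys (including where the p-value is stored) are illustrative, and the real class tracks more fields such as architecture and weight type.

```python
# Illustrative reduction of EvalResult.to_dict to a flat display row.
import json

def result_file_to_row(path):
    with open(path) as f:
        data = json.load(f)
    config = data.get("config", {})
    return {
        "Model": config.get("model_name", "unknown"),
        # Assumed storage key; populated by the model-tracing pipeline.
        "Match P-Value ⬇️": data.get("results", {}).get("model_trace_p_value"),
        "Type": config.get("model_type", "pretrained"),
        "Precision": config.get("model_dtype", "float16"),
    }
```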