p50038325 committed
Commit 7321625 · 1 Parent(s): 0b8fd4f

energy-score

Files changed (1)
  1. src/leaderboard/read_evals.py +53 -17
src/leaderboard/read_evals.py CHANGED
@@ -105,26 +105,45 @@ class EvalResult:
             # Handle metrics that could be None in the JSON
             metric_values = []
 
+            # Define the expected metric name and alternative names for each benchmark
+            expected_metric = task.metric
+            alternative_metrics = []
+
+            print(f"Processing benchmark: {task.benchmark}, expected metric: {expected_metric}")
+
+            # Set up alternative metric names based on the benchmark
+            if task.benchmark == "custom|folio:logical_reasoning|0":
+                if expected_metric != "folio_em":
+                    alternative_metrics = ["folio_em"]
+            elif task.benchmark == "custom|telecom:qna|0":
+                if expected_metric != "telecom_qna_em":
+                    alternative_metrics = ["telecom_qna_em"]
+            elif task.benchmark == "custom|3gpp:tsg|0":
+                if expected_metric != "em":
+                    alternative_metrics = ["em"]
+            elif task.benchmark == "custom|math:problem_solving|0":
+                if expected_metric != "math_metric":
+                    alternative_metrics = ["math_metric"]
+            elif task.benchmark == "custom|spider:text2sql|0":
+                if expected_metric != "sql_metric":
+                    alternative_metrics = ["sql_metric"]
+
             # Check for results with the benchmark name
             for k, v in data["results"].items():
                 if task.benchmark == k:
                     # Try the expected metric name first
-                    metric_value = v.get(task.metric)
+                    metric_value = v.get(expected_metric)
 
                     # If not found, try alternative metric names
                     if metric_value is None:
-                        # For FOLIO, also check for "folio_em"
-                        if task.benchmark == "folio:logical_reasoning" and "folio_em" in v:
-                            metric_value = v.get("folio_em")
-                        # For TELECOM-QnA, also check for "telecom_qna_em"
-                        elif task.benchmark == "telecom:qna" and "telecom_qna_em" in v:
-                            metric_value = v.get("telecom_qna_em")
-                        # For 3GPP-TSG, also check for generic "em"
-                        elif task.benchmark == "3gpp:tsg" and "em" in v:
-                            metric_value = v.get("em")
+                        for alt_metric in alternative_metrics:
+                            if alt_metric in v:
+                                metric_value = v.get(alt_metric)
+                                break
 
                     if metric_value is not None:
                         metric_values.append(metric_value)
+                        print(f"Found metric value for {task.benchmark}: {metric_value}")
 
             accs = np.array([v for v in metric_values if v is not None])
             if len(accs) == 0:
@@ -132,20 +151,22 @@ class EvalResult:
                 if "all" in data["results"]:
                     all_results = data["results"]["all"]
 
+                    print(f"Checking 'all' section for {task.benchmark}, available keys: {list(all_results.keys())}")
+
                     # Try the expected metric name first
-                    metric_value = all_results.get(task.metric)
+                    metric_value = all_results.get(expected_metric)
 
                     # If not found, try alternative metric names
                     if metric_value is None:
-                        if task.benchmark == "folio:logical_reasoning" and "folio_em" in all_results:
-                            metric_value = all_results.get("folio_em")
-                        elif task.benchmark == "telecom:qna" and "telecom_qna_em" in all_results:
-                            metric_value = all_results.get("telecom_qna_em")
-                        elif task.benchmark == "3gpp:tsg" and "em" in all_results:
-                            metric_value = all_results.get("em")
+                        for alt_metric in alternative_metrics:
+                            if alt_metric in all_results:
+                                metric_value = all_results.get(alt_metric)
+                                print(f"Found alternative metric {alt_metric} in 'all' section")
+                                break
 
                     if metric_value is not None:
                         accs = np.array([metric_value])
+                        print(f"Found metric value in 'all' section for {task.benchmark}: {metric_value}")
                     else:
                         results[task.benchmark] = None
                         continue
@@ -155,6 +176,7 @@ class EvalResult:
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+            print(f"Final result for {task.benchmark}: {mean_acc}")
 
         # Extract energy score if available
         energy_score = "NA"
@@ -217,8 +239,10 @@ class EvalResult:
             AutoEvalColumn.energy_score.name: self.energy_score,
         }
 
+        print(f"\nConverting to dict for model: {self.full_model}")
        for task in Tasks:
             result = self.results.get(task.value.benchmark)
+            print(f"  Task: {task.value.col_name}, Benchmark: {task.value.benchmark}, Result: {result}")
             data_dict[task.value.col_name] = "NA" if result is None else round(result, 2)
 
         return data_dict
@@ -268,20 +292,32 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         try:
             # Creation of result
+            print(f"\nProcessing file: {model_result_filepath}")
             eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
             # Skip entries with Unknown/Unknown model name
             if eval_result.full_model == "Unknown/Unknown":
                 print(f"Skipping invalid result file: {model_result_filepath}")
                 continue
+
+            print(f"Model: {eval_result.full_model}")
+            print(f"Results before update_with_request_file:")
+            for benchmark, value in eval_result.results.items():
+                print(f"  {benchmark}: {value}")
 
             eval_result.update_with_request_file(requests_path)
+
+            print(f"Results after update_with_request_file:")
+            for benchmark, value in eval_result.results.items():
+                print(f"  {benchmark}: {value}")
 
             # Store results of same eval together
             eval_name = eval_result.eval_name
             if eval_name in eval_results.keys():
+                print(f"Updating existing results for {eval_name}")
                 eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
             else:
+                print(f"Adding new results for {eval_name}")
                 eval_results[eval_name] = eval_result
         except Exception as e:
             print(f"Error processing result file {model_result_filepath}: {str(e)}")