Commit 7321625 by p50038325
Parent(s): 0b8fd4f

energy-score

src/leaderboard/read_evals.py CHANGED (+53 -17)
@@ -105,26 +105,45 @@ class EvalResult:
             # Handle metrics that could be None in the JSON
             metric_values = []
 
+            # Define the expected metric name and alternative names for each benchmark
+            expected_metric = task.metric
+            alternative_metrics = []
+
+            print(f"Processing benchmark: {task.benchmark}, expected metric: {expected_metric}")
+
+            # Set up alternative metric names based on the benchmark
+            if task.benchmark == "custom|folio:logical_reasoning|0":
+                if expected_metric != "folio_em":
+                    alternative_metrics = ["folio_em"]
+            elif task.benchmark == "custom|telecom:qna|0":
+                if expected_metric != "telecom_qna_em":
+                    alternative_metrics = ["telecom_qna_em"]
+            elif task.benchmark == "custom|3gpp:tsg|0":
+                if expected_metric != "em":
+                    alternative_metrics = ["em"]
+            elif task.benchmark == "custom|math:problem_solving|0":
+                if expected_metric != "math_metric":
+                    alternative_metrics = ["math_metric"]
+            elif task.benchmark == "custom|spider:text2sql|0":
+                if expected_metric != "sql_metric":
+                    alternative_metrics = ["sql_metric"]
+
             # Check for results with the benchmark name
             for k, v in data["results"].items():
                 if task.benchmark == k:
                     # Try the expected metric name first
-                    metric_value = v.get(
+                    metric_value = v.get(expected_metric)
 
                     # If not found, try alternative metric names
                     if metric_value is None:
-
-
-
-
-                        elif task.benchmark == "telecom:qna" and "telecom_qna_em" in v:
-                            metric_value = v.get("telecom_qna_em")
-                        # For 3GPP-TSG, also check for generic "em"
-                        elif task.benchmark == "3gpp:tsg" and "em" in v:
-                            metric_value = v.get("em")
+                        for alt_metric in alternative_metrics:
+                            if alt_metric in v:
+                                metric_value = v.get(alt_metric)
+                                break
 
                     if metric_value is not None:
                         metric_values.append(metric_value)
+                        print(f"Found metric value for {task.benchmark}: {metric_value}")
 
             accs = np.array([v for v in metric_values if v is not None])
             if len(accs) == 0:
@@ -132,20 +151,22 @@ class EvalResult:
                 if "all" in data["results"]:
                     all_results = data["results"]["all"]
 
+                    print(f"Checking 'all' section for {task.benchmark}, available keys: {list(all_results.keys())}")
+
                     # Try the expected metric name first
-                    metric_value = all_results.get(
+                    metric_value = all_results.get(expected_metric)
 
                     # If not found, try alternative metric names
                     if metric_value is None:
-
-
-
-
-
-                        metric_value = all_results.get("em")
+                        for alt_metric in alternative_metrics:
+                            if alt_metric in all_results:
+                                metric_value = all_results.get(alt_metric)
+                                print(f"Found alternative metric {alt_metric} in 'all' section")
+                                break
 
                     if metric_value is not None:
                         accs = np.array([metric_value])
+                        print(f"Found metric value in 'all' section for {task.benchmark}: {metric_value}")
                 else:
                     results[task.benchmark] = None
                     continue
@@ -155,6 +176,7 @@ class EvalResult:
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+            print(f"Final result for {task.benchmark}: {mean_acc}")
 
         # Extract energy score if available
         energy_score = "NA"
@@ -217,8 +239,10 @@ class EvalResult:
             AutoEvalColumn.energy_score.name: self.energy_score,
         }
 
+        print(f"\nConverting to dict for model: {self.full_model}")
        for task in Tasks:
            result = self.results.get(task.value.benchmark)
+            print(f"  Task: {task.value.col_name}, Benchmark: {task.value.benchmark}, Result: {result}")
            data_dict[task.value.col_name] = "NA" if result is None else round(result, 2)
 
        return data_dict
@@ -268,20 +292,32 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         try:
             # Creation of result
+            print(f"\nProcessing file: {model_result_filepath}")
             eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
             # Skip entries with Unknown/Unknown model name
             if eval_result.full_model == "Unknown/Unknown":
                 print(f"Skipping invalid result file: {model_result_filepath}")
                 continue
+
+            print(f"Model: {eval_result.full_model}")
+            print(f"Results before update_with_request_file:")
+            for benchmark, value in eval_result.results.items():
+                print(f"  {benchmark}: {value}")
 
             eval_result.update_with_request_file(requests_path)
+
+            print(f"Results after update_with_request_file:")
+            for benchmark, value in eval_result.results.items():
+                print(f"  {benchmark}: {value}")
 
             # Store results of same eval together
             eval_name = eval_result.eval_name
             if eval_name in eval_results.keys():
+                print(f"Updating existing results for {eval_name}")
                 eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
             else:
+                print(f"Adding new results for {eval_name}")
                 eval_results[eval_name] = eval_result
         except Exception as e:
             print(f"Error processing result file {model_result_filepath}: {str(e)}")
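For reference, a minimal standalone sketch (not part of the commit) of how the alternative-metric fallback added above resolves a score. The benchmark key and metric names mirror the ones referenced in the diff; the sample payload, the assumed value of task.metric, and the hard-coded alternative list are hypothetical.

import numpy as np

# Hypothetical results payload: the score is stored under the alternative
# name "telecom_qna_em" rather than the metric name the task expects.
data = {"results": {"custom|telecom:qna|0": {"telecom_qna_em": 0.5}}}

expected_metric = "em"                    # assumed value of task.metric
alternative_metrics = ["telecom_qna_em"]  # set by the per-benchmark branch in the diff

v = data["results"]["custom|telecom:qna|0"]
metric_value = v.get(expected_metric)     # None: the expected name is missing
if metric_value is None:
    for alt_metric in alternative_metrics:
        if alt_metric in v:
            metric_value = v.get(alt_metric)
            break

# Averaged and scaled exactly as in the diff; to_dict later rounds to 2 decimals.
print(np.mean(np.array([metric_value])) * 100.0)  # 50.0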