p50038325 committed · Commit 0b8fd4f · 1 Parent(s): f834299
energy-score

src/leaderboard/read_evals.py  CHANGED
@@ -103,12 +103,55 @@ class EvalResult:
 
             # We average all scores of a given metric (not all metrics are present in all files)
             # Handle metrics that could be None in the JSON
-            metric_values = [
-                v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k]
+            metric_values = []
+
+            # Check for results with the benchmark name
+            for k, v in data["results"].items():
+                if task.benchmark == k:
+                    # Try the expected metric name first
+                    metric_value = v.get(task.metric)
+
+                    # If not found, try alternative metric names
+                    if metric_value is None:
+                        # For FOLIO, also check for "folio_em"
+                        if task.benchmark == "folio:logical_reasoning" and "folio_em" in v:
+                            metric_value = v.get("folio_em")
+                        # For TELECOM-QnA, also check for "telecom_qna_em"
+                        elif task.benchmark == "telecom:qna" and "telecom_qna_em" in v:
+                            metric_value = v.get("telecom_qna_em")
+                        # For 3GPP-TSG, also check for generic "em"
+                        elif task.benchmark == "3gpp:tsg" and "em" in v:
+                            metric_value = v.get("em")
+
+                    if metric_value is not None:
+                        metric_values.append(metric_value)
+
             accs = np.array([v for v in metric_values if v is not None])
             if len(accs) == 0:
-                results[task.benchmark] = None
-                continue
+                # Also check the "all" section for metrics
+                if "all" in data["results"]:
+                    all_results = data["results"]["all"]
+
+                    # Try the expected metric name first
+                    metric_value = all_results.get(task.metric)
+
+                    # If not found, try alternative metric names
+                    if metric_value is None:
+                        if task.benchmark == "folio:logical_reasoning" and "folio_em" in all_results:
+                            metric_value = all_results.get("folio_em")
+                        elif task.benchmark == "telecom:qna" and "telecom_qna_em" in all_results:
+                            metric_value = all_results.get("telecom_qna_em")
+                        elif task.benchmark == "3gpp:tsg" and "em" in all_results:
+                            metric_value = all_results.get("em")
+
+                    if metric_value is not None:
+                        accs = np.array([metric_value])
+                    else:
+                        results[task.benchmark] = None
+                        continue
+                else:
+                    results[task.benchmark] = None
+                    continue
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
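For readers skimming the diff, the sketch below mirrors the lookup order the new code implements: the expected metric name first, then the benchmark-specific fallback names, then the "all" section. The lookup helper, the FALLBACKS table, and the sample payloads are illustrative assumptions written for this note, not code from the repository.

    import numpy as np

    # Benchmark-specific fallback metric names, copied from the diff above.
    FALLBACKS = {
        "folio:logical_reasoning": "folio_em",
        "telecom:qna": "telecom_qna_em",
        "3gpp:tsg": "em",
    }

    def lookup(results, benchmark, metric):
        """Resolve a score the way the new code does: exact metric name,
        then the benchmark-specific fallback, then the "all" section."""
        fallback = FALLBACKS.get(benchmark)
        metric_values = []
        for k, v in results.items():
            if k == benchmark:
                value = v.get(metric)
                if value is None and fallback is not None:
                    value = v.get(fallback)
                if value is not None:
                    metric_values.append(value)
        if not metric_values and "all" in results:
            all_results = results["all"]
            value = all_results.get(metric)
            if value is None and fallback is not None:
                value = all_results.get(fallback)
            if value is not None:
                metric_values.append(value)
        if not metric_values:
            return None  # the diff stores None and moves on to the next task
        return float(np.mean(metric_values) * 100.0)

    # FOLIO file that reports "folio_em" rather than the expected metric name:
    print(lookup({"folio:logical_reasoning": {"folio_em": 0.5}},
                 "folio:logical_reasoning", "acc"))           # 50.0
    # 3GPP score present only under the "all" section:
    print(lookup({"all": {"em": 0.25}}, "3gpp:tsg", "acc"))   # 25.0
    # Nothing found anywhere:
    print(lookup({}, "telecom:qna", "acc"))                   # None

One consequence of the "all" fallback worth noting: when it fires, accs holds a single value, so the reported mean_acc is simply that one score scaled by 100.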