p50038325 committed on
Commit
0b8fd4f
·
1 Parent(s): f834299

energy-score

Browse files
Files changed (1) hide show
  1. src/leaderboard/read_evals.py +47 -4
src/leaderboard/read_evals.py CHANGED
@@ -103,12 +103,55 @@ class EvalResult:
103
 
104
  # We average all scores of a given metric (not all metrics are present in all files)
105
  # Handle metrics that could be None in the JSON
106
- metric_values = [v.get(task.metric) for k, v in data["results"].items() if task.benchmark == k]
107
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  accs = np.array([v for v in metric_values if v is not None])
109
  if len(accs) == 0:
110
- results[task.benchmark] = None
111
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  mean_acc = np.mean(accs) * 100.0
114
  results[task.benchmark] = mean_acc
 
103
 
104
  # We average all scores of a given metric (not all metrics are present in all files)
105
  # Handle metrics that could be None in the JSON
106
+ metric_values = []
107
+
108
+ # Check for results with the benchmark name
109
+ for k, v in data["results"].items():
110
+ if task.benchmark == k:
111
+ # Try the expected metric name first
112
+ metric_value = v.get(task.metric)
113
+
114
+ # If not found, try alternative metric names
115
+ if metric_value is None:
116
+ # For FOLIO, also check for "folio_em"
117
+ if task.benchmark == "folio:logical_reasoning" and "folio_em" in v:
118
+ metric_value = v.get("folio_em")
119
+ # For TELECOM-QnA, also check for "telecom_qna_em"
120
+ elif task.benchmark == "telecom:qna" and "telecom_qna_em" in v:
121
+ metric_value = v.get("telecom_qna_em")
122
+ # For 3GPP-TSG, also check for generic "em"
123
+ elif task.benchmark == "3gpp:tsg" and "em" in v:
124
+ metric_value = v.get("em")
125
+
126
+ if metric_value is not None:
127
+ metric_values.append(metric_value)
128
+
129
  accs = np.array([v for v in metric_values if v is not None])
130
  if len(accs) == 0:
131
+ # Also check the "all" section for metrics
132
+ if "all" in data["results"]:
133
+ all_results = data["results"]["all"]
134
+
135
+ # Try the expected metric name first
136
+ metric_value = all_results.get(task.metric)
137
+
138
+ # If not found, try alternative metric names
139
+ if metric_value is None:
140
+ if task.benchmark == "folio:logical_reasoning" and "folio_em" in all_results:
141
+ metric_value = all_results.get("folio_em")
142
+ elif task.benchmark == "telecom:qna" and "telecom_qna_em" in all_results:
143
+ metric_value = all_results.get("telecom_qna_em")
144
+ elif task.benchmark == "3gpp:tsg" and "em" in all_results:
145
+ metric_value = all_results.get("em")
146
+
147
+ if metric_value is not None:
148
+ accs = np.array([metric_value])
149
+ else:
150
+ results[task.benchmark] = None
151
+ continue
152
+ else:
153
+ results[task.benchmark] = None
154
+ continue
155
 
156
  mean_acc = np.mean(accs) * 100.0
157
  results[task.benchmark] = mean_acc