p50038325 committed · Commit 0b8fd4f · 1 Parent(s): f834299
energy-score

src/leaderboard/read_evals.py  CHANGED
@@ -103,12 +103,55 @@ class EvalResult:
 
             # We average all scores of a given metric (not all metrics are present in all files)
             # Handle metrics that could be None in the JSON
-            metric_values = [
-                v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k]
+            metric_values = []
+
+            # Check for results with the benchmark name
+            for k, v in data["results"].items():
+                if task.benchmark == k:
+                    # Try the expected metric name first
+                    metric_value = v.get(task.metric)
+
+                    # If not found, try alternative metric names
+                    if metric_value is None:
+                        # For FOLIO, also check for "folio_em"
+                        if task.benchmark == "folio:logical_reasoning" and "folio_em" in v:
+                            metric_value = v.get("folio_em")
+                        # For TELECOM-QnA, also check for "telecom_qna_em"
+                        elif task.benchmark == "telecom:qna" and "telecom_qna_em" in v:
+                            metric_value = v.get("telecom_qna_em")
+                        # For 3GPP-TSG, also check for generic "em"
+                        elif task.benchmark == "3gpp:tsg" and "em" in v:
+                            metric_value = v.get("em")
+
+                    if metric_value is not None:
+                        metric_values.append(metric_value)
+
             accs = np.array([v for v in metric_values if v is not None])
             if len(accs) == 0:
-                results[task.benchmark] = None
-                continue
+                # Also check the "all" section for metrics
+                if "all" in data["results"]:
+                    all_results = data["results"]["all"]
+
+                    # Try the expected metric name first
+                    metric_value = all_results.get(task.metric)
+
+                    # If not found, try alternative metric names
+                    if metric_value is None:
+                        if task.benchmark == "folio:logical_reasoning" and "folio_em" in all_results:
+                            metric_value = all_results.get("folio_em")
+                        elif task.benchmark == "telecom:qna" and "telecom_qna_em" in all_results:
+                            metric_value = all_results.get("telecom_qna_em")
+                        elif task.benchmark == "3gpp:tsg" and "em" in all_results:
+                            metric_value = all_results.get("em")
+
+                    if metric_value is not None:
+                        accs = np.array([metric_value])
+                    else:
+                        results[task.benchmark] = None
+                        continue
+                else:
+                    results[task.benchmark] = None
+                    continue
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
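For readers skimming the diff, the sketch below mirrors the lookup order the new code implements: the expected metric name first, then the benchmark-specific fallback names, then the "all" section. The lookup helper, the FALLBACKS table, and the sample payloads are illustrative assumptions written for this note, not code from the repository.

    import numpy as np

    # Benchmark-specific fallback metric names, copied from the diff above.
    FALLBACKS = {
        "folio:logical_reasoning": "folio_em",
        "telecom:qna": "telecom_qna_em",
        "3gpp:tsg": "em",
    }

    def lookup(results, benchmark, metric):
        """Resolve a score the way the new code does: exact metric name,
        then the benchmark-specific fallback, then the "all" section."""
        fallback = FALLBACKS.get(benchmark)
        metric_values = []
        for k, v in results.items():
            if k == benchmark:
                value = v.get(metric)
                if value is None and fallback is not None:
                    value = v.get(fallback)
                if value is not None:
                    metric_values.append(value)
        if not metric_values and "all" in results:
            all_results = results["all"]
            value = all_results.get(metric)
            if value is None and fallback is not None:
                value = all_results.get(fallback)
            if value is not None:
                metric_values.append(value)
        if not metric_values:
            return None  # the diff stores None and moves on to the next task
        return float(np.mean(metric_values) * 100.0)

    # FOLIO file that reports "folio_em" rather than the expected metric name:
    print(lookup({"folio:logical_reasoning": {"folio_em": 0.5}},
                 "folio:logical_reasoning", "acc"))           # 50.0
    # 3GPP score present only under the "all" section:
    print(lookup({"all": {"em": 0.25}}, "3gpp:tsg", "acc"))   # 25.0
    # Nothing found anywhere:
    print(lookup({}, "telecom:qna", "acc"))                   # None

One consequence of the "all" fallback worth noting: when it fires, accs holds a single value, so the reported mean_acc is simply that one score scaled by 100.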