Commit 7321625 by p50038325
Parent(s): 0b8fd4f

energy-score

src/leaderboard/read_evals.py CHANGED (+53 -17)
@@ -105,26 +105,45 @@ class EvalResult:
             # Handle metrics that could be None in the JSON
             metric_values = []
 
+            # Define the expected metric name and alternative names for each benchmark
+            expected_metric = task.metric
+            alternative_metrics = []
+
+            print(f"Processing benchmark: {task.benchmark}, expected metric: {expected_metric}")
+
+            # Set up alternative metric names based on the benchmark
+            if task.benchmark == "custom|folio:logical_reasoning|0":
+                if expected_metric != "folio_em":
+                    alternative_metrics = ["folio_em"]
+            elif task.benchmark == "custom|telecom:qna|0":
+                if expected_metric != "telecom_qna_em":
+                    alternative_metrics = ["telecom_qna_em"]
+            elif task.benchmark == "custom|3gpp:tsg|0":
+                if expected_metric != "em":
+                    alternative_metrics = ["em"]
+            elif task.benchmark == "custom|math:problem_solving|0":
+                if expected_metric != "math_metric":
+                    alternative_metrics = ["math_metric"]
+            elif task.benchmark == "custom|spider:text2sql|0":
+                if expected_metric != "sql_metric":
+                    alternative_metrics = ["sql_metric"]
+
             # Check for results with the benchmark name
             for k, v in data["results"].items():
                 if task.benchmark == k:
                     # Try the expected metric name first
-                    metric_value = v.get(
+                    metric_value = v.get(expected_metric)
 
                     # If not found, try alternative metric names
                     if metric_value is None:
-
-
-
-
-                        elif task.benchmark == "telecom:qna" and "telecom_qna_em" in v:
-                            metric_value = v.get("telecom_qna_em")
-                        # For 3GPP-TSG, also check for generic "em"
-                        elif task.benchmark == "3gpp:tsg" and "em" in v:
-                            metric_value = v.get("em")
+                        for alt_metric in alternative_metrics:
+                            if alt_metric in v:
+                                metric_value = v.get(alt_metric)
+                                break
 
                     if metric_value is not None:
                         metric_values.append(metric_value)
+                        print(f"Found metric value for {task.benchmark}: {metric_value}")
 
             accs = np.array([v for v in metric_values if v is not None])
             if len(accs) == 0:
@@ -132,20 +151,22 @@ class EvalResult:
                 if "all" in data["results"]:
                     all_results = data["results"]["all"]
 
+                    print(f"Checking 'all' section for {task.benchmark}, available keys: {list(all_results.keys())}")
+
                     # Try the expected metric name first
-                    metric_value = all_results.get(
+                    metric_value = all_results.get(expected_metric)
 
                     # If not found, try alternative metric names
                     if metric_value is None:
-
-
-
-
-
-                        metric_value = all_results.get("em")
+                        for alt_metric in alternative_metrics:
+                            if alt_metric in all_results:
+                                metric_value = all_results.get(alt_metric)
+                                print(f"Found alternative metric {alt_metric} in 'all' section")
+                                break
 
                     if metric_value is not None:
                         accs = np.array([metric_value])
+                        print(f"Found metric value in 'all' section for {task.benchmark}: {metric_value}")
                 else:
                     results[task.benchmark] = None
                     continue
@@ -155,6 +176,7 @@ class EvalResult:
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+            print(f"Final result for {task.benchmark}: {mean_acc}")
 
         # Extract energy score if available
         energy_score = "NA"
@@ -217,8 +239,10 @@ class EvalResult:
             AutoEvalColumn.energy_score.name: self.energy_score,
         }
 
+        print(f"\nConverting to dict for model: {self.full_model}")
        for task in Tasks:
            result = self.results.get(task.value.benchmark)
+            print(f"  Task: {task.value.col_name}, Benchmark: {task.value.benchmark}, Result: {result}")
            data_dict[task.value.col_name] = "NA" if result is None else round(result, 2)
 
        return data_dict
@@ -268,20 +292,32 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         try:
             # Creation of result
+            print(f"\nProcessing file: {model_result_filepath}")
             eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
             # Skip entries with Unknown/Unknown model name
             if eval_result.full_model == "Unknown/Unknown":
                 print(f"Skipping invalid result file: {model_result_filepath}")
                 continue
+
+            print(f"Model: {eval_result.full_model}")
+            print(f"Results before update_with_request_file:")
+            for benchmark, value in eval_result.results.items():
+                print(f"  {benchmark}: {value}")
 
             eval_result.update_with_request_file(requests_path)
+
+            print(f"Results after update_with_request_file:")
+            for benchmark, value in eval_result.results.items():
+                print(f"  {benchmark}: {value}")
 
             # Store results of same eval together
             eval_name = eval_result.eval_name
             if eval_name in eval_results.keys():
+                print(f"Updating existing results for {eval_name}")
                 eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
             else:
+                print(f"Adding new results for {eval_name}")
                 eval_results[eval_name] = eval_result
         except Exception as e:
             print(f"Error processing result file {model_result_filepath}: {str(e)}")
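For reference, a minimal standalone sketch (not part of the commit) of how the alternative-metric fallback added above resolves a score. The benchmark key and metric names mirror the ones referenced in the diff; the sample payload, the assumed value of task.metric, and the hard-coded alternative list are hypothetical.

import numpy as np

# Hypothetical results payload: the score is stored under the alternative
# name "telecom_qna_em" rather than the metric name the task expects.
data = {"results": {"custom|telecom:qna|0": {"telecom_qna_em": 0.5}}}

expected_metric = "em"                    # assumed value of task.metric
alternative_metrics = ["telecom_qna_em"]  # set by the per-benchmark branch in the diff

v = data["results"]["custom|telecom:qna|0"]
metric_value = v.get(expected_metric)     # None: the expected name is missing
if metric_value is None:
    for alt_metric in alternative_metrics:
        if alt_metric in v:
            metric_value = v.get(alt_metric)
            break

# Averaged and scaled exactly as in the diff; to_dict later rounds to 2 decimals.
print(np.mean(np.array([metric_value])) * 100.0)  # 50.0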