Ahmed Ahmed committed on
Commit
c99a049
·
1 Parent(s): 25de5ef

consolidate

Browse files
src/display/utils.py CHANGED
@@ -28,8 +28,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
28
  #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
- # All perplexity scores show with ⬇️ since lower is better
32
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"{task.value.col_name} ⬇️", "number", True)])
33
  # Model information
34
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
35
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
28
  #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
+ # Use exact column name from Tasks
32
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
33
  # Model information
34
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
35
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/leaderboard/read_evals.py CHANGED
@@ -78,9 +78,11 @@ class EvalResult:
78
  """Converts the Eval Result to a dict compatible with our dataframe display"""
79
  # Calculate average, handling perplexity (lower is better)
80
  scores = []
 
81
  for task in Tasks:
82
  if task.value.benchmark in self.results:
83
  score = self.results[task.value.benchmark]
 
84
  # Convert perplexity to a 0-100 scale where lower perplexity = higher score
85
  # Using a log scale since perplexity can vary widely
86
  # Cap at 100 for very low perplexity and 0 for very high perplexity
@@ -106,14 +108,11 @@ class EvalResult:
106
  AutoEvalColumn.likes.name: 0, # Default likes
107
  }
108
 
109
- for task in Tasks:
110
- benchmark = task.value.benchmark
111
- if benchmark in self.results:
112
- score = self.results[benchmark]
113
- # Store original perplexity score (lower is better)
114
- data_dict[f"{task.value.col_name} ⬇️"] = score
115
- else:
116
- data_dict[f"{task.value.col_name} ⬇️"] = None
117
 
118
  return data_dict
119
 
@@ -131,22 +130,27 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
131
 
132
  eval_results = {}
133
  for model_result_filepath in model_result_filepaths:
134
- # Creation of result
135
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
136
 
137
- # Store results of same eval together
138
- eval_name = eval_result.eval_name
139
- if eval_name in eval_results.keys():
140
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
141
- else:
142
- eval_results[eval_name] = eval_result
 
 
 
143
 
144
  results = []
145
  for v in eval_results.values():
146
  try:
147
  v.to_dict() # we test if the dict version is complete
148
  results.append(v)
149
- except KeyError: # not all eval values present
 
150
  continue
151
 
152
  return results
 
78
  """Converts the Eval Result to a dict compatible with our dataframe display"""
79
  # Calculate average, handling perplexity (lower is better)
80
  scores = []
81
+ perplexity_score = None
82
  for task in Tasks:
83
  if task.value.benchmark in self.results:
84
  score = self.results[task.value.benchmark]
85
+ perplexity_score = score # Save the raw score
86
  # Convert perplexity to a 0-100 scale where lower perplexity = higher score
87
  # Using a log scale since perplexity can vary widely
88
  # Cap at 100 for very low perplexity and 0 for very high perplexity
 
108
  AutoEvalColumn.likes.name: 0, # Default likes
109
  }
110
 
111
+ # Add perplexity score with the exact column name from Tasks
112
+ if perplexity_score is not None:
113
+ data_dict[Tasks.task0.value.col_name] = perplexity_score
114
+ else:
115
+ data_dict[Tasks.task0.value.col_name] = None
 
 
 
116
 
117
  return data_dict
118
 
 
130
 
131
  eval_results = {}
132
  for model_result_filepath in model_result_filepaths:
133
+ try:
134
+ # Creation of result
135
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
136
 
137
+ # Store results of same eval together
138
+ eval_name = eval_result.eval_name
139
+ if eval_name in eval_results.keys():
140
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
141
+ else:
142
+ eval_results[eval_name] = eval_result
143
+ except Exception as e:
144
+ print(f"Error processing result file {model_result_filepath}: {e}")
145
+ continue
146
 
147
  results = []
148
  for v in eval_results.values():
149
  try:
150
  v.to_dict() # we test if the dict version is complete
151
  results.append(v)
152
+ except KeyError as e: # not all eval values present
153
+ print(f"Error converting result to dict: {e}")
154
  continue
155
 
156
  return results