Spaces:
Runtime error
Runtime error
Ahmed Ahmed
committed on
Commit
·
c99a049
1
Parent(s):
25de5ef
consolidate
Browse files
- src/display/utils.py +2 -2
- src/leaderboard/read_evals.py +21 -17
src/display/utils.py
CHANGED
@@ -28,8 +28,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
28 |
#Scores
|
29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
30 |
for task in Tasks:
|
31 |
-
#
|
32 |
-
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(
|
33 |
# Model information
|
34 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
35 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
|
28 |
#Scores
|
29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
30 |
for task in Tasks:
|
31 |
+
# Use exact column name from Tasks
|
32 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
33 |
# Model information
|
34 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
35 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
src/leaderboard/read_evals.py
CHANGED
@@ -78,9 +78,11 @@ class EvalResult:
|
|
78 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
79 |
# Calculate average, handling perplexity (lower is better)
|
80 |
scores = []
|
|
|
81 |
for task in Tasks:
|
82 |
if task.value.benchmark in self.results:
|
83 |
score = self.results[task.value.benchmark]
|
|
|
84 |
# Convert perplexity to a 0-100 scale where lower perplexity = higher score
|
85 |
# Using a log scale since perplexity can vary widely
|
86 |
# Cap at 100 for very low perplexity and 0 for very high perplexity
|
@@ -106,14 +108,11 @@ class EvalResult:
|
|
106 |
AutoEvalColumn.likes.name: 0, # Default likes
|
107 |
}
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
data_dict[f"{task.value.col_name} ⬇️"] = score
|
115 |
-
else:
|
116 |
-
data_dict[f"{task.value.col_name} ⬇️"] = None
|
117 |
|
118 |
return data_dict
|
119 |
|
@@ -131,22 +130,27 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
|
|
131 |
|
132 |
eval_results = {}
|
133 |
for model_result_filepath in model_result_filepaths:
|
134 |
-
|
135 |
-
|
|
|
136 |
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
|
|
|
|
|
|
143 |
|
144 |
results = []
|
145 |
for v in eval_results.values():
|
146 |
try:
|
147 |
v.to_dict() # we test if the dict version is complete
|
148 |
results.append(v)
|
149 |
-
except KeyError: # not all eval values present
|
|
|
150 |
continue
|
151 |
|
152 |
return results
|
|
|
78 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
79 |
# Calculate average, handling perplexity (lower is better)
|
80 |
scores = []
|
81 |
+
perplexity_score = None
|
82 |
for task in Tasks:
|
83 |
if task.value.benchmark in self.results:
|
84 |
score = self.results[task.value.benchmark]
|
85 |
+
perplexity_score = score # Save the raw score
|
86 |
# Convert perplexity to a 0-100 scale where lower perplexity = higher score
|
87 |
# Using a log scale since perplexity can vary widely
|
88 |
# Cap at 100 for very low perplexity and 0 for very high perplexity
|
|
|
108 |
AutoEvalColumn.likes.name: 0, # Default likes
|
109 |
}
|
110 |
|
111 |
+
# Add perplexity score with the exact column name from Tasks
|
112 |
+
if perplexity_score is not None:
|
113 |
+
data_dict[Tasks.task0.value.col_name] = perplexity_score
|
114 |
+
else:
|
115 |
+
data_dict[Tasks.task0.value.col_name] = None
|
|
|
|
|
|
|
116 |
|
117 |
return data_dict
|
118 |
|
|
|
130 |
|
131 |
eval_results = {}
|
132 |
for model_result_filepath in model_result_filepaths:
|
133 |
+
try:
|
134 |
+
# Creation of result
|
135 |
+
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
136 |
|
137 |
+
# Store results of same eval together
|
138 |
+
eval_name = eval_result.eval_name
|
139 |
+
if eval_name in eval_results.keys():
|
140 |
+
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
141 |
+
else:
|
142 |
+
eval_results[eval_name] = eval_result
|
143 |
+
except Exception as e:
|
144 |
+
print(f"Error processing result file {model_result_filepath}: {e}")
|
145 |
+
continue
|
146 |
|
147 |
results = []
|
148 |
for v in eval_results.values():
|
149 |
try:
|
150 |
v.to_dict() # we test if the dict version is complete
|
151 |
results.append(v)
|
152 |
+
except KeyError as e: # not all eval values present
|
153 |
+
print(f"Error converting result to dict: {e}")
|
154 |
continue
|
155 |
|
156 |
return results
|