Spaces: Runtime error

Ahmed Ahmed committed · Commit 25de5ef · Parent(s): c1fc4e2

consolidate

Files changed:
- src/display/utils.py +2 -3
- src/leaderboard/read_evals.py +6 -2
src/display/utils.py CHANGED

@@ -28,9 +28,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    #
-
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"{task.value.col_name} {arrow}", "number", True)])
+    # All perplexity scores show with ⬇️ since lower is better
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"{task.value.col_name} ⬇️", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
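For context, the entries above follow the usual Hugging Face leaderboard-template pattern: each [attribute_name, type, default] triple in auto_eval_column_dict is turned into a frozen dataclass, and the ColumnContent.name string becomes the column header shown in the table, which is why the " ⬇️" suffix is baked into the header here. The sketch below only illustrates that pattern; the exact ColumnContent fields in this Space are assumptions and may differ.

# Illustrative sketch of the column-definition pattern; the ColumnContent field
# names are assumptions inferred from the calls visible in the diff.
from dataclasses import dataclass, make_dataclass

@dataclass
class ColumnContent:
    name: str                   # header shown in the table, e.g. "Average ⬆️" or "<task> ⬇️"
    type: str                   # display type, e.g. "number" or "str"
    displayed_by_default: bool  # whether the column is visible without toggling

auto_eval_column_dict = []
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])

# Each [attribute, type, default] triple becomes a field of a frozen dataclass,
# so other modules can refer to columns as AutoEvalColumn.<attribute>.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.average.name)     # "Average ⬆️"
print(AutoEvalColumn.model_type.name)  # "Type"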
src/leaderboard/read_evals.py CHANGED

@@ -100,6 +100,10 @@ class EvalResult:
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            # Add missing columns with default values
+            AutoEvalColumn.license.name: "Unknown",  # Default license
+            AutoEvalColumn.params.name: 0,  # Default params
+            AutoEvalColumn.likes.name: 0,  # Default likes
         }

         for task in Tasks:
@@ -107,9 +111,9 @@
             if benchmark in self.results:
                 score = self.results[benchmark]
                 # Store original perplexity score (lower is better)
-                data_dict[task.value.col_name] = score
             else:
-                data_dict[task.value.col_name] = None
+                data_dict[f"{task.value.col_name} ⬇️"] = score
             else:
+                data_dict[f"{task.value.col_name} ⬇️"] = None

         return data_dict

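Two things to note about this change: the new license/params/likes entries only supply placeholder values so that the columns declared in src/display/utils.py always exist in each row, and the dictionary keys written in the task loop must now match the utils.py column headers character for character, including the " ⬇️" suffix. The snippet below is a minimal, hypothetical illustration of why the key must match once the per-model dicts are assembled into a pandas DataFrame for display; the task name "WikiText" is an assumption, not necessarily one of this Space's tasks.

# Minimal sketch (hypothetical names): a dict key that does not match the
# declared column header silently becomes NaN in the displayed DataFrame.
import pandas as pd

col_name = "WikiText"        # assumed example task display name
header = f"{col_name} ⬇️"    # header as now built in src/display/utils.py

row_new = {"Model": "model-a", header: 12.3}    # key matches the header
row_old = {"Model": "model-b", col_name: 12.3}  # old key without " ⬇️"

df = pd.DataFrame([row_new, row_old], columns=["Model", header])
print(df[header].tolist())  # [12.3, nan] -> the old-style key never reaches the column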