Basma Boussaha committed · Commit ae5099a · 1 Parent(s): b738c66

Commit message: leaderboard

Files changed:
- app.py (+14, -15)
- src/about.py (+13, -14)
- src/display/utils.py (+60, -68)
- src/leaderboard/read_evals.py (+97, -139)
- src/populate.py (+4, -4)
app.py CHANGED

@@ -14,15 +14,11 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
     fields,
-    WeightType,
-    Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -34,14 +30,14 @@ def restart_space():
 
 ### Space initialisation
 try:
-    print(EVAL_REQUESTS_PATH)
+    # print(EVAL_REQUESTS_PATH)
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
 try:
-    print(EVAL_RESULTS_PATH)
+    # print(EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -49,7 +45,7 @@ except Exception:
     restart_space()


-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS)

 (
     finished_eval_queue_df,
@@ -63,12 +59,15 @@ def init_leaderboard(dataframe):
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-        ),
-        search_columns=[
+        # select_columns=SelectColumns(
+        #     default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+        #     cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+        #     label="Select Columns to Display:",
+        # ),
+        search_columns=[
+            AutoEvalColumn.team.name,
+            AutoEvalColumn.submitter.name,
+        ],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         # filter_columns=[
         #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
@@ -85,14 +84,14 @@ def init_leaderboard(dataframe):
         # ),
         # ],
         # bool_checkboxgroup_label="Hide models",
-        interactive=
+        interactive=True,
     )


 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 E2LMC Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
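For context on the `gradio_leaderboard` component configured above, here is a stand-alone sketch of the behaviour this commit switches to: free-text search over the Team and Submitter columns, with the table left interactive and the column-selection widget dropped. The dataframe is dummy data, not the Space's real leaderboard; the sketch assumes `gradio` and `gradio_leaderboard` are installed and that `Leaderboard` accepts a plain list of column names for `search_columns`.

import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard

# Stand-in for the dataframe returned by get_leaderboard_df(); column names mirror
# the display names defined in src/display/utils.py.
df = pd.DataFrame(
    {
        "Submission Hash": ["abc123", "def456"],
        "Submitter": ["alice", "bob"],
        "Team": ["Team A", "Team B"],
        "Consistency Score": [0.91, 0.88],
        "Evaluation Score": [0.75, 0.80],
        "Global Score": [0.83, 0.84],
    }
)

with gr.Blocks() as demo:
    # Same idea as init_leaderboard() after this commit: a searchable, interactive table.
    Leaderboard(
        value=df,
        search_columns=["Team", "Submitter"],  # AutoEvalColumn.team.name / .submitter.name
        interactive=True,
    )

if __name__ == "__main__":
    demo.launch()

With `select_columns` commented out, every non-hidden column stays visible and search becomes the main way to narrow the table.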
src/about.py CHANGED

@@ -1,31 +1,30 @@
 from dataclasses import dataclass
 from enum import Enum

-@dataclass
-class Task:
+# @dataclass
+# class Task:
+#     benchmark: str
+#     metric: str
+#     col_name: str


-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
+# # Select your tasks here
+# # ---------------------------------------------------
+# class Tasks(Enum):
+#     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+#     task0 = Task("anli_r1", "acc", "ANLI")
+#     task1 = Task("logiqa", "acc_norm", "LogiQA")

-NUM_FEWSHOT = 0 # Change with your few shot
+# NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------



 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">NeurIPS 2025 E2LM Competition: Early Training Evaluation of Language Models Leaderboard</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
 """

 # Which evaluations are you running? how can people reproduce what you have?
src/display/utils.py CHANGED

@@ -3,8 +3,6 @@ from enum import Enum

 import pandas as pd

-from src.about import Tasks
-
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

@@ -23,22 +21,16 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["submission_hash", ColumnContent, ColumnContent("Submission Hash", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["submitter", ColumnContent, ColumnContent("Submitter", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["description", ColumnContent, ColumnContent("Description", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["team", ColumnContent, ColumnContent("Team", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["submitted_at", ColumnContent, ColumnContent("Submission Time", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["consistency_score", ColumnContent, ColumnContent("Consistency Score", "number", True, never_hidden=True)])
+auto_eval_column_dict.append(["evaluation_score", ColumnContent, ColumnContent("Evaluation Score", "number", True, never_hidden=True)])
+# auto_eval_column_dict.append(["score3", ColumnContent, ColumnContent("Score 3", "number", True, never_hidden=True)])
+auto_eval_column_dict.append(["global_score", ColumnContent, ColumnContent("Global Score", "number", True, never_hidden=True)])
+

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -46,59 +38,59 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
+    submission_hash = ColumnContent("submission_hash", "str", True)
+    submitter = ColumnContent("submitter", "str", True)
+    team = ColumnContent("team", "bool", True)
     precision = ColumnContent("precision", "str", True)
+    submitted_at = ColumnContent("submitted_at", "str", True)
     status = ColumnContent("status", "str", True)

 ## All the model information that we might need
-@dataclass
-class ModelDetails:
-
-class ModelType(Enum):
-
-class WeightType(Enum):
-
-class Precision(Enum):
+# @dataclass
+# class ModelDetails:
+#     name: str
+#     display_name: str = ""
+#     symbol: str = ""  # emoji
+
+
+# class ModelType(Enum):
+#     PT = ModelDetails(name="pretrained", symbol="🟢")
+#     FT = ModelDetails(name="fine-tuned", symbol="🔶")
+#     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+#     RL = ModelDetails(name="RL-tuned", symbol="🟦")
+#     Unknown = ModelDetails(name="", symbol="?")
+
+#     def to_str(self, separator=" "):
+#         return f"{self.value.symbol}{separator}{self.value.name}"
+
+#     @staticmethod
+#     def from_str(type):
+#         if "fine-tuned" in type or "🔶" in type:
+#             return ModelType.FT
+#         if "pretrained" in type or "🟢" in type:
+#             return ModelType.PT
+#         if "RL-tuned" in type or "🟦" in type:
+#             return ModelType.RL
+#         if "instruction-tuned" in type or "⭕" in type:
+#             return ModelType.IFT
+#         return ModelType.Unknown
+
+# class WeightType(Enum):
+#     Adapter = ModelDetails("Adapter")
+#     Original = ModelDetails("Original")
+#     Delta = ModelDetails("Delta")
+
+# class Precision(Enum):
+#     float16 = ModelDetails("float16")
+#     bfloat16 = ModelDetails("bfloat16")
+#     Unknown = ModelDetails("?")
+
+#     def from_str(precision):
+#         if precision in ["torch.float16", "float16"]:
+#             return Precision.float16
+#         if precision in ["torch.bfloat16", "bfloat16"]:
+#             return Precision.bfloat16
+#         return Precision.Unknown

 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
@@ -106,5 +98,5 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+# BENCHMARK_COLS = [t.value.col_name for t in Tasks]

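The column plumbing in this file is easier to see in isolation: `ColumnContent` entries are appended to a list, `make_dataclass` turns that list into the frozen `AutoEvalColumn` class, and the local `fields()` helper walks the class attributes to build `COLS`. Below is a minimal self-contained sketch of the same pattern; the `ColumnContent` definition here is an assumption reconstructed from how it is used in this file (name, type, displayed_by_default, hidden, never_hidden), not copied from the repository.

from dataclasses import dataclass, make_dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Same helper as above: return the class-level ColumnContent values, skipping dunders.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

# Two example columns following the append pattern used in src/display/utils.py.
auto_eval_column_dict = [
    ["team", ColumnContent, ColumnContent("Team", "str", True, never_hidden=True)],
    ["global_score", ColumnContent, ColumnContent("Global Score", "number", True, never_hidden=True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)                      # ['Team', 'Global Score']
print(AutoEvalColumn.team.name)  # 'Team' -- the display name used as a dataframe column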
src/leaderboard/read_evals.py CHANGED

@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn
+from src.display.utils import AutoEvalColumn
 from src.submission.check_validity import is_model_on_hub


@@ -16,21 +16,16 @@ from src.submission.check_validity import is_model_on_hub
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
     """
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
+
+    submission_hash: str
+    submitter: str = "Unknown"  # Who submitted the model, if available
+    description: str = "No description provided"  # Description of the model, if available
+    team: str = "Unknown"  # Team or organization behind the model, if available
+    submitted_at: str = ""  # Date when the model was submitted, if available
+    consistency_score: float = 0.0  # Score for the first metric, if available
+    evaluation_score: float = 0.0  # Score for the second metric, if available
+    # score3: float = 0.0  # Score for the third metric, if available
+    global_score: float = 0.0  # Global score, if available

     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -38,120 +33,78 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)

-        # We average all scores of a given metric (not all metrics are present in all files)
-        accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-        if accs.size == 0 or any([acc is None for acc in accs]):
-            continue
-
-        mean_acc = np.mean(accs) * 100.0
-        results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+        for key in data.keys():
+            if isinstance(data[key], dict):
+                submission_hash = key
+                submission_details = data[key]
+                if isinstance(submission_details, dict):
+                    # Extracting details from the submission
+                    return self(
+                        submission_hash = submission_hash,
+                        submitter = submission_details.get("submitter", "Unknown"),
+                        description = submission_details.get("description", "No description provided"),
+                        team = submission_details.get("team", "Unknown"),
+                        submitted_at = submission_details.get("submitted_at", ""),
+                        consistency_score = submission_details.get("consistency_score", 0.0),
+                        evaluation_score = submission_details.get("evaluation_score", 0.0),
+                        # score3 = submission_details.get("score3", 0.0),
+                        global_score = submission_details.get("global_score", 0.0)
+                    )
+
+    # def update_with_request_file(self, requests_path):
+    #     """Finds the relevant request file for the current model and updates info with it"""
+    #     request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+    #     try:
+    #         with open(request_file, "r") as f:
+    #             request = json.load(f)
+    #         self.model_type = ModelType.from_str(request.get("model_type", ""))
+    #         self.weight_type = WeightType[request.get("weight_type", "Original")]
+    #         self.license = request.get("license", "?")
+    #         self.likes = request.get("likes", 0)
+    #         self.num_params = request.get("params", 0)
+    #         self.date = request.get("submitted_time", "")
+    #     except Exception:
+    #         print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            # "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.submission_hash.name: self.submission_hash,
+            AutoEvalColumn.submitter.name: self.submitter,
+            AutoEvalColumn.description.name: self.description,
+            AutoEvalColumn.team.name: self.team,
+            AutoEvalColumn.submitted_at.name: self.submitted_at,
+            AutoEvalColumn.consistency_score.name: self.consistency_score,
+            AutoEvalColumn.evaluation_score.name: self.evaluation_score,
+            # AutoEvalColumn.score3.name: self.score1,
+            AutoEvalColumn.global_score.name: self.global_score,
         }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
         return data_dict


-def get_request_file_for_model(requests_path, model_name, precision):
+# def get_request_file_for_model(requests_path, model_name, precision):
+#     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+#     request_files = os.path.join(
+#         requests_path,
+#         f"{model_name}_eval_request_*.json",
+#     )
+#     request_files = glob.glob(request_files)
+
+#     # Select correct request file (precision)
+#     request_file = ""
+#     request_files = sorted(request_files, reverse=True)
+#     for tmp_request_file in request_files:
+#         with open(tmp_request_file, "r") as f:
+#             req_content = json.load(f)
+#             if (
+#                 req_content["status"] in ["FINISHED"]
+#                 and req_content["precision"] == precision.split(".")[-1]
+#             ):
+#                 request_file = tmp_request_file
+#     return request_file


 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
@@ -159,38 +112,43 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     model_result_filepaths = []

     for root, _, files in os.walk(results_path):
+        # print(files)
         # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+        # if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+        #     print(files, "continue")
+        #     continue

         # Sort the files by date
-        try:
-        except dateutil.parser._parser.ParserError:
+        # try:
+        #     files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        # except dateutil.parser._parser.ParserError:
+        #     files = [files[-1]]

         for file in files:
+            if file.endswith(".json"):
+                model_result_filepaths.append(os.path.join(root, file))

-    eval_results =
+    eval_results = []
     for model_result_filepath in model_result_filepaths:
         # Creation of result
+        # print(model_result_filepath)
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
+        # eval_result.update_with_request_file(requests_path)
+        # print('---------', eval_result)
         # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-        else:
+        # eval_name = eval_result.eval_name
+        # if eval_name in eval_results.keys():
+        #     eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        # else:
+        #     eval_results[eval_name] = eval_result
+        eval_results.append(eval_result)

     results = []
-    for v in eval_results
+    for v in eval_results:
         try:
             v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue

-    return
+    return eval_results
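The reworked `init_from_json_file` walks the top level of the results JSON and treats the first dict-valued key as the submission hash, then reads every field with a `.get()` default. The shape below is inferred from that parsing code rather than from any documented schema; the hash, names, dates, and scores are invented for illustration.

import json
import tempfile

# A results file shaped the way init_from_json_file expects: one top-level key per
# submission hash, mapping to the submission details (all values are made up).
example = {
    "9f2c1e": {
        "submitter": "alice",
        "description": "baseline run",
        "team": "Team A",
        "submitted_at": "2025-06-01T12:00:00",
        "consistency_score": 0.91,
        "evaluation_score": 0.75,
        "global_score": 0.83,
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fp:
    json.dump(example, fp)
    path = fp.name

# Same traversal as init_from_json_file: the first dict-valued key is the submission hash,
# and every detail is read with a defensive default.
with open(path) as fp:
    data = json.load(fp)
for submission_hash, details in data.items():
    if isinstance(details, dict):
        print(submission_hash, details.get("team", "Unknown"), details.get("global_score", 0.0))
        # -> 9f2c1e Team A 0.83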
src/populate.py CHANGED

@@ -8,17 +8,17 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results


-def get_leaderboard_df(results_path: str, requests_path: str, cols: list
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-
+    # print('=========', all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.
+    df = df.sort_values(by=[AutoEvalColumn.global_score.name], ascending=False)
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    # df = df[has_no_nan_values(df, benchmark_cols)]
     return df

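A toy run of the two lines that change behaviour in `get_leaderboard_df`: sort the records on the Global Score display column in descending order, then round the numeric columns. Column names and values are stand-ins for the records produced by `EvalResult.to_dict()`.

import pandas as pd

# Stand-in records, as if returned by [v.to_dict() for v in raw_data].
records = [
    {"Submitter": "alice", "Global Score": 0.8312},
    {"Submitter": "bob", "Global Score": 0.8441},
]
df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["Global Score"], ascending=False)  # best submission first
df = df[["Submitter", "Global Score"]].round(decimals=2)   # keep display columns, round scores
print(df)
#   Submitter  Global Score
# 1       bob          0.84
# 0     alice          0.83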