Commit 77c0f20
Ahmed Ahmed committed
Parent(s): 70ea05e

consolidate

Files changed:
- app.py +32 -182
- src/about.py +24 -41
- src/display/utils.py +3 -1
- src/envs.py +2 -11
- src/leaderboard/read_evals.py +31 -79
- src/populate.py +4 -45
app.py
CHANGED

@@ -1,13 +1,9 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard
+from gradio_leaderboard import Leaderboard
 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
 from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -16,80 +12,24 @@ from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
 )
-from src.envs import API,
-from src.populate import
-from src.submission.submit import add_new_eval
+from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN
+from src.populate import get_leaderboard_df
 from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
 
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
     return Leaderboard(
-
-
-
-
-
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
+        dataframe,
+        headers=COLS,
+        column_config={
+            AutoEvalColumn.model.name: "markdown",
+        },
     )
 
-
 def run_perplexity_test(model_name, revision, precision):
     """Run perplexity evaluation on demand."""
     if not model_name:
@@ -102,140 +42,50 @@ def run_perplexity_test(model_name, revision, precision):
     else:
         return f"❌ Evaluation failed: {result}"
 
+# Initialize results directory
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception as e:
+    print(f"Error initializing results: {e}")
 
+# Get initial leaderboard data
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+
+# Create the Gradio interface
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅
+        with gr.TabItem("🏅 Leaderboard", elem_id="leaderboard-tab", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("📝 About", elem_id="
+        with gr.TabItem("📝 About", elem_id="about-tab", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
+        with gr.TabItem("🧪 Test Model", elem_id="test-model-tab", id=2):
             with gr.Row():
                 with gr.Column():
-
-
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
+                    model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
+                    revision = gr.Textbox(label="Revision", placeholder="main", value="main")
                     precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-        with gr.TabItem("🧪 Dynamic Testing", elem_id="dynamic-testing-tab", id=4):
-            gr.Markdown("## Run Perplexity Evaluation")
-
-            with gr.Row():
-                with gr.Column():
-                    dynamic_model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
-                    dynamic_revision = gr.Textbox(label="Revision", placeholder="main", value="main")
-                    dynamic_precision = gr.Dropdown(
                         choices=["float16", "bfloat16"],
                         label="Precision",
                         value="float16"
                     )
 
                 with gr.Column():
-
-
+                    test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
+                    result = gr.Markdown()
 
-
+            test_button.click(
                 run_perplexity_test,
-                [
-
-            )
-
-            with gr.Row():
-                with gr.Accordion("📙 Citation", open=False):
-                    citation_button = gr.Textbox(
-                        value=CITATION_BUTTON_TEXT,
-                        label=CITATION_BUTTON_LABEL,
-                        lines=20,
-                        elem_id="citation-button",
-                        show_copy_button=True,
+                [model_name, revision, precision],
+                result
             )
 
-
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=5).launch()
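Note: the body of the new run_perplexity_test handler is only partly visible in this diff (the viewer truncates several lines). As orientation only, a minimal sketch of a handler wired the way the Test Model tab is wired might look like the following; the assumption that run_dynamic_perplexity_eval takes (model_name, revision, precision) and returns a (success, result) pair is inferred from the visible success/failure branch and is not confirmed by the diff.

```python
# Hypothetical sketch, not the committed implementation.
from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval  # import path matches the diff


def run_perplexity_test(model_name: str, revision: str, precision: str) -> str:
    """Run a perplexity evaluation on demand and return a Markdown status string."""
    if not model_name:
        return "❌ Please provide a model name."

    # Assumed contract: (success: bool, result: float | str)
    success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
    if success:
        return f"✅ Perplexity for {model_name}: {result:.2f}"
    else:
        return f"❌ Evaluation failed: {result}"
```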
src/about.py
CHANGED

@@ -7,67 +7,50 @@ class Task:
     metric: str
     col_name: str
 
-
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task2 = Task("perplexity", "perplexity", "Perplexity")
+    task0 = Task("perplexity", "perplexity", "Perplexity")
 
-NUM_FEWSHOT = 0 #
+NUM_FEWSHOT = 0 # Not used for perplexity
 # ---------------------------------------------------
 
-
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">Model Perplexity Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+This leaderboard evaluates language models based on their perplexity scores on a fixed test passage.
+Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
 """
 
-# Which evaluations are you running?
-LLM_BENCHMARKS_TEXT =
+# Which evaluations are you running?
+LLM_BENCHMARKS_TEXT = """
 ## How it works
 
-
-
+The evaluation runs perplexity tests on language models using a fixed test passage about artificial intelligence.
+Perplexity measures how well a model predicts text - lower scores mean better predictions.
 
-
+## Test Text
 
-
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+The evaluation uses the following passage:
 ```
-
-
-
-
-
-
-
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+Artificial intelligence has transformed the way we live and work, bringing both opportunities and challenges.
+From autonomous vehicles to language models that can engage in human-like conversation, AI technologies are becoming increasingly
+sophisticated. However, with this advancement comes the responsibility to ensure these systems are developed and deployed ethically,
+with careful consideration for privacy, fairness, and transparency. The future of AI will likely depend on how well we balance innovation
+with these important social considerations.
+```
+"""
 
-
-
+EVALUATION_QUEUE_TEXT = """
+## Before submitting a model
 
-
-
-
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+1. Make sure your model is public on the Hugging Face Hub
+2. The model should be loadable with AutoModelForCausalLM
+3. The model should support text generation tasks
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT =
-"""
+CITATION_BUTTON_TEXT = ""
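The new About text describes scoring models by perplexity on a fixed passage. For reference, this is roughly how perplexity over a short passage is usually computed with transformers; it is an illustrative sketch, not the Space's own src/evaluation/dynamic_eval.py, and TEST_TEXT here is a shortened stand-in for the full passage quoted above.

```python
# Illustrative sketch of per-passage perplexity with transformers (assumptions noted above).
import math

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

TEST_TEXT = (
    "Artificial intelligence has transformed the way we live and work, "
    "bringing both opportunities and challenges."
)  # stand-in for the fixed passage


def compute_perplexity(model_name: str, revision: str = "main") -> float:
    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
    model = AutoModelForCausalLM.from_pretrained(model_name, revision=revision)
    model.eval()

    inputs = tokenizer(TEST_TEXT, return_tensors="pt")
    with torch.no_grad():
        # With labels supplied, the model returns the mean cross-entropy loss
        # over the predicted tokens; perplexity is its exponential.
        outputs = model(**inputs, labels=inputs["input_ids"])
    return math.exp(outputs.loss.item())
```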
|
src/display/utils.py
CHANGED

@@ -28,7 +28,9 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-
+    # Add ⬆️ for metrics where higher is better, ⬇️ for metrics where lower is better
+    arrow = "⬇️" if task.value.benchmark == "perplexity" else "⬆️"
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"{task.value.col_name} {arrow}", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/envs.py
CHANGED

@@ -1,25 +1,16 @@
 import os
-
 from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "model-trace" # Change to your org
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
-# If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
-
 # Local caches
-
+CACHE_PATH = os.getenv("HF_HOME", ".")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
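With the request/queue repos removed, the Space now only needs HF_TOKEN and the results dataset. A quick sanity-check sketch (not part of the Space) that the remaining configuration is reachable before launching:

```python
# Optional sanity check: verify the token and results dataset from src/envs.py.
from huggingface_hub import HfApi

from src.envs import RESULTS_REPO, TOKEN

api = HfApi(token=TOKEN)
print(api.whoami()["name"])              # fails fast if HF_TOKEN is missing or invalid
print(api.dataset_info(RESULTS_REPO).id) # fails fast if the results dataset is unreachable
```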
src/leaderboard/read_evals.py
CHANGED

@@ -4,18 +4,13 @@ import math
 import os
 from dataclasses import dataclass
 
-import dateutil
-import numpy as np
-
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
-
 @dataclass
 class EvalResult:
-    """Represents one
-    """
+    """Represents one perplexity evaluation result."""
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
     org: str
@@ -23,13 +18,9 @@ class EvalResult:
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
+    model_type: ModelType = ModelType.PT # Default to pretrained
+    weight_type: WeightType = WeightType.Original
+    architecture: str = "Unknown"
    still_on_hub: bool = False
 
     @classmethod
@@ -66,18 +57,10 @@ class EvalResult:
         if architectures:
             architecture = ";".join(architectures)
 
-        # Extract
+        # Extract perplexity result
         results = {}
-
-
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+        if "perplexity" in data["results"]:
+            results["perplexity"] = data["results"]["perplexity"]["perplexity"]
 
         return self(
             eval_name=result_key,
@@ -85,31 +68,27 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision=
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
         )
 
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-
+        # Calculate average, handling perplexity (lower is better)
+        scores = []
+        for task in Tasks:
+            if task.value.benchmark in self.results:
+                score = self.results[task.value.benchmark]
+                # Convert perplexity to a 0-100 scale where lower perplexity = higher score
+                # Using a log scale since perplexity can vary widely
+                # Cap at 100 for very low perplexity and 0 for very high perplexity
+                score = max(0, min(100, 100 * (1 - math.log(score) / 10)))
+                scores.append(score)
+
+        average = sum(scores) / len(scores) if scores else 0
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -120,42 +99,22 @@ class EvalResult:
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for task in Tasks:
-
+            benchmark = task.value.benchmark
+            if benchmark in self.results:
+                score = self.results[benchmark]
+                # Store original perplexity score (lower is better)
+                data_dict[task.value.col_name] = score
+            else:
+                data_dict[task.value.col_name] = None
 
         return data_dict
 
-
-
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all perplexity results"""
     model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
@@ -163,12 +122,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
 
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
@@ -176,7 +129,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
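The Average column now compresses perplexity onto a 0-100 scale where lower perplexity yields a higher score, using the log-scaled, clipped mapping added in to_dict above. A few worked values of that exact formula for reference:

```python
import math

def perplexity_to_score(ppl: float) -> float:
    """Same mapping as EvalResult.to_dict: log-scaled, clipped to [0, 100]."""
    return max(0, min(100, 100 * (1 - math.log(ppl) / 10)))

print(perplexity_to_score(1.0))      # 100.0 (a perplexity of 1 hits the cap)
print(perplexity_to_score(10.0))     # ~77.0
print(perplexity_to_score(100.0))    # ~54.0
print(perplexity_to_score(22026.0))  # ~0.0  (perplexity near e**10 floors at 0)
```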
src/populate.py
CHANGED

@@ -1,58 +1,17 @@
-import json
-import os
-
 import pandas as pd
-
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn
+from src.display.utils import AutoEvalColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path
+    raw_data = get_raw_eval_results(results_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
-    # filter out if
+    # filter out if perplexity hasn't been evaluated
    df = df[has_no_nan_values(df, benchmark_cols)]
     return df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
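At the DataFrame level, the trimmed-down get_leaderboard_df just sorts the collected records by the average score and drops rows whose benchmark value is missing. A toy sketch of that behaviour; the column labels below are illustrative stand-ins for the real AutoEvalColumn names.

```python
import pandas as pd

# Toy records standing in for EvalResult.to_dict() output (column names assumed).
records = [
    {"Model": "org/model-a", "Average ⬆️": 77.0, "Perplexity ⬇️": 10.0},
    {"Model": "org/model-b", "Average ⬆️": 54.0, "Perplexity ⬇️": 100.0},
    {"Model": "org/model-c", "Average ⬆️": 0.0,  "Perplexity ⬇️": None},  # not evaluated yet
]

df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["Average ⬆️"], ascending=False)
df = df[df["Perplexity ⬇️"].notna()]  # same effect as has_no_nan_values on the benchmark columns
print(df.round(2))
```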