Commit c85dcc4 · yangzhitao committed · Parent: d1fd905
feat: enhance leaderboard functionality and refactor app structure; update token handling and project metadata; update environment settings
Files changed:
- .env.example +4 -0
- .vscode/cspell.json +2 -0
- app.py +80 -76
- pyproject.toml +5 -3
- requirements.txt +2 -0
- src/about.py +42 -32
- src/backend/config.py +22 -3
- src/backend/routes/hf.py +2 -2
- src/display/formatting.py +4 -1
- src/display/utils.py +25 -14
- src/envs.py +31 -31
- src/leaderboard/read_evals.py +22 -13
- src/populate.py +1 -1
- src/prepare.py +137 -0
- src/submission/submit.py +3 -3
- uv.lock +88 -62
.env.example CHANGED

@@ -1,2 +1,6 @@
 HF_TOKEN=changethis
 HF_HOME=.
+HF_OWNER=lmms-lab
+HF_REPO_NAME=EASI-Leaderboard
+HF_RESULTS_REPO_NAME=EASI-Leaderboard-Results
+HF_REQUESTS_REPO_NAME=EASI-Leaderboard-Requests
.vscode/cspell.json CHANGED

@@ -3,8 +3,10 @@
     "accs",
     "changethis",
     "checkboxgroup",
+    "EASI",
     "evals",
     "initialisation",
+    "lmms",
     "modelcard",
     "sentencepiece"
 ]
app.py CHANGED

@@ -6,17 +6,16 @@ import pandas as pd
 import requests
 import uvicorn
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 from rich import print
 
 from src.about import (
-    BENCHMARKS,
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    get_benchmarks,
 )
 from src.backend.app import create_app
 from src.display.css_html_js import (
@@ -28,9 +27,9 @@ from src.display.css_html_js import (
 from src.display.utils import (
     BASE_COLS,
     BENCHMARK_COLS,
-    COLS,
     EVAL_COLS,
     EVAL_TYPES,
+    NOT_SUPPORTED_COLS,
     AutoEvalColumn,
     ModelType,
     Precision,
@@ -38,8 +37,12 @@ from src.display.utils import (
 )
 from src.envs import API, settings
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.prepare import prepare_space
 from src.submission.submit import add_new_eval
 
+prepare_space()
+BENCHMARKS = get_benchmarks()
+
 
 def restart_space():
     API.restart_space(repo_id=settings.REPO_ID)
@@ -47,37 +50,13 @@ def restart_space():
 
 print("///// --- Settings --- /////", settings.model_dump())
 
-# Space initialisation
-try:
-    snapshot_download(
-        repo_id=settings.QUEUE_REPO,
-        local_dir=settings.EVAL_REQUESTS_PATH,
-        repo_type="dataset",
-        tqdm_class=None,
-        etag_timeout=30,
-        token=settings.TOKEN,
-    )
-except Exception:
-    restart_space()
-try:
-    snapshot_download(
-        repo_id=settings.RESULTS_REPO,
-        local_dir=settings.EVAL_RESULTS_PATH,
-        repo_type="dataset",
-        tqdm_class=None,
-        etag_timeout=30,
-        token=settings.TOKEN,
-    )
-except Exception:
-    restart_space()
 
-LEADERBOARD_DF = get_leaderboard_df(
-    settings.EVAL_RESULTS_PATH,
-    settings.EVAL_REQUESTS_PATH,
-    COLS,
-    BENCHMARK_COLS,
-)
+# LEADERBOARD_DF = get_leaderboard_df(
+#     settings.EVAL_RESULTS_PATH,
+#     settings.EVAL_REQUESTS_PATH,
+#     COLS,
+#     BENCHMARK_COLS,
+# )
 
 (
     finished_eval_queue_df,
@@ -90,8 +69,9 @@ def filter_dataframe_by_columns(selected_cols: list[str], original_df: pd.DataFrame) -> pd.DataFrame:
     """
     Filter the DataFrame by the selected columns
     """
-    # Always include the base columns 'T' and 'Model'
-    base_cols = ['T', 'Model']
+    # # Always include the base columns 'T' and 'Model'
+    # base_cols = ['T', 'Model']
+    base_cols = ['Model']
     all_selected_cols = [col for col in base_cols if col in original_df.columns]
 
     # Add the user-selected columns (skipping base columns already present)
@@ -175,7 +155,11 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFrame:
     return filtered_df
 
 
-def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
+def init_leaderboard_tabs(
+    dataframe: pd.DataFrame,
+    cols: list[str],
+    not_supported_cols: list[str],
+):
     # Keep the original DataFrame for later filtering (saved via a closure)
     original_df = dataframe.copy()
 
@@ -187,7 +171,9 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
     )
 
     # Initialize the displayed columns (base columns plus the default selection)
-    default_selected = [col for col in dataframe.columns if col in cols] + ['Average ⬆️']
+    default_selected = [col for col in dataframe.columns if col in cols and col not in not_supported_cols] + [
+        'Average ⬆️'
+    ]
 
     # Filter original_df by precision first
     precision_filtered_df = filter_dataframe_by_precision(default_precision, original_df)
@@ -197,8 +183,13 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
     with gr.Row():
         with gr.Column(scale=1):
             search = gr.Textbox(label="Search", placeholder="Separate multiple queries with commas")
+            column_choices = [
+                col
+                for col in dataframe.columns
+                if col not in ['T', 'Model'] and (not not_supported_cols or col not in not_supported_cols)
+            ]
             show_columns = gr.CheckboxGroup(
-                choices=[col for col in dataframe.columns if col not in ['T', 'Model']],
+                choices=column_choices,
                 label="Select Columns to Display",
                 value=default_selected,
                 interactive=True,
@@ -271,24 +262,37 @@ def main():
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
         with gr.Tabs(elem_classes="tab-buttons") as _tabs:
-            ...
+            with gr.TabItem("📝 Overview", elem_id="benchmark-overview-tab", id=0):
+                benchmark_cols = BENCHMARK_COLS.copy()
+                print("benchmark_cols:", benchmark_cols)
+                cols = BASE_COLS + benchmark_cols
+                benchmark_df = get_leaderboard_df(
+                    settings.EVAL_RESULTS_PATH,
+                    settings.EVAL_REQUESTS_PATH,
+                    cols,
+                    benchmark_cols,
+                )
+                _leaderboard = init_leaderboard_tabs(benchmark_df, benchmark_cols, NOT_SUPPORTED_COLS)
+
+            i_bench = 1
+            if False:
+                for i_bench, benchmark in enumerate(sorted(BENCHMARKS), start=1):
+                    with gr.TabItem(f"🏅 {benchmark.title}", elem_id="llm-benchmark-tab-table", id=i_bench):
+                        print(f"benchmark.title: {benchmark.title!r}")
+                        benchmark_cols = [col for col in BENCHMARK_COLS if col.startswith(benchmark.title)]
+                        cols = BASE_COLS + benchmark_cols
+                        benchmark_df = get_leaderboard_df(
+                            settings.EVAL_RESULTS_PATH,
+                            settings.EVAL_REQUESTS_PATH,
+                            cols,
+                            benchmark_cols,
+                        )
+                        _leaderboard = init_leaderboard_tabs(benchmark_df, benchmark_cols, NOT_SUPPORTED_COLS)
 
-            with gr.TabItem("📝 About", elem_id="about-tab", id=...):
+            with gr.TabItem("📝 About", elem_id="about-tab", id=i_bench + 1):
                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-            with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=...):
+            with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=i_bench + 2):
                 with gr.Column():
                     with gr.Row():
                         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -405,6 +409,28 @@ def main():
             submission_result,
         )
 
+        # Backend status indicator
+        backend_status = gr.HTML(
+            value=get_backend_status_undefined_html(),
+            elem_id="backend-status-container",
+        )
+        # trigger button to bind the click event
+        status_trigger = gr.Button(elem_id="backend-status-trigger-btn", visible=False)
+        status_trigger.click(
+            fn=lambda: check_backend_health()[1],
+            inputs=None,
+            outputs=backend_status,
+        )
+        # load external JavaScript file
+        js_content = backend_status_js()
+        status_trigger_js_html = f'<script>{js_content}</script>'
+        gr.HTML(status_trigger_js_html, visible=False)
+        demo.load(
+            fn=lambda: check_backend_health()[1],
+            inputs=None,
+            outputs=backend_status,
+        )
+
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False):
                 _citation_button = gr.Textbox(
@@ -414,28 +440,6 @@ def main():
                     elem_id="citation-button",
                     show_copy_button=True,
                 )
-
-        # Backend status indicator
-        backend_status = gr.HTML(
-            value=get_backend_status_undefined_html(),
-            elem_id="backend-status-container",
-        )
-        # trigger button to bind the click event
-        status_trigger = gr.Button(elem_id="backend-status-trigger-btn", visible=False)
-        status_trigger.click(
-            fn=lambda: check_backend_health()[1],
-            inputs=None,
-            outputs=backend_status,
-        )
-        # load external JavaScript file
-        js_content = backend_status_js()
-        status_trigger_js_html = f'<script>{js_content}</script>'
-        gr.HTML(status_trigger_js_html, visible=False)
-        demo.load(
-            fn=lambda: check_backend_health()[1],
-            inputs=None,
-            outputs=backend_status,
-        )
     return demo
@@ -480,7 +484,7 @@ if __name__ == "__main__":
     def run_fastapi():
        host = settings.BACKEND_HOST
        port = settings.BACKEND_PORT
-        print(...)
+        print("Starting FastAPI server:")
        uvicorn.run(
            app,
            host=host,
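A minimal sketch of the closure-based column-filtering pattern that init_leaderboard_tabs uses above, using only gradio and pandas. build_demo, filter_cols, and the sample frame are illustrative names invented for this sketch, not helpers from the app:

import gradio as gr
import pandas as pd

def build_demo(df: pd.DataFrame) -> gr.Blocks:
    original_df = df.copy()  # captured by the closure, like original_df above

    def filter_cols(selected: list[str]) -> pd.DataFrame:
        base = [c for c in ['Model'] if c in original_df.columns]
        keep = base + [c for c in selected if c in original_df.columns and c not in base]
        return original_df.loc[:, keep]

    with gr.Blocks() as demo:
        choices = [c for c in df.columns if c != 'Model']
        boxes = gr.CheckboxGroup(choices=choices, value=choices, label='Select Columns to Display')
        table = gr.Dataframe(value=df)
        # re-filter the table whenever the checkbox selection changes
        boxes.change(fn=filter_cols, inputs=boxes, outputs=table)
    return demo

demo = build_demo(pd.DataFrame({'Model': ['org/model-a', 'org/model-b'], 'Acc': [51.2, 47.9]}))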
pyproject.toml CHANGED

@@ -1,7 +1,7 @@
 [project]
-name = "leaderboard"
+name = "easi-leaderboard"
 version = "0.1.0"
-description = "Leaderboard for ..."
+description = "Leaderboard for EASI: Holistic Evaluation and Analysis for Spatial Intelligence Made Easy"
 readme = "README.md"
 requires-python = ">=3.10,<3.11"
 
@@ -28,7 +28,9 @@ dependencies = [
     "fastapi>=0.120.0",
     "loguru>=0.7.3",
     "uvicorn>=0.38.0",
+    "tomli>=2.3.0 ; python_full_version < '3.11'",
+    "typing-extensions>=4.15.0",
 ]
 
 [dependency-groups]
-dev = ["ruff>=0.14.0,<0.15.0"]
+dev = ["ruff>=0.14.0,<0.15.0", "tabulate"]
requirements.txt CHANGED

@@ -21,3 +21,5 @@ rich>=14.2.0
 fastapi>=0.120.0
 loguru>=0.7.3
 uvicorn>=0.38.0
+tomli>=2.3.0; python_version < '3.11'
+typing_extensions>=4.15.0
src/about.py CHANGED

@@ -1,10 +1,15 @@
 from enum import Enum
+from functools import lru_cache
 from typing import Annotated
 
 from pydantic import BaseModel, Field
 
+from src.prepare import load_meta_toml, prepare_space
 
-class Task(BaseModel):
+prepare_space()
+
+
+class _Task(BaseModel):
     benchmark: Annotated[str, Field(description="The benchmark name")]
     metric: Annotated[str, Field(description="The metric name")]
     col_name: Annotated[str, Field(description="The column name")]
@@ -12,59 +17,64 @@ class Task(BaseModel):
 
 # Select your tasks here
 # ---------------------------------------------------
-class Tasks(Enum):
+class _Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
 
     # acc
-    task1_1 = Task(...)
-    task2_1 = Task(...)
-    task3_1 = Task(...)
-    task4_1 = Task(...)
-    task5_1 = Task(...)
-    task6_1 = Task(...)
-    task7_1 = Task(...)
-    task8_1 = Task(...)
+    task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
+    task2_1 = _Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
+    task3_1 = _Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
+    task4_1 = _Task(benchmark="Core", metric="acc", col_name="Core(acc)")
+    task5_1 = _Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
+    task6_1 = _Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
+    task7_1 = _Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
+    task8_1 = _Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
 
     # caa
-    task1_2 = Task(...)
-    task2_2 = Task(...)
-    task3_2 = Task(...)
-    task4_2 = Task(...)
-    task5_2 = Task(...)
-    task6_2 = Task(...)
-    task7_2 = Task(...)
-    task8_2 = Task(...)
+    task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
+    task2_2 = _Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
+    task3_2 = _Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
+    task4_2 = _Task(benchmark="Core", metric="caa", col_name="Core(caa)")
+    task5_2 = _Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
+    task6_2 = _Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
+    task7_2 = _Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
+    task8_2 = _Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
 
     # rand
-    task1_3 = Task(...)
-    task2_3 = Task(...)
-    task3_3 = Task(...)
-    task4_3 = Task(...)
-    task5_3 = Task(...)
-    task6_3 = Task(...)
-    task7_3 = Task(...)
-    task8_3 = Task(...)
+    task1_3 = _Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
+    task2_3 = _Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
+    task3_3 = _Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
+    task4_3 = _Task(benchmark="Core", metric="rand", col_name="Core(rand)")
+    task5_3 = _Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
+    task6_3 = _Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
+    task7_3 = _Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
+    task8_3 = _Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
+
 
-BENCHMARKS = {m.value.benchmark for m in Tasks}
-METRICS = {m.value.metric for m in Tasks}
-COL_NAMES = {m.value.col_name for m in Tasks}
+# BENCHMARKS = {m.value.benchmark for m in Tasks}
+# METRICS = {m.value.metric for m in Tasks}
+# COL_NAMES = {m.value.col_name for m in Tasks}
+@lru_cache(maxsize=1)
+def get_benchmarks():
+    meta_toml = load_meta_toml()
+    return sorted(meta_toml.benchmarks)
 
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">...</h1>"""
+TITLE = """<h1 align="center" id="space-title">EASI Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-...
+EASI: Holistic Evaluation and Analysis for Spatial Intelligence Made Easy
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = """
-## ...
+## Leaderboard
 
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
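get_benchmarks is wrapped in @lru_cache(maxsize=1), so the meta.toml parse runs once per process and every later caller gets the cached result. A standalone sketch of that pattern (expensive_load and its values are made up for illustration):

from functools import lru_cache

@lru_cache(maxsize=1)
def expensive_load() -> tuple[str, ...]:
    print('loading...')  # printed only on the first call
    return tuple(sorted(['MMSI', 'MindCube', 'STARE']))

assert expensive_load() is expensive_load()  # the second call hits the cache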
src/backend/config.py CHANGED

@@ -1,8 +1,9 @@
 from functools import cached_property
+from pathlib import Path
 from typing import Annotated
 
 from dotenv import load_dotenv
-from pydantic import Field, SecretStr
+from pydantic import Field, SecretStr, computed_field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 load_dotenv()
@@ -31,8 +32,26 @@ class Settings(BaseSettings):
         token=token,
     )
 
-    ...
-    ...
+    # Settings for Hugging Face repos
+    HF_OWNER: str = "lmms-lab"
+    HF_REPO_NAME: Annotated[str, Field(description="Name of leaderboard repo")] = "EASI-Leaderboard"
+    HF_RESULTS_REPO_NAME: Annotated[str, Field(description="Name of results repo")] = "EASI-Leaderboard-Results"
+    HF_REQUESTS_REPO_NAME: Annotated[str, Field(description="Name of requests repo")] = "EASI-Leaderboard-Requests"
+
+    @computed_field
+    @cached_property
+    def REPO_ID(self) -> str:
+        return (Path(self.HF_OWNER) / self.HF_REPO_NAME).as_posix()
+
+    @computed_field
+    @cached_property
+    def RESULTS_REPO_ID(self) -> str:
+        return (Path(self.HF_OWNER) / self.HF_RESULTS_REPO_NAME).as_posix()
+
+    @computed_field
+    @cached_property
+    def QUEUE_REPO_ID(self) -> str:
+        return (Path(self.HF_OWNER) / self.HF_REQUESTS_REPO_NAME).as_posix()
 
 
 settings = Settings()
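The REPO_ID-style properties assemble "owner/name" repo IDs with pathlib; that is purely string assembly, and an f-string would be equivalent, as this small check shows:

from pathlib import Path

owner, repo = 'lmms-lab', 'EASI-Leaderboard'
# Path joining plus as_posix() yields the same 'owner/name' form an f-string would
assert (Path(owner) / repo).as_posix() == f'{owner}/{repo}' == 'lmms-lab/EASI-Leaderboard'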
src/backend/routes/hf.py CHANGED

@@ -52,7 +52,7 @@ async def upload_file_content(
         path_or_fileobj=file_obj,
         path_in_repo=params.path_in_repo,
         commit_message=params.commit_message,
-        repo_id=settings.QUEUE_REPO,
+        repo_id=settings.QUEUE_REPO_ID,
         repo_type="dataset",
     )
     return ResponseData(data=data)
@@ -104,7 +104,7 @@ async def community_submit(
         path_or_fileobj=file_obj,
         path_in_repo=path_in_repo,
         commit_message=params.commit_message,
-        repo_id=settings.QUEUE_REPO,
+        repo_id=settings.QUEUE_REPO_ID,
         repo_type="dataset",
     )
     return ResponseData(data=data)
src/display/formatting.py CHANGED

@@ -9,6 +9,9 @@ def model_hyperlink(link: str, model_name: str) -> str:
 
 
 def make_clickable_model(model_name: str) -> str:
+    if "/" not in model_name:
+        # not a full model name, cannot be clicked
+        return model_name
     link = f"https://huggingface.co/{model_name}"
     return model_hyperlink(link, model_name)
 
@@ -26,7 +29,7 @@ def styled_message(message: str) -> str:
 
 
 def has_no_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
-    return df.loc[:, columns].notna().all(axis=1)
+    return df.loc[:, columns].notna().any(axis=1)
 
 
 def has_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
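Note that switching has_no_nan_values from .all(axis=1) to .any(axis=1) relaxes the row filter: a row now survives when at least one of the listed columns has a score, so the function name no longer matches its behavior exactly. The difference in one example:

import pandas as pd

df = pd.DataFrame({'A': [1.0, None], 'B': [None, None]})
print(df.notna().all(axis=1).tolist())  # [False, False] -- old: every column required
print(df.notna().any(axis=1).tolist())  # [True, False]  -- new: one scored column suffices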
src/display/utils.py CHANGED

@@ -9,7 +9,7 @@ from typing import Literal, Union
 from pydantic import BaseModel, ConfigDict, create_model
 from typing_extensions import Self
 
-from src.about import Tasks
+from src.about import get_benchmarks
 
 
 def fields(
@@ -33,6 +33,8 @@ class ColumnContent(BaseModel):
     hidden: bool = False
     never_hidden: bool = False
 
+    not_supported: bool = False  # for not supported columns, should not be displayed
+
     @classmethod
     def new(
         cls,
@@ -42,6 +44,7 @@ class ColumnContent(BaseModel):
         *,
         hidden: bool = False,
         never_hidden: bool = False,
+        not_supported: bool = False,
     ) -> Self:
         return cls(
             name=name,
@@ -49,6 +52,7 @@ class ColumnContent(BaseModel):
             displayed_by_default=displayed_by_default,
             hidden=hidden,
             never_hidden=never_hidden,
+            not_supported=not_supported,
         )
 
 
@@ -56,29 +60,34 @@ class _AutoEvalColumnBase(BaseModel):
     model_config: ConfigDict = ConfigDict(extra="forbid", frozen=True)
 
     model_type_symbol: ColumnContent = ColumnContent(
-        name="T", ...
+        name="T",
+        type="str",
+        displayed_by_default=True,
+        # never_hidden=True,
     )
     model: ColumnContent = ColumnContent.new("Model", "markdown", True, never_hidden=True)
     average: ColumnContent = ColumnContent.new("Average ⬆️", "number", True)
 
     model_type: ColumnContent = ColumnContent.new("Type", "str")
-    architecture: ColumnContent = ColumnContent.new("Architecture", "str")
+    architecture: ColumnContent = ColumnContent.new("Architecture", "str", not_supported=True)
     weight_type: ColumnContent = ColumnContent.new("Weight type", "str", hidden=True)
-    precision: ColumnContent = ColumnContent.new("Precision", "str")
-    license: ColumnContent = ColumnContent.new("Hub License", "str")
-    params: ColumnContent = ColumnContent.new("#Params (B)", "number")
-    likes: ColumnContent = ColumnContent.new("Hub ❤️", "number")
-    still_on_hub: ColumnContent = ColumnContent.new("Available on the hub", "bool")
-    revision: ColumnContent = ColumnContent.new("Model sha", "str")
+    precision: ColumnContent = ColumnContent.new("Precision", "str", not_supported=True)
+    license: ColumnContent = ColumnContent.new("Hub License", "str", not_supported=True)
+    params: ColumnContent = ColumnContent.new("#Params (B)", "number", not_supported=True)
+    likes: ColumnContent = ColumnContent.new("Hub ❤️", "number", not_supported=True)
+    still_on_hub: ColumnContent = ColumnContent.new("Available on the hub", "bool", not_supported=True)
+    revision: ColumnContent = ColumnContent.new("Model sha", "str", not_supported=True)
 
 
+BENCHMARKS = get_benchmarks()
+
 # We use create_model to dynamically fill the scores from Tasks
 field_definitions = {
-    task.value.col_name: (
+    task.key: (
         ColumnContent,
-        ColumnContent.new(task.value.col_name, "number", True),
+        ColumnContent.new(task.title, "number", True),
     )
-    for task in Tasks
+    for task in BENCHMARKS
 }
 AutoEvalColumnCls: type[_AutoEvalColumnBase] = create_model(  # pyright: ignore[reportCallIssue]
     '_AutoEvalColumnCls',
@@ -156,9 +165,11 @@ class Precision(Enum):
 
 
 # Column selection
-COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
+# COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
 BASE_COLS: list[str] = [c.name for c in fields(_AutoEvalColumnBase) if not c.hidden]
 EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
 EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
+NOT_SUPPORTED_COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if c.not_supported]
 
-BENCHMARK_COLS: list[str] = [t.value.col_name for t in Tasks]
+# BENCHMARK_COLS: list[str] = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS: list[str] = [t.title for t in BENCHMARKS]
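The field_definitions/create_model pair above builds one typed column field per benchmark at import time. A compact sketch of that pydantic pattern, with made-up field names and titles:

from pydantic import BaseModel, create_model

class Column(BaseModel):
    name: str
    type: str = 'number'

pairs = [('mmsi', 'MMSI'), ('stare', 'STARE')]  # (key, title) pairs, as in meta.toml
fields = {key: (Column, Column(name=title)) for key, title in pairs}
DynamicCols = create_model('DynamicCols', **fields)
print(DynamicCols().mmsi.name)  # 'MMSI'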
src/envs.py CHANGED

@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Annotated
 
 from huggingface_hub import HfApi
-from pydantic import Field, computed_field
+from pydantic import Field, SecretStr, computed_field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 # ----------------------------------
@@ -14,64 +14,64 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
     model_config = SettingsConfigDict(env_file=".env")
 
-    ...
+    HF_TOKEN: Annotated[SecretStr, Field(..., description="A read/write token for your org")]
 
-    # ...
-    ...
-    ...
-    ...
-    ]
-
-    BACKEND_HOST: Annotated[str, Field("127.0.0.1", description="Backend host")]
-    BACKEND_PORT: Annotated[int, Field(8000, description="Backend port")]
+    # Settings for Hugging Face repos
+    HF_OWNER: str = "lmms-lab"
+    HF_REPO_NAME: Annotated[str, Field(description="Name of leaderboard repo")] = "EASI-Leaderboard"
+    HF_RESULTS_REPO_NAME: Annotated[str, Field(description="Name of results repo")] = "EASI-Leaderboard-Results"
+    HF_REQUESTS_REPO_NAME: Annotated[str, Field(description="Name of requests repo")] = "EASI-Leaderboard-Requests"
 
     @computed_field
     @cached_property
     def REPO_ID(self) -> str:
-        return (Path(self....).as_posix()
+        return (Path(self.HF_OWNER) / self.HF_REPO_NAME).as_posix()
 
     @computed_field
     @cached_property
-    def ...
-        return (Path(self....).as_posix()
+    def RESULTS_REPO_ID(self) -> str:
+        return (Path(self.HF_OWNER) / self.HF_RESULTS_REPO_NAME).as_posix()
 
     @computed_field
     @cached_property
-    def ...
-        return (Path(self....).as_posix()
-
-    ...
-    ...
-    ...
-        Field(...
+    def QUEUE_REPO_ID(self) -> str:
+        return (Path(self.HF_OWNER) / self.HF_REQUESTS_REPO_NAME).as_posix()
+
+    HF_HOME: Annotated[
+        Path,
+        Field(
+            default_factory=lambda: Path(".").resolve(),
+            description="If you setup a cache later, just change `HF_HOME`",
+        ),
     ]
 
+    # Backend settings
+
+    BACKEND_HOST: Annotated[str, Field("127.0.0.1", description="Backend host")]
+    BACKEND_PORT: Annotated[int, Field(8000, description="Backend port")]
+
     # Local caches
 
     @computed_field
     @cached_property
     def EVAL_REQUESTS_PATH(self) -> str:
-        return (...
+        return (self.HF_HOME / "eval-queue").as_posix()
 
     @computed_field
     @cached_property
     def EVAL_RESULTS_PATH(self) -> str:
-        return (...
-
-    @computed_field
-    @cached_property
-    def EVAL_REQUESTS_PATH_BACKEND(self) -> str:
-        return (Path(self.CACHE_PATH) / "eval-queue-bk").as_posix()
+        return (self.HF_HOME / "eval-results").as_posix()
 
     @computed_field
     @cached_property
-    def ...
-        return (...
+    def EVAL_REQUESTS_PATH_BACKUP(self) -> str:
+        return (self.HF_HOME / "eval-queue-bk").as_posix()
 
     @computed_field
     @cached_property
-    def ...
-        return ...
+    def EVAL_RESULTS_PATH_BACKUP(self) -> str:
+        return (self.HF_HOME / "eval-results-bk").as_posix()
 
 
 settings = Settings()  # pyright: ignore[reportCallIssue]
-API = settings....
+API = HfApi(token=settings.HF_TOKEN.get_secret_value())
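HF_TOKEN is now typed as SecretStr, which is why the call sites gained .get_secret_value(): the raw token is masked in repr/printing and only exposed on explicit request. A tiny demonstration (the token value is made up):

from pydantic import SecretStr

token = SecretStr('hf_example')
print(token)                     # **********
print(token.get_secret_value())  # hf_example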
src/leaderboard/read_evals.py CHANGED

@@ -15,10 +15,14 @@ import numpy as np
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Self
 
+from src.about import get_benchmarks
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType
+from src.prepare import load_meta_toml
 from src.submission.check_validity import is_model_on_hub
 
+BENCHMARKS = get_benchmarks()
+
 
 class EvalResultJson(BaseModel):
     """Model of the eval result json file."""
@@ -34,8 +38,8 @@ class EvalResultJson_Config(BaseModel):
 
     model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)
 
-    model_dtype: Annotated[str, Field(..., description="The model precision. e.g. torch.bfloat16")]
     model_name: Annotated[str, Field(..., description="The model name. e.g. Qwen/Qwen2.5-3B")]
+    model_dtype: Annotated[str | None, Field(description="The model precision. e.g. torch.bfloat16")] = None
     model_sha: Annotated[str, Field(description="The model sha. e.g. 3aab1f1954e9cc14eb9509a215f9e5ca08227a9b")] = ""
     model_args: Annotated[str | None, Field(description="The model args.")] = None
 
@@ -70,6 +74,7 @@ class EvalResult(BaseModel):
         precision = Precision.from_str(config.model_dtype)
 
         # Get model and org
+
         org_and_model = config.model_name or config.model_args or ""
         org_and_model = org_and_model.split("/", 1)
 
@@ -83,6 +88,10 @@ class EvalResult(BaseModel):
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
+        meta_toml = load_meta_toml()
+        # update full_model from meta_toml if it exists
+        full_model = meta_toml.model_title_to_repo_id.get(full_model, full_model)
+
         still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.model_sha or "main", trust_remote_code=True, test_tokenizer=False
         )
@@ -94,16 +103,15 @@ class EvalResult(BaseModel):
 
         # Extract results available in this file (some results are split in several files)
         results: dict[str, float] = {}
-        for t in Tasks:
-            task = t.value
-
+        for task in BENCHMARKS:
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = ...
+            # TODO: support multiple metrics
+            accs = np.array([v.get("acc", None) for k, v in data.results.items() if task.key == k])
             if accs.size == 0 or any(acc is None for acc in accs):
                 continue
 
             mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+            results[task.title] = float(mean_acc)
 
         return cls.model_validate({
             "eval_name": result_key,
@@ -119,6 +127,8 @@ class EvalResult(BaseModel):
 
     def update_with_request_file(self, requests_path: str) -> None:
         """Finds the relevant request file for the current model and updates info with it"""
+        # TODO: do nothing for now
+        return
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
         try:
@@ -137,7 +147,7 @@ class EvalResult(BaseModel):
 
     def to_dict(self) -> dict:
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
+        average = sum(v for v in self.results.values() if v is not None) / len(BENCHMARKS)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -154,8 +164,8 @@ class EvalResult(BaseModel):
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        for task in BENCHMARKS:
+            data_dict[task.title] = self.results.get(task.title, None)
 
         return data_dict
 
@@ -181,8 +191,6 @@ def get_request_file_for_model(requests_path, model_name, precision) -> str:
 
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
-    from rich import print as rprint  # FIXME: DEBUG
-
     model_result_filepaths: list[str] = []
 
     for root, _, files in os.walk(results_path):
@@ -208,7 +216,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            results_loaded = {k: v for k, v in eval_result.results.items() if v is not None}
+            eval_results[eval_name].results.update(results_loaded)
         else:
             eval_results[eval_name] = eval_result
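The merge branch above for a repeated eval_name only copies non-None scores, so a result read from an earlier file is never clobbered by a metric missing from a later one. The scores here are illustrative:

stored = {'MMSI': 41.2, 'STARE': None}
loaded = {'MMSI': None, 'STARE': 57.0}
# only non-None values from the newly loaded file overwrite stored ones
stored.update({k: v for k, v in loaded.items() if v is not None})
print(stored)  # {'MMSI': 41.2, 'STARE': 57.0}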
|
src/populate.py CHANGED

@@ -60,7 +60,7 @@ def get_leaderboard_df(
     df = df.loc[:, cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    df = df.loc[has_no_nan_values(df, benchmark_cols), :]
     return df
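For a boolean mask, df[mask] and df.loc[mask, :] select the same rows; the .loc form the commit switches to is just explicit that the mask applies to the row axis:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3]})
mask = df['x'] > 1
assert df[mask].equals(df.loc[mask, :])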
|
src/prepare.py ADDED

@@ -0,0 +1,137 @@
+import os
+import sys
+from functools import cached_property, lru_cache
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+from loguru import logger
+from pydantic import BaseModel, ConfigDict
+from typing_extensions import Self
+
+from src.envs import API, settings
+
+if sys.version_info < (3, 11):
+    from tomli import load as toml_load
+else:
+    from tomllib import load as toml_load
+
+PREPARED_FLAG: bool = os.getenv("NO_DOWNLOAD", 0) == 1
+
+
+def prepare_space():
+    """Space initialisation"""
+
+    def _restart_space():
+        API.restart_space(repo_id=settings.REPO_ID)
+
+    global PREPARED_FLAG
+    if not PREPARED_FLAG:
+        try:
+            snapshot_download(
+                repo_id=settings.QUEUE_REPO_ID,
+                local_dir=settings.EVAL_REQUESTS_PATH,
+                repo_type="dataset",
+                tqdm_class=None,
+                etag_timeout=30,
+                token=settings.HF_TOKEN.get_secret_value(),
+            )
+        except Exception as e:
+            logger.error(f"Error downloading eval queue: {e!s}")
+            _restart_space()
+        try:
+            snapshot_download(
+                repo_id=settings.RESULTS_REPO_ID,
+                local_dir=settings.EVAL_RESULTS_PATH,
+                repo_type="dataset",
+                tqdm_class=None,
+                etag_timeout=30,
+                allow_patterns=["leaderboard/*.toml", "leaderboard/**/*.json"],
+                token=settings.HF_TOKEN.get_secret_value(),
+            )
+        except Exception as e:
+            logger.error(f"Error downloading eval queue: {e!s}")
+            _restart_space()
+        PREPARED_FLAG = True
+
+    load_meta_toml()
+
+
+class MetaToml(BaseModel):
+    model_config = ConfigDict(extra="allow", frozen=True)
+
+    models: list["MetaToml_Model"]
+    benchmarks: list["MetaToml_Benchmark"]
+    model_repos: list["MetaToml_ModelRepo"]
+
+    @cached_property
+    def model_title_to_key(self) -> dict[str, str]:
+        return {model.title: model.key for model in self.models}
+
+    @cached_property
+    def benchmark_title_to_key(self) -> dict[str, str]:
+        return {benchmark.title: benchmark.key for benchmark in self.benchmarks}
+
+    @cached_property
+    def model_key_to_repo_id(self) -> dict[str, str]:
+        return {model.key: model.repo_id for model in self.model_repos if model.repo_id is not None}
+
+    @cached_property
+    def model_title_to_repo_id(self) -> dict[str, str]:
+        mapping: dict[str, str] = {}
+        for model in self.models:
+            model_key = self.model_title_to_key.get(model.title)
+            if model_key:
+                model_repo_id = self.model_key_to_repo_id.get(model_key)
+                if model_repo_id:
+                    mapping[model.title] = model_repo_id
+        return mapping
+
+
+class _HashableComparableMixin(BaseModel):
+    model_config = ConfigDict(extra="allow", frozen=True)
+
+    key: str
+    title: str
+
+    def __hash__(self) -> int:
+        return hash(self.key)
+
+    def __eq__(self, other: Self) -> bool:
+        return (self.key, self.title) == (other.key, other.title)
+
+    def __lt__(self, other: Self) -> bool:
+        return (self.key, self.title) < (other.key, other.title)
+
+    def __gt__(self, other: Self) -> bool:
+        return (self.key, self.title) > (other.key, other.title)
+
+    def __le__(self, other: Self) -> bool:
+        return (self.key, self.title) <= (other.key, other.title)
+
+    def __ge__(self, other: Self) -> bool:
+        return (self.key, self.title) >= (other.key, other.title)
+
+
+class MetaToml_Benchmark(_HashableComparableMixin): ...
+
+
+class MetaToml_Model(_HashableComparableMixin): ...
+
+
+class MetaToml_ModelRepo(BaseModel):
+    model_config = ConfigDict(extra="allow", frozen=True)
+
+    key: str
+    repo_id: str | None = None
+
+
+@lru_cache(maxsize=1)
+def load_meta_toml() -> MetaToml:
+    meta_toml_path = Path(settings.EVAL_RESULTS_PATH) / "leaderboard" / "meta.toml"
+    logger.info(f'Loading meta.toml from: {meta_toml_path.as_posix()!r}')
+    with meta_toml_path.open("rb") as f:
+        data = toml_load(f)
+    meta_toml = MetaToml.model_validate(data)
+    logger.info("Loaded meta.toml")
+    assert meta_toml is not None, f"Failed to load meta.toml: {meta_toml_path.as_posix()!r}"
+    return meta_toml
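One caveat in the new module: os.getenv returns a string (or the given default object), so `os.getenv("NO_DOWNLOAD", 0) == 1` is always False and PREPARED_FLAG starts False regardless of the environment, meaning the downloads always run on the first prepare_space() call. A string comparison would honor the variable; a minimal sketch of that fix:

import os

# compare strings, not ints, when reading environment flags
PREPARED_FLAG: bool = os.getenv('NO_DOWNLOAD', '0') == '1'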
src/submission/submit.py CHANGED

@@ -53,14 +53,14 @@ def add_new_eval(
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error, _ = is_model_on_hub(
-            model_name=base_model, revision=revision, token=settings.TOKEN, test_tokenizer=True
+            model_name=base_model, revision=revision, token=settings.HF_TOKEN.get_secret_value(), test_tokenizer=True
         )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
         model_on_hub, error, _ = is_model_on_hub(
-            model_name=model, revision=revision, token=settings.TOKEN, test_tokenizer=True
+            model_name=model, revision=revision, token=settings.HF_TOKEN.get_secret_value(), test_tokenizer=True
         )
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
@@ -117,7 +117,7 @@ def add_new_eval(
     API.upload_file(
         path_or_fileobj=out_path,
         path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=settings.QUEUE_REPO,
+        repo_id=settings.QUEUE_REPO_ID,
         repo_type="dataset",
         commit_message=f"Add {model} to eval queue",
     )
|
uv.lock
CHANGED
|
@@ -356,6 +356,76 @@ wheels = [
|
|
| 356 |
{ url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" },
|
| 357 |
]
|
| 358 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
[[package]]
|
| 360 |
name = "exceptiongroup"
|
| 361 |
version = "1.3.0"
|
|
@@ -667,68 +737,6 @@ wheels = [
|
|
| 667 |
{ url = "https://files.pythonhosted.org/packages/f9/1c/5d4d468fb16f8410e596ed0eac02d2c68752aa7dc92997fe9d60a7147665/kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb", size = 73744, upload-time = "2025-08-10T21:27:42.254Z" },
|
| 668 |
]
|
| 669 |
|
| 670 |
-
[[package]]
|
| 671 |
-
name = "leaderboard"
|
| 672 |
-
version = "0.1.0"
|
| 673 |
-
source = { virtual = "." }
|
| 674 |
-
dependencies = [
|
| 675 |
-
{ name = "apscheduler" },
|
| 676 |
-
{ name = "datasets" },
|
| 677 |
-
{ name = "fastapi" },
|
| 678 |
-
{ name = "gradio", extra = ["oauth"] },
|
| 679 |
-
{ name = "gradio-client" },
|
| 680 |
-
{ name = "gradio-leaderboard" },
|
| 681 |
-
{ name = "huggingface-hub" },
|
| 682 |
-
{ name = "loguru" },
|
| 683 |
-
{ name = "matplotlib" },
|
| 684 |
-
{ name = "numpy" },
|
| 685 |
-
{ name = "pandas" },
|
| 686 |
-
{ name = "pydantic" },
|
| 687 |
-
{ name = "pydantic-settings" },
|
| 688 |
-
{ name = "python-dateutil" },
|
| 689 |
-
{ name = "python-dotenv" },
|
| 690 |
-
{ name = "rich" },
|
| 691 |
-
{ name = "sentencepiece" },
|
| 692 |
-
{ name = "tokenizers" },
|
| 693 |
-
{ name = "tqdm" },
|
| 694 |
-
{ name = "transformers" },
|
| 695 |
-
{ name = "uvicorn" },
|
| 696 |
-
]
|
| 697 |
-
|
| 698 |
-
[package.dev-dependencies]
|
| 699 |
-
dev = [
|
| 700 |
-
{ name = "ruff" },
|
| 701 |
-
]
|
| 702 |
-
|
| 703 |
-
[package.metadata]
|
| 704 |
-
requires-dist = [
|
| 705 |
-
{ name = "apscheduler" },
|
| 706 |
-
{ name = "datasets" },
|
| 707 |
-
{ name = "fastapi", specifier = ">=0.120.0" },
|
| 708 |
-
{ name = "gradio" },
|
| 709 |
-
{ name = "gradio", extras = ["oauth"] },
|
| 710 |
-
{ name = "gradio-client" },
|
| 711 |
-
{ name = "gradio-leaderboard", specifier = "==0.0.13" },
|
| 712 |
-
{ name = "huggingface-hub", specifier = ">=0.18.0" },
|
| 713 |
-
{ name = "loguru", specifier = ">=0.7.3" },
|
| 714 |
-
{ name = "matplotlib" },
|
| 715 |
-
{ name = "numpy" },
|
| 716 |
-
{ name = "pandas" },
|
| 717 |
-
{ name = "pydantic", specifier = ">=2.11.10" },
|
| 718 |
-
{ name = "pydantic-settings", specifier = ">=2.11.0" },
|
| 719 |
-
{ name = "python-dateutil" },
|
| 720 |
-
{ name = "python-dotenv", specifier = ">=1.2.1" },
|
| 721 |
-
{ name = "rich", specifier = ">=14.2.0" },
|
| 722 |
-
{ name = "sentencepiece" },
|
| 723 |
-
{ name = "tokenizers", specifier = ">=0.15.0" },
|
| 724 |
-
{ name = "tqdm" },
|
| 725 |
-
{ name = "transformers" },
|
| 726 |
-
{ name = "uvicorn", specifier = ">=0.38.0" },
|
| 727 |
-
]
|
| 728 |
-
|
| 729 |
-
[package.metadata.requires-dev]
|
| 730 |
-
dev = [{ name = "ruff", specifier = ">=0.14.0,<0.15.0" }]
|
| 731 |
-
|
| 732 |
[[package]]
|
| 733 |
name = "loguru"
|
| 734 |
version = "0.7.3"
|
|
@@ -1324,6 +1332,15 @@ wheels = [
|
|
| 1324 |
{ url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
|
| 1325 |
]
|
| 1326 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1327 |
[[package]]
|
| 1328 |
name = "tokenizers"
|
| 1329 |
version = "0.22.1"
|
|
@@ -1349,6 +1366,15 @@ wheels = [
|
|
| 1349 |
{ url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" },
|
| 1350 |
]
|
| 1351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1352 |
[[package]]
|
| 1353 |
name = "tomlkit"
|
| 1354 |
version = "0.13.3"
|
|
|
|
| 356 |
{ url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" },
|
| 357 |
]
|
| 358 |
|
| 359 |
+
[[package]]
|
| 360 |
+
name = "easi-leaderboard"
|
| 361 |
+
version = "0.1.0"
|
| 362 |
+
source = { virtual = "." }
|
| 363 |
+
dependencies = [
|
| 364 |
+
{ name = "apscheduler" },
|
| 365 |
+
{ name = "datasets" },
|
| 366 |
+
{ name = "fastapi" },
|
| 367 |
+
{ name = "gradio", extra = ["oauth"] },
|
| 368 |
+
{ name = "gradio-client" },
|
| 369 |
+
{ name = "gradio-leaderboard" },
|
| 370 |
+
{ name = "huggingface-hub" },
|
| 371 |
+
{ name = "loguru" },
|
| 372 |
+
{ name = "matplotlib" },
|
| 373 |
+
{ name = "numpy" },
|
| 374 |
+
{ name = "pandas" },
|
| 375 |
+
{ name = "pydantic" },
|
| 376 |
+
{ name = "pydantic-settings" },
|
| 377 |
+
{ name = "python-dateutil" },
|
| 378 |
+
{ name = "python-dotenv" },
|
| 379 |
+
{ name = "rich" },
|
| 380 |
+
{ name = "sentencepiece" },
|
| 381 |
+
{ name = "tokenizers" },
|
| 382 |
+
{ name = "tomli" },
|
| 383 |
+
{ name = "tqdm" },
|
| 384 |
+
{ name = "transformers" },
|
| 385 |
+
{ name = "typing-extensions" },
|
| 386 |
+
{ name = "uvicorn" },
|
| 387 |
+
]
|
| 388 |
+
|
| 389 |
+
[package.dev-dependencies]
|
| 390 |
+
dev = [
|
| 391 |
+
{ name = "ruff" },
|
| 392 |
+
{ name = "tabulate" },
|
| 393 |
+
]
|
| 394 |
+
|
| 395 |
+
[package.metadata]
|
| 396 |
+
requires-dist = [
|
| 397 |
+
{ name = "apscheduler" },
|
| 398 |
+
{ name = "datasets" },
|
| 399 |
+
{ name = "fastapi", specifier = ">=0.120.0" },
|
| 400 |
+
{ name = "gradio" },
|
| 401 |
+
{ name = "gradio", extras = ["oauth"] },
|
| 402 |
+
{ name = "gradio-client" },
|
| 403 |
+
{ name = "gradio-leaderboard", specifier = "==0.0.13" },
|
| 404 |
+
{ name = "huggingface-hub", specifier = ">=0.18.0" },
|
| 405 |
+
{ name = "loguru", specifier = ">=0.7.3" },
|
| 406 |
+
{ name = "matplotlib" },
|
| 407 |
+
{ name = "numpy" },
|
| 408 |
+
{ name = "pandas" },
|
| 409 |
+
{ name = "pydantic", specifier = ">=2.11.10" },
|
| 410 |
+
{ name = "pydantic-settings", specifier = ">=2.11.0" },
|
| 411 |
+
{ name = "python-dateutil" },
|
| 412 |
+
{ name = "python-dotenv", specifier = ">=1.2.1" },
|
| 413 |
+
{ name = "rich", specifier = ">=14.2.0" },
|
| 414 |
+
{ name = "sentencepiece" },
|
| 415 |
+
{ name = "tokenizers", specifier = ">=0.15.0" },
|
| 416 |
+
{ name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.3.0" },
|
| 417 |
+
{ name = "tqdm" },
|
| 418 |
+
{ name = "transformers" },
|
| 419 |
+
{ name = "typing-extensions", specifier = ">=4.15.0" },
|
| 420 |
+
{ name = "uvicorn", specifier = ">=0.38.0" },
|
| 421 |
+
]
|
| 422 |
+
|
| 423 |
+
[package.metadata.requires-dev]
|
| 424 |
+
dev = [
|
| 425 |
+
{ name = "ruff", specifier = ">=0.14.0,<0.15.0" },
|
| 426 |
+
{ name = "tabulate" },
|
| 427 |
+
]
|
| 428 |
+
|
| 429 |
[[package]]
|
| 430 |
name = "exceptiongroup"
|
| 431 |
version = "1.3.0"
|
|
|
|
| 737 |
{ url = "https://files.pythonhosted.org/packages/f9/1c/5d4d468fb16f8410e596ed0eac02d2c68752aa7dc92997fe9d60a7147665/kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb", size = 73744, upload-time = "2025-08-10T21:27:42.254Z" },
|
| 738 |
]
|
| 739 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
[[package]]
|
| 741 |
name = "loguru"
|
| 742 |
version = "0.7.3"
|
|
|
|
| 1332 |
{ url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
|
| 1333 |
]
|
| 1334 |
|
| 1335 |
+
[[package]]
|
| 1336 |
+
name = "tabulate"
|
| 1337 |
+
version = "0.9.0"
|
| 1338 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1339 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
|
| 1340 |
+
wheels = [
|
| 1341 |
+
{ url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
|
| 1342 |
+
]
|
| 1343 |
+
|
| 1344 |
[[package]]
|
| 1345 |
name = "tokenizers"
|
| 1346 |
version = "0.22.1"
|
|
|
|
| 1366 |
{ url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" },
|
| 1367 |
]
|
| 1368 |
|
| 1369 |
+
[[package]]
|
| 1370 |
+
name = "tomli"
|
| 1371 |
+
version = "2.3.0"
|
| 1372 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1373 |
+
sdist = { url = "https://files.pythonhosted.org/packages/52/ed/3f73f72945444548f33eba9a87fc7a6e969915e7b1acc8260b30e1f76a2f/tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549", size = 17392, upload-time = "2025-10-08T22:01:47.119Z" }
|
| 1374 |
+
wheels = [
|
| 1375 |
+
{ url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408, upload-time = "2025-10-08T22:01:46.04Z" },
|
| 1376 |
+
]
|
| 1377 |
+
|
| 1378 |
[[package]]
|
| 1379 |
name = "tomlkit"
|
| 1380 |
version = "0.13.3"
|
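The tomli dependency carries the marker python_full_version < '3.11' because Python 3.11 ships an API-compatible parser as tomllib in the standard library, so the backport is only installed on older interpreters. The conventional import shim looks like this (a generic sketch; the diff does not show where this project reads TOML):

    import sys

    if sys.version_info >= (3, 11):
        import tomllib  # stdlib TOML parser, available since Python 3.11
    else:
        import tomli as tomllib  # drop-in backport for Python < 3.11

    # Both parsers expect a binary file handle.
    with open("pyproject.toml", "rb") as f:
        data = tomllib.load(f)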