Commit c85dcc4 · committed by yangzhitao · 1 Parent(s): d1fd905

feat: enhance leaderboard functionality and refactor app structure; update token handling and project metadata; update environment settings

.env.example CHANGED
@@ -1,2 +1,6 @@
  HF_TOKEN=changethis
  HF_HOME=.
+ HF_OWNER=lmms-lab
+ HF_REPO_NAME=EASI-Leaderboard
+ HF_RESULTS_REPO_NAME=EASI-Leaderboard-Results
+ HF_REQUESTS_REPO_NAME=EASI-Leaderboard-Requests
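
The four new variables map one-to-one onto the fields added to `Settings` in src/envs.py below. A minimal sketch of how they resolve into repo IDs, assuming `HF_TOKEN` and the other variables are set in `.env` (the names shown are this commit's defaults):

```python
from src.envs import settings  # pydantic-settings reads .env on import

# Derived repo IDs combine HF_OWNER with the three repo-name fields.
print(settings.REPO_ID)          # "lmms-lab/EASI-Leaderboard"
print(settings.RESULTS_REPO_ID)  # "lmms-lab/EASI-Leaderboard-Results"
print(settings.QUEUE_REPO_ID)    # "lmms-lab/EASI-Leaderboard-Requests"
```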
.vscode/cspell.json CHANGED
@@ -3,8 +3,10 @@
  "accs",
  "changethis",
  "checkboxgroup",
+ "EASI",
  "evals",
  "initialisation",
+ "lmms",
  "modelcard",
  "sentencepiece"
  ]
app.py CHANGED
@@ -6,17 +6,16 @@ import pandas as pd
  import requests
  import uvicorn
  from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download
  from rich import print

  from src.about import (
-     BENCHMARKS,
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
      EVALUATION_QUEUE_TEXT,
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
+     get_benchmarks,
  )
  from src.backend.app import create_app
  from src.display.css_html_js import (
@@ -28,9 +27,9 @@ from src.display.css_html_js import (
  from src.display.utils import (
      BASE_COLS,
      BENCHMARK_COLS,
-     COLS,
      EVAL_COLS,
      EVAL_TYPES,
+     NOT_SUPPORTED_COLS,
      AutoEvalColumn,
      ModelType,
      Precision,
@@ -38,8 +37,12 @@ from src.display.utils import (
  )
  from src.envs import API, settings
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.prepare import prepare_space
  from src.submission.submit import add_new_eval

+ prepare_space()
+ BENCHMARKS = get_benchmarks()
+

  def restart_space():
      API.restart_space(repo_id=settings.REPO_ID)
@@ -47,37 +50,13 @@ def restart_space():

  print("///// --- Settings --- /////", settings.model_dump())

- # Space initialisation
- try:
-     snapshot_download(
-         repo_id=settings.QUEUE_REPO,
-         local_dir=settings.EVAL_REQUESTS_PATH,
-         repo_type="dataset",
-         tqdm_class=None,
-         etag_timeout=30,
-         token=settings.TOKEN,
-     )
- except Exception:
-     restart_space()
- try:
-     snapshot_download(
-         repo_id=settings.RESULTS_REPO,
-         local_dir=settings.EVAL_RESULTS_PATH,
-         repo_type="dataset",
-         tqdm_class=None,
-         etag_timeout=30,
-         token=settings.TOKEN,
-     )
- except Exception:
-     restart_space()
-
- LEADERBOARD_DF = get_leaderboard_df(
-     settings.EVAL_RESULTS_PATH,
-     settings.EVAL_REQUESTS_PATH,
-     COLS,
-     BENCHMARK_COLS,
- )
+ # LEADERBOARD_DF = get_leaderboard_df(
+ #     settings.EVAL_RESULTS_PATH,
+ #     settings.EVAL_REQUESTS_PATH,
+ #     COLS,
+ #     BENCHMARK_COLS,
+ # )

  (
      finished_eval_queue_df,
@@ -90,8 +69,9 @@ def filter_dataframe_by_columns(selected_cols: list[str], original_df: pd.DataFrame):
      """
      Filter the DataFrame by the selected columns
      """
-     # Always include the base columns 'T' and 'Model'
-     base_cols = ['T', 'Model']
+     # # Always include the base columns 'T' and 'Model'
+     # base_cols = ['T', 'Model']
+     base_cols = ['Model']
      all_selected_cols = [col for col in base_cols if col in original_df.columns]

      # Add the user-selected columns (excluding base columns already present)
@@ -175,7 +155,11 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFrame:
      return filtered_df


- def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
+ def init_leaderboard_tabs(
+     dataframe: pd.DataFrame,
+     cols: list[str],
+     not_supported_cols: list[str],
+ ):
      # Keep the original DataFrame for later filtering (captured in a closure)
      original_df = dataframe.copy()

@@ -187,7 +171,9 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
      )

      # Initialize the displayed columns (base columns plus the default selection)
-     default_selected = [col for col in dataframe.columns if col in cols] + ['Average ⬆️']
+     default_selected = [col for col in dataframe.columns if col in cols and col not in not_supported_cols] + [
+         'Average ⬆️'
+     ]

      # First filter original_df by precision
      precision_filtered_df = filter_dataframe_by_precision(default_precision, original_df)
@@ -197,8 +183,13 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
      with gr.Row():
          with gr.Column(scale=1):
              search = gr.Textbox(label="Search", placeholder="Separate multiple queries with commas")
+             column_choices = [
+                 col
+                 for col in dataframe.columns
+                 if col not in ['T', 'Model'] and (not not_supported_cols or col not in not_supported_cols)
+             ]
              show_columns = gr.CheckboxGroup(
-                 choices=[col for col in dataframe.columns if col not in ['T', 'Model']],
+                 choices=column_choices,
                  label="Select Columns to Display",
                  value=default_selected,
                  interactive=True,
@@ -271,24 +262,37 @@ def main():
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as _tabs:
-         for i, benchmark in enumerate[str](sorted(BENCHMARKS)):
-             with gr.TabItem(f"🏅 {benchmark}", elem_id="llm-benchmark-tab-table", id=i):
-                 benchmark_cols = [
-                     BENCHMARK_COL for BENCHMARK_COL in BENCHMARK_COLS if BENCHMARK_COL.startswith(benchmark)
-                 ]
-                 cols = BASE_COLS + benchmark_cols
-                 BENCHMARK_DF = get_leaderboard_df(
-                     settings.EVAL_RESULTS_PATH,
-                     settings.EVAL_REQUESTS_PATH,
-                     cols,
-                     benchmark_cols,
-                 )
-                 _leaderboard = init_leaderboard_tabs(BENCHMARK_DF, benchmark_cols)
+         with gr.TabItem("📝 Overview", elem_id="benchmark-overview-tab", id=0):
+             benchmark_cols = BENCHMARK_COLS.copy()
+             print("benchmark_cols:", benchmark_cols)
+             cols = BASE_COLS + benchmark_cols
+             benchmark_df = get_leaderboard_df(
+                 settings.EVAL_RESULTS_PATH,
+                 settings.EVAL_REQUESTS_PATH,
+                 cols,
+                 benchmark_cols,
+             )
+             _leaderboard = init_leaderboard_tabs(benchmark_df, benchmark_cols, NOT_SUPPORTED_COLS)
+
+         i_bench = 1
+         if False:
+             for i_bench, benchmark in enumerate(sorted(BENCHMARKS), start=1):
+                 with gr.TabItem(f"🏅 {benchmark.title}", elem_id="llm-benchmark-tab-table", id=i_bench):
+                     print(f"benchmark.title: {benchmark.title!r}")
+                     benchmark_cols = [col for col in BENCHMARK_COLS if col.startswith(benchmark.title)]
+                     cols = BASE_COLS + benchmark_cols
+                     benchmark_df = get_leaderboard_df(
+                         settings.EVAL_RESULTS_PATH,
+                         settings.EVAL_REQUESTS_PATH,
+                         cols,
+                         benchmark_cols,
+                     )
+                     _leaderboard = init_leaderboard_tabs(benchmark_df, benchmark_cols, NOT_SUPPORTED_COLS)

-         with gr.TabItem("📝 About", elem_id="about-tab", id=len(BENCHMARKS)):
+         with gr.TabItem("📝 About", elem_id="about-tab", id=i_bench + 1):
              gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-         with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=len(BENCHMARKS) + 1):
+         with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=i_bench + 2):
              with gr.Column():
                  with gr.Row():
                      gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -405,6 +409,28 @@ def main():
          submission_result,
      )

+     # Backend status indicator
+     backend_status = gr.HTML(
+         value=get_backend_status_undefined_html(),
+         elem_id="backend-status-container",
+     )
+     # trigger button to bind the click event
+     status_trigger = gr.Button(elem_id="backend-status-trigger-btn", visible=False)
+     status_trigger.click(
+         fn=lambda: check_backend_health()[1],
+         inputs=None,
+         outputs=backend_status,
+     )
+     # load external JavaScript file
+     js_content = backend_status_js()
+     status_trigger_js_html = f'<script>{js_content}</script>'
+     gr.HTML(status_trigger_js_html, visible=False)
+     demo.load(
+         fn=lambda: check_backend_health()[1],
+         inputs=None,
+         outputs=backend_status,
+     )
+
      with gr.Row():
          with gr.Accordion("📙 Citation", open=False):
              _citation_button = gr.Textbox(
@@ -414,28 +440,6 @@ def main():
                  elem_id="citation-button",
                  show_copy_button=True,
              )
-
-     # Backend status indicator
-     backend_status = gr.HTML(
-         value=get_backend_status_undefined_html(),
-         elem_id="backend-status-container",
-     )
-     # trigger button to bind the click event
-     status_trigger = gr.Button(elem_id="backend-status-trigger-btn", visible=False)
-     status_trigger.click(
-         fn=lambda: check_backend_health()[1],
-         inputs=None,
-         outputs=backend_status,
-     )
-     # load external JavaScript file
-     js_content = backend_status_js()
-     status_trigger_js_html = f'<script>{js_content}</script>'
-     gr.HTML(status_trigger_js_html, visible=False)
-     demo.load(
-         fn=lambda: check_backend_health()[1],
-         inputs=None,
-         outputs=backend_status,
-     )
      return demo


@@ -480,7 +484,7 @@ if __name__ == "__main__":
      def run_fastapi():
          host = settings.BACKEND_HOST
          port = settings.BACKEND_PORT
-         print(f"Starting FastAPI server on http://{host}:{port}")
+         print("Starting FastAPI server:")
          uvicorn.run(
              app,
              host=host,
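
The column plumbing in `init_leaderboard_tabs` is easier to see in isolation. A toy sketch of how `default_selected` and `column_choices` are derived, with made-up column names standing in for the real `BENCHMARK_COLS` and `NOT_SUPPORTED_COLS` from src/display/utils.py:

```python
import pandas as pd

# Stand-ins for the values app.py imports from src.display.utils.
BENCHMARK_COLS = ["MindCube(acc)", "MMSI(acc)"]
NOT_SUPPORTED_COLS = ["Architecture", "Precision"]

df = pd.DataFrame(columns=["T", "Model", "Average ⬆️", *BENCHMARK_COLS, *NOT_SUPPORTED_COLS])

# Default selection: supported benchmark columns that exist in the frame, plus the average.
default_selected = [c for c in df.columns if c in BENCHMARK_COLS and c not in NOT_SUPPORTED_COLS] + ["Average ⬆️"]

# CheckboxGroup choices: everything except the pinned base columns and unsupported columns.
column_choices = [
    c for c in df.columns
    if c not in ["T", "Model"] and (not NOT_SUPPORTED_COLS or c not in NOT_SUPPORTED_COLS)
]

print(default_selected)  # ['MindCube(acc)', 'MMSI(acc)', 'Average ⬆️']
print(column_choices)    # ['Average ⬆️', 'MindCube(acc)', 'MMSI(acc)']
```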
pyproject.toml CHANGED
@@ -1,7 +1,7 @@
  [project]
- name = "leaderboard"
+ name = "easi-leaderboard"
  version = "0.1.0"
- description = "Leaderboard for Benchmarking LLMs"
+ description = "Leaderboard for EASI: Holistic Evaluation and Analysis for Spatial Intelligence Made Easy"
  readme = "README.md"
  requires-python = ">=3.10,<3.11"

@@ -28,7 +28,9 @@ dependencies = [
  "fastapi>=0.120.0",
  "loguru>=0.7.3",
  "uvicorn>=0.38.0",
+ "tomli>=2.3.0 ; python_full_version < '3.11'",
+ "typing-extensions>=4.15.0",
  ]

  [dependency-groups]
- dev = ["ruff>=0.14.0,<0.15.0"]
+ dev = ["ruff>=0.14.0,<0.15.0", "tabulate"]
requirements.txt CHANGED
@@ -21,3 +21,5 @@ rich>=14.2.0
  fastapi>=0.120.0
  loguru>=0.7.3
  uvicorn>=0.38.0
+ tomli>=2.3.0; python_version < '3.11'
+ typing_extensions>=4.15.0
src/about.py CHANGED
@@ -1,10 +1,15 @@
  from enum import Enum
+ from functools import lru_cache
  from typing import Annotated

  from pydantic import BaseModel, Field

+ from src.prepare import load_meta_toml, prepare_space
+
+ prepare_space()
+
+
- class Task(BaseModel):
+ class _Task(BaseModel):
      benchmark: Annotated[str, Field(description="The benchmark name")]
      metric: Annotated[str, Field(description="The metric name")]
      col_name: Annotated[str, Field(description="The column name")]
@@ -12,59 +17,64 @@ class Task(BaseModel):

  # Select your tasks here
  # ---------------------------------------------------
- class Tasks(Enum):
+ class _Tasks(Enum):
      # task_key in the json file, metric_key in the json file, name to display in the leaderboard

      # acc
-     task1_1 = Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
-     task2_1 = Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
-     task3_1 = Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
-     task4_1 = Task(benchmark="Core", metric="acc", col_name="Core(acc)")
-     task5_1 = Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
-     task6_1 = Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
-     task7_1 = Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
-     task8_1 = Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
+     task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
+     task2_1 = _Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
+     task3_1 = _Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
+     task4_1 = _Task(benchmark="Core", metric="acc", col_name="Core(acc)")
+     task5_1 = _Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
+     task6_1 = _Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
+     task7_1 = _Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
+     task8_1 = _Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")

      # caa
-     task1_2 = Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
-     task2_2 = Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
-     task3_2 = Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
-     task4_2 = Task(benchmark="Core", metric="caa", col_name="Core(caa)")
-     task5_2 = Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
-     task6_2 = Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
-     task7_2 = Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
-     task8_2 = Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
+     task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
+     task2_2 = _Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
+     task3_2 = _Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
+     task4_2 = _Task(benchmark="Core", metric="caa", col_name="Core(caa)")
+     task5_2 = _Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
+     task6_2 = _Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
+     task7_2 = _Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
+     task8_2 = _Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")

      # rand
-     task1_3 = Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
-     task2_3 = Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
-     task3_3 = Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
-     task4_3 = Task(benchmark="Core", metric="rand", col_name="Core(rand)")
-     task5_3 = Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
-     task6_3 = Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
-     task7_3 = Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
-     task8_3 = Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
+     task1_3 = _Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
+     task2_3 = _Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
+     task3_3 = _Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
+     task4_3 = _Task(benchmark="Core", metric="rand", col_name="Core(rand)")
+     task5_3 = _Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
+     task6_3 = _Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
+     task7_3 = _Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
+     task8_3 = _Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
+

- BENCHMARKS = {m.value.benchmark for m in Tasks}
- METRICS = {m.value.metric for m in Tasks}
- COL_NAMES = {m.value.col_name for m in Tasks}
+ # BENCHMARKS = {m.value.benchmark for m in Tasks}
+ # METRICS = {m.value.metric for m in Tasks}
+ # COL_NAMES = {m.value.col_name for m in Tasks}
+ @lru_cache(maxsize=1)
+ def get_benchmarks():
+     meta_toml = load_meta_toml()
+     return sorted(meta_toml.benchmarks)

  NUM_FEWSHOT = 0  # Change with your few shot
  # ---------------------------------------------------


  # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+ TITLE = """<h1 align="center" id="space-title">EASI Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
- Intro text
+ EASI: Holistic Evaluation and Analysis for Spatial Intelligence Made Easy
  """

  # Which evaluations are you running? how can people reproduce what you have?
  LLM_BENCHMARKS_TEXT = """
- ## How it works
+ ## Leaderboard

  ## Reproducibility
  To reproduce our results, here is the commands you can run:
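
Because `get_benchmarks()` is wrapped in `lru_cache(maxsize=1)`, `meta.toml` is parsed once per process and every caller (app.py, src/display/utils.py, src/leaderboard/read_evals.py) receives the same sorted list. A hypothetical call site:

```python
from src.about import get_benchmarks

benchmarks = get_benchmarks()           # first call loads and caches meta.toml
assert get_benchmarks() is benchmarks   # later calls return the cached list object
for b in benchmarks:                    # entries carry at least .key and .title
    print(b.key, b.title)
```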
src/backend/config.py CHANGED
@@ -1,8 +1,9 @@
  from functools import cached_property
+ from pathlib import Path
  from typing import Annotated

  from dotenv import load_dotenv
- from pydantic import Field, SecretStr
+ from pydantic import Field, SecretStr, computed_field
  from pydantic_settings import BaseSettings, SettingsConfigDict

  load_dotenv()
@@ -31,8 +32,26 @@ class Settings(BaseSettings):
          token=token,
      )

-     REQUESTS_REPO_ID: str = "y-playground/requests"
-     RESULTS_REPO_ID: str = "y-playground/results"
+     # Settings for Hugging Face repos
+     HF_OWNER: str = "lmms-lab"
+     HF_REPO_NAME: Annotated[str, Field(description="Name of leaderboard repo")] = "EASI-Leaderboard"
+     HF_RESULTS_REPO_NAME: Annotated[str, Field(description="Name of results repo")] = "EASI-Leaderboard-Results"
+     HF_REQUESTS_REPO_NAME: Annotated[str, Field(description="Name of requests repo")] = "EASI-Leaderboard-Requests"
+
+     @computed_field
+     @cached_property
+     def REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_REPO_NAME).as_posix()
+
+     @computed_field
+     @cached_property
+     def RESULTS_REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_RESULTS_REPO_NAME).as_posix()
+
+     @computed_field
+     @cached_property
+     def QUEUE_REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_REQUESTS_REPO_NAME).as_posix()


  settings = Settings()
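
The repo IDs are built by joining owner and repo name with `pathlib.Path` and serialising with `as_posix()`, which guarantees a forward slash on every platform. The same pattern in isolation, using this commit's default values:

```python
from pathlib import Path

owner, repo_name = "lmms-lab", "EASI-Leaderboard-Requests"
repo_id = (Path(owner) / repo_name).as_posix()
print(repo_id)  # "lmms-lab/EASI-Leaderboard-Requests", with "/" even on Windows
```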
src/backend/routes/hf.py CHANGED
@@ -52,7 +52,7 @@ async def upload_file_content(
      path_or_fileobj=file_obj,
      path_in_repo=params.path_in_repo,
      commit_message=params.commit_message,
-     repo_id=settings.REQUESTS_REPO_ID,
+     repo_id=settings.QUEUE_REPO_ID,
      repo_type="dataset",
  )
  return ResponseData(data=data)
@@ -104,7 +104,7 @@ async def community_submit(
      path_or_fileobj=file_obj,
      path_in_repo=path_in_repo,
      commit_message=params.commit_message,
-     repo_id=settings.REQUESTS_REPO_ID,
+     repo_id=settings.QUEUE_REPO_ID,
      repo_type="dataset",
  )
  return ResponseData(data=data)
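
Both routes now write into the requests dataset identified by `settings.QUEUE_REPO_ID`. Outside FastAPI, the equivalent upload is a single `huggingface_hub` call; a rough sketch with made-up file content and paths:

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # a write token for the owning org
api.upload_file(
    path_or_fileobj=b'{"model": "org/model"}',     # bytes, a path, or a file object
    path_in_repo="org/model_eval_request.json",    # hypothetical path in the dataset
    repo_id="lmms-lab/EASI-Leaderboard-Requests",  # i.e. settings.QUEUE_REPO_ID
    repo_type="dataset",
    commit_message="Add eval request",
)
```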
src/display/formatting.py CHANGED
@@ -9,6 +9,9 @@ def model_hyperlink(link: str, model_name: str) -> str:


  def make_clickable_model(model_name: str) -> str:
+     if "/" not in model_name:
+         # not a full model name, cannot be clicked
+         return model_name
      link = f"https://huggingface.co/{model_name}"
      return model_hyperlink(link, model_name)

@@ -26,7 +29,7 @@ def styled_message(message: str) -> str:


  def has_no_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
-     return df.loc[:, columns].notna().all(axis=1)
+     return df.loc[:, columns].notna().any(axis=1)


  def has_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
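
Two behavioural changes in this file: `make_clickable_model` now leaves bare names without an "org/" prefix unlinked, and `has_no_nan_values` switched from `.all(axis=1)` to `.any(axis=1)`, so a row now survives if it has at least one benchmark score instead of requiring all of them. A small check of the new mask semantics:

```python
import pandas as pd

df = pd.DataFrame({"MindCube(acc)": [55.0, None], "MMSI(acc)": [None, None]})

old_mask = df.notna().all(axis=1)  # previous behaviour: every score required
new_mask = df.notna().any(axis=1)  # new behaviour: one score is enough

print(old_mask.tolist())  # [False, False]  -> partially scored rows were dropped
print(new_mask.tolist())  # [True, False]   -> row 0 is kept with its single score
```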
src/display/utils.py CHANGED
@@ -9,7 +9,7 @@ from typing import Literal, Union
  from pydantic import BaseModel, ConfigDict, create_model
  from typing_extensions import Self

- from src.about import Tasks
+ from src.about import get_benchmarks


  def fields(
@@ -33,6 +33,8 @@ class ColumnContent(BaseModel):
      hidden: bool = False
      never_hidden: bool = False

+     not_supported: bool = False  # for not supported columns, should not be displayed
+
      @classmethod
      def new(
          cls,
@@ -42,6 +44,7 @@ class ColumnContent(BaseModel):
          *,
          hidden: bool = False,
          never_hidden: bool = False,
+         not_supported: bool = False,
      ) -> Self:
          return cls(
              name=name,
@@ -49,6 +52,7 @@ class ColumnContent(BaseModel):
              displayed_by_default=displayed_by_default,
              hidden=hidden,
              never_hidden=never_hidden,
+             not_supported=not_supported,
          )


@@ -56,29 +60,34 @@ class _AutoEvalColumnBase(BaseModel):
      model_config: ConfigDict = ConfigDict(extra="forbid", frozen=True)

      model_type_symbol: ColumnContent = ColumnContent(
-         name="T", type="str", displayed_by_default=True, never_hidden=True
+         name="T",
+         type="str",
+         displayed_by_default=True,
+         # never_hidden=True,
      )
      model: ColumnContent = ColumnContent.new("Model", "markdown", True, never_hidden=True)
      average: ColumnContent = ColumnContent.new("Average ⬆️", "number", True)

      model_type: ColumnContent = ColumnContent.new("Type", "str")
-     architecture: ColumnContent = ColumnContent.new("Architecture", "str")
+     architecture: ColumnContent = ColumnContent.new("Architecture", "str", not_supported=True)
      weight_type: ColumnContent = ColumnContent.new("Weight type", "str", hidden=True)
-     precision: ColumnContent = ColumnContent.new("Precision", "str")
-     license: ColumnContent = ColumnContent.new("Hub License", "str")
-     params: ColumnContent = ColumnContent.new("#Params (B)", "number")
-     likes: ColumnContent = ColumnContent.new("Hub ❤️", "number")
-     still_on_hub: ColumnContent = ColumnContent.new("Available on the hub", "bool")
-     revision: ColumnContent = ColumnContent.new("Model sha", "str")
+     precision: ColumnContent = ColumnContent.new("Precision", "str", not_supported=True)
+     license: ColumnContent = ColumnContent.new("Hub License", "str", not_supported=True)
+     params: ColumnContent = ColumnContent.new("#Params (B)", "number", not_supported=True)
+     likes: ColumnContent = ColumnContent.new("Hub ❤️", "number", not_supported=True)
+     still_on_hub: ColumnContent = ColumnContent.new("Available on the hub", "bool", not_supported=True)
+     revision: ColumnContent = ColumnContent.new("Model sha", "str", not_supported=True)
+

+ BENCHMARKS = get_benchmarks()

  # We use create_model to dynamically fill the scores from Tasks
  field_definitions = {
-     task.name: (
+     task.key: (
          ColumnContent,
-         ColumnContent.new(task.value.col_name, "number", True),
+         ColumnContent.new(task.title, "number", True),
      )
-     for task in Tasks
+     for task in BENCHMARKS
  }
  AutoEvalColumnCls: type[_AutoEvalColumnBase] = create_model(  # pyright: ignore[reportCallIssue]
      '_AutoEvalColumnCls',
@@ -156,9 +165,11 @@ class Precision(Enum):


  # Column selection
- COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
+ # COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
  BASE_COLS: list[str] = [c.name for c in fields(_AutoEvalColumnBase) if not c.hidden]
  EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
  EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
+ NOT_SUPPORTED_COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if c.not_supported]

- BENCHMARK_COLS: list[str] = [t.value.col_name for t in Tasks]
+ # BENCHMARK_COLS: list[str] = [t.value.col_name for t in Tasks]
+ BENCHMARK_COLS: list[str] = [t.title for t in BENCHMARKS]
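
The dynamic column model is the least obvious part of this file: `create_model` stamps one `ColumnContent` field per benchmark onto the base class, and `NOT_SUPPORTED_COLS` is then just a filter over the declared fields. A reduced sketch of the same pattern with two hypothetical benchmarks (the real code uses the `fields()` helper rather than `model_fields` directly):

```python
from pydantic import BaseModel, create_model


class ColumnContent(BaseModel):
    name: str
    type: str = "number"
    not_supported: bool = False


class Base(BaseModel):
    model: ColumnContent = ColumnContent(name="Model", type="markdown")
    precision: ColumnContent = ColumnContent(name="Precision", type="str", not_supported=True)


# One extra field per benchmark, mirroring field_definitions above.
Auto = create_model(
    "Auto",
    __base__=Base,
    mindcube=(ColumnContent, ColumnContent(name="MindCube(acc)")),
    mmsi=(ColumnContent, ColumnContent(name="MMSI(acc)")),
)

cols = [f.default for f in Auto.model_fields.values()]
print([c.name for c in cols if c.not_supported])  # ['Precision']
```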
src/envs.py CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
  from typing import Annotated

  from huggingface_hub import HfApi
- from pydantic import Field, computed_field
+ from pydantic import Field, SecretStr, computed_field
  from pydantic_settings import BaseSettings, SettingsConfigDict

  # ----------------------------------
@@ -14,64 +14,64 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
  class Settings(BaseSettings):
      model_config = SettingsConfigDict(env_file=".env")

-     TOKEN: Annotated[str, Field(..., alias="HF_TOKEN", description="A read/write token for your org")]
+     HF_TOKEN: Annotated[SecretStr, Field(..., description="A read/write token for your org")]

-     # Change to your org - don't forget to create a results and request dataset, with the correct format!
-     OWNER: Annotated[
-         str,
-         Field("y-playground"),
-     ]
-
-     BACKEND_HOST: Annotated[str, Field("127.0.0.1", description="Backend host")]
-     BACKEND_PORT: Annotated[int, Field(8000, description="Backend port")]
+     # Settings for Hugging Face repos
+     HF_OWNER: str = "lmms-lab"
+     HF_REPO_NAME: Annotated[str, Field(description="Name of leaderboard repo")] = "EASI-Leaderboard"
+     HF_RESULTS_REPO_NAME: Annotated[str, Field(description="Name of results repo")] = "EASI-Leaderboard-Results"
+     HF_REQUESTS_REPO_NAME: Annotated[str, Field(description="Name of requests repo")] = "EASI-Leaderboard-Requests"

      @computed_field
      @cached_property
      def REPO_ID(self) -> str:
-         return (Path(self.OWNER) / "leaderboard").as_posix()
+         return (Path(self.HF_OWNER) / self.HF_REPO_NAME).as_posix()

      @computed_field
      @cached_property
-     def QUEUE_REPO(self) -> str:
-         return (Path(self.OWNER) / "requests").as_posix()
+     def RESULTS_REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_RESULTS_REPO_NAME).as_posix()

      @computed_field
      @cached_property
-     def RESULTS_REPO(self) -> str:
-         return (Path(self.OWNER) / "results").as_posix()
-
-     CACHE_PATH: Annotated[
-         str,
-         Field(".", alias="HF_HOME", description="If you setup a cache later, just change `HF_HOME`"),
+     def QUEUE_REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_REQUESTS_REPO_NAME).as_posix()
+
+     HF_HOME: Annotated[
+         Path,
+         Field(
+             default_factory=lambda: Path(".").resolve(),
+             description="If you setup a cache later, just change `HF_HOME`",
+         ),
      ]

+     # Backend settings
+
+     BACKEND_HOST: Annotated[str, Field("127.0.0.1", description="Backend host")]
+     BACKEND_PORT: Annotated[int, Field(8000, description="Backend port")]
+
      # Local caches

      @computed_field
      @cached_property
      def EVAL_REQUESTS_PATH(self) -> str:
-         return (Path(self.CACHE_PATH) / "eval-queue").as_posix()
+         return (self.HF_HOME / "eval-queue").as_posix()

      @computed_field
      @cached_property
      def EVAL_RESULTS_PATH(self) -> str:
-         return (Path(self.CACHE_PATH) / "eval-results").as_posix()
-
-     @computed_field
-     @cached_property
-     def EVAL_REQUESTS_PATH_BACKEND(self) -> str:
-         return (Path(self.CACHE_PATH) / "eval-queue-bk").as_posix()
+         return (self.HF_HOME / "eval-results").as_posix()

      @computed_field
      @cached_property
-     def EVAL_RESULTS_PATH_BACKEND(self) -> str:
-         return (Path(self.CACHE_PATH) / "eval-results-bk").as_posix()
+     def EVAL_REQUESTS_PATH_BACKUP(self) -> str:
+         return (self.HF_HOME / "eval-queue-bk").as_posix()

      @computed_field
      @cached_property
-     def API(self) -> HfApi:
-         return HfApi(token=self.TOKEN)
+     def EVAL_RESULTS_PATH_BACKUP(self) -> str:
+         return (self.HF_HOME / "eval-results-bk").as_posix()


  settings = Settings()  # pyright: ignore[reportCallIssue]
- API = settings.API
+ API = HfApi(token=settings.HF_TOKEN.get_secret_value())
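
`HF_TOKEN` is now a `SecretStr`, so it prints masked in the `settings.model_dump()` log line in app.py and has to be unwrapped explicitly where the raw token is needed, as the `HfApi` construction above does. A small illustration:

```python
from pydantic import SecretStr

token = SecretStr("hf_example_token")
print(token)                     # ********** (safe to log)
print(token.get_secret_value())  # hf_example_token (only where an API call needs it)
```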
src/leaderboard/read_evals.py CHANGED
@@ -15,10 +15,14 @@ import numpy as np
  from pydantic import BaseModel, ConfigDict, Field
  from typing_extensions import Self

+ from src.about import get_benchmarks
  from src.display.formatting import make_clickable_model
- from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+ from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType
+ from src.prepare import load_meta_toml
  from src.submission.check_validity import is_model_on_hub

+ BENCHMARKS = get_benchmarks()
+

  class EvalResultJson(BaseModel):
      """Model of the eval result json file."""
@@ -34,8 +38,8 @@ class EvalResultJson_Config(BaseModel):

      model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)

-     model_dtype: Annotated[str, Field(..., description="The model precision. e.g. torch.bfloat16")]
      model_name: Annotated[str, Field(..., description="The model name. e.g. Qwen/Qwen2.5-3B")]
+     model_dtype: Annotated[str | None, Field(description="The model precision. e.g. torch.bfloat16")] = None
      model_sha: Annotated[str, Field(description="The model sha. e.g. 3aab1f1954e9cc14eb9509a215f9e5ca08227a9b")] = ""
      model_args: Annotated[str | None, Field(description="The model args.")] = None

@@ -70,6 +74,7 @@ class EvalResult(BaseModel):
          precision = Precision.from_str(config.model_dtype)

          # Get model and org
+
          org_and_model = config.model_name or config.model_args or ""
          org_and_model = org_and_model.split("/", 1)

@@ -83,6 +88,10 @@ class EvalResult(BaseModel):
          result_key = f"{org}_{model}_{precision.value.name}"
          full_model = "/".join(org_and_model)

+         meta_toml = load_meta_toml()
+         # update full_model from meta_toml if it exists
+         full_model = meta_toml.model_title_to_repo_id.get(full_model, full_model)
+
          still_on_hub, _, model_config = is_model_on_hub(
              full_model, config.model_sha or "main", trust_remote_code=True, test_tokenizer=False
          )
@@ -94,16 +103,15 @@ class EvalResult(BaseModel):

          # Extract results available in this file (some results are split in several files)
          results: dict[str, float] = {}
-         for t in Tasks:
-             task = t.value
-
+         for task in BENCHMARKS:
              # We average all scores of a given metric (not all metrics are present in all files)
-             accs = np.array([v.get(task.metric, None) for k, v in data.results.items() if task.benchmark == k])
+             # TODO: support multiple metrics
+             accs = np.array([v.get("acc", None) for k, v in data.results.items() if task.key == k])
              if accs.size == 0 or any(acc is None for acc in accs):
                  continue

              mean_acc = np.mean(accs) * 100.0
-             results[task.benchmark] = float(mean_acc)
+             results[task.title] = float(mean_acc)

          return cls.model_validate({
              "eval_name": result_key,
@@ -119,6 +127,8 @@ class EvalResult(BaseModel):

      def update_with_request_file(self, requests_path: str) -> None:
          """Finds the relevant request file for the current model and updates info with it"""
+         # TODO: do nothing for now
+         return
          request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

          try:
@@ -137,7 +147,7 @@ class EvalResult(BaseModel):

      def to_dict(self) -> dict:
          """Converts the Eval Result to a dict compatible with our dataframe display"""
-         average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
+         average = sum(v for v in self.results.values() if v is not None) / len(BENCHMARKS)
          data_dict = {
              "eval_name": self.eval_name,  # not a column, just a save name,
              AutoEvalColumn.precision.name: self.precision.value.name,
@@ -154,8 +164,8 @@ class EvalResult(BaseModel):
              AutoEvalColumn.still_on_hub.name: self.still_on_hub,
          }

-         for task in Tasks:
-             data_dict[task.value.col_name] = self.results[task.value.benchmark]
+         for task in BENCHMARKS:
+             data_dict[task.title] = self.results.get(task.title, None)

          return data_dict

@@ -181,8 +191,6 @@ def get_request_file_for_model(requests_path, model_name, precision) -> str:

  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
      """From the path of the results folder root, extract all needed info for results"""
-     from rich import print as rprint  # FIXME: DEBUG
-
      model_result_filepaths: list[str] = []

      for root, _, files in os.walk(results_path):
@@ -208,7 +216,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
      # Store results of same eval together
      eval_name = eval_result.eval_name
      if eval_name in eval_results.keys():
-         eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+         results_loaded = {k: v for k, v in eval_result.results.items() if v is not None}
+         eval_results[eval_name].results.update(results_loaded)
      else:
          eval_results[eval_name] = eval_result
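
The score extraction now matches `task.key` against the keys of the raw results dict and stores the percentage under `task.title`; only the "acc" metric is read for now (see the TODO above). A toy walk-through of that loop with made-up data:

```python
import numpy as np

# Made-up shapes: result keys are benchmark keys, values are metric dicts.
raw_results = {"mindcube": {"acc": 0.5}, "mmsi": {"acc": 0.25}}
benchmarks = [("mindcube", "MindCube(acc)"), ("mmsi", "MMSI(acc)"), ("stare", "STARE(acc)")]

results: dict[str, float] = {}
for key, title in benchmarks:
    accs = np.array([v.get("acc", None) for k, v in raw_results.items() if key == k])
    if accs.size == 0 or any(a is None for a in accs):
        continue  # benchmark missing from this file; another result file may fill it in
    results[title] = float(np.mean(accs) * 100.0)

print(results)  # {'MindCube(acc)': 50.0, 'MMSI(acc)': 25.0}; STARE is skipped
```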
src/populate.py CHANGED
@@ -60,7 +60,7 @@ def get_leaderboard_df(
      df = df.loc[:, cols].round(decimals=2)

      # filter out if any of the benchmarks have not been produced
-     df = df[has_no_nan_values(df, benchmark_cols)]
+     df = df.loc[has_no_nan_values(df, benchmark_cols), :]
      return df

src/prepare.py ADDED
@@ -0,0 +1,137 @@
+ import os
+ import sys
+ from functools import cached_property, lru_cache
+ from pathlib import Path
+
+ from huggingface_hub import snapshot_download
+ from loguru import logger
+ from pydantic import BaseModel, ConfigDict
+ from typing_extensions import Self
+
+ from src.envs import API, settings
+
+ if sys.version_info < (3, 11):
+     from tomli import load as toml_load
+ else:
+     from tomllib import load as toml_load
+
+ PREPARED_FLAG: bool = os.getenv("NO_DOWNLOAD", 0) == 1
+
+
+ def prepare_space():
+     """Space initialisation"""
+
+     def _restart_space():
+         API.restart_space(repo_id=settings.REPO_ID)
+
+     global PREPARED_FLAG
+     if not PREPARED_FLAG:
+         try:
+             snapshot_download(
+                 repo_id=settings.QUEUE_REPO_ID,
+                 local_dir=settings.EVAL_REQUESTS_PATH,
+                 repo_type="dataset",
+                 tqdm_class=None,
+                 etag_timeout=30,
+                 token=settings.HF_TOKEN.get_secret_value(),
+             )
+         except Exception as e:
+             logger.error(f"Error downloading eval queue: {e!s}")
+             _restart_space()
+         try:
+             snapshot_download(
+                 repo_id=settings.RESULTS_REPO_ID,
+                 local_dir=settings.EVAL_RESULTS_PATH,
+                 repo_type="dataset",
+                 tqdm_class=None,
+                 etag_timeout=30,
+                 allow_patterns=["leaderboard/*.toml", "leaderboard/**/*.json"],
+                 token=settings.HF_TOKEN.get_secret_value(),
+             )
+         except Exception as e:
+             logger.error(f"Error downloading eval queue: {e!s}")
+             _restart_space()
+         PREPARED_FLAG = True
+
+     load_meta_toml()
+
+
+ class MetaToml(BaseModel):
+     model_config = ConfigDict(extra="allow", frozen=True)
+
+     models: list["MetaToml_Model"]
+     benchmarks: list["MetaToml_Benchmark"]
+     model_repos: list["MetaToml_ModelRepo"]
+
+     @cached_property
+     def model_title_to_key(self) -> dict[str, str]:
+         return {model.title: model.key for model in self.models}
+
+     @cached_property
+     def benchmark_title_to_key(self) -> dict[str, str]:
+         return {benchmark.title: benchmark.key for benchmark in self.benchmarks}
+
+     @cached_property
+     def model_key_to_repo_id(self) -> dict[str, str]:
+         return {model.key: model.repo_id for model in self.model_repos if model.repo_id is not None}
+
+     @cached_property
+     def model_title_to_repo_id(self) -> dict[str, str]:
+         mapping: dict[str, str] = {}
+         for model in self.models:
+             model_key = self.model_title_to_key.get(model.title)
+             if model_key:
+                 model_repo_id = self.model_key_to_repo_id.get(model_key)
+                 if model_repo_id:
+                     mapping[model.title] = model_repo_id
+         return mapping
+
+
+ class _HashableComparableMixin(BaseModel):
+     model_config = ConfigDict(extra="allow", frozen=True)
+
+     key: str
+     title: str
+
+     def __hash__(self) -> int:
+         return hash(self.key)
+
+     def __eq__(self, other: Self) -> bool:
+         return (self.key, self.title) == (other.key, other.title)
+
+     def __lt__(self, other: Self) -> bool:
+         return (self.key, self.title) < (other.key, other.title)
+
+     def __gt__(self, other: Self) -> bool:
+         return (self.key, self.title) > (other.key, other.title)
+
+     def __le__(self, other: Self) -> bool:
+         return (self.key, self.title) <= (other.key, other.title)
+
+     def __ge__(self, other: Self) -> bool:
+         return (self.key, self.title) >= (other.key, other.title)
+
+
+ class MetaToml_Benchmark(_HashableComparableMixin): ...
+
+
+ class MetaToml_Model(_HashableComparableMixin): ...
+
+
+ class MetaToml_ModelRepo(BaseModel):
+     model_config = ConfigDict(extra="allow", frozen=True)
+
+     key: str
+     repo_id: str | None = None
+
+
+ @lru_cache(maxsize=1)
+ def load_meta_toml() -> MetaToml:
+     meta_toml_path = Path(settings.EVAL_RESULTS_PATH) / "leaderboard" / "meta.toml"
+     logger.info(f'Loading meta.toml from: {meta_toml_path.as_posix()!r}')
+     with meta_toml_path.open("rb") as f:
+         data = toml_load(f)
+     meta_toml = MetaToml.model_validate(data)
+     logger.info("Loaded meta.toml")
+     assert meta_toml is not None, f"Failed to load meta.toml: {meta_toml_path.as_posix()!r}"
+     return meta_toml
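
`load_meta_toml` expects `leaderboard/meta.toml` inside the results snapshot to provide `models`, `benchmarks`, and `model_repos` arrays whose entries carry `key`/`title` (and, for repos, an optional `repo_id`). The actual file is not part of this commit, so the shape below is only an illustrative guess validated against the `MetaToml` schema; note that importing `src.prepare` pulls in `src.envs`, which needs `HF_TOKEN` to be resolvable:

```python
import tomllib  # Python 3.11+; src/prepare.py itself falls back to tomli on 3.10

from src.prepare import MetaToml

SAMPLE = """
[[benchmarks]]
key = "mindcube"
title = "MindCube(acc)"

[[models]]
key = "qwen2_5_vl_3b"
title = "Qwen2.5-VL-3B"

[[model_repos]]
key = "qwen2_5_vl_3b"
repo_id = "Qwen/Qwen2.5-VL-3B-Instruct"
"""

meta = MetaToml.model_validate(tomllib.loads(SAMPLE))
print(meta.model_title_to_repo_id)  # {'Qwen2.5-VL-3B': 'Qwen/Qwen2.5-VL-3B-Instruct'}
```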
src/submission/submit.py CHANGED
@@ -53,14 +53,14 @@ def add_new_eval(
      # Is the model on the hub?
      if weight_type in ["Delta", "Adapter"]:
          base_model_on_hub, error, _ = is_model_on_hub(
-             model_name=base_model, revision=revision, token=settings.TOKEN, test_tokenizer=True
+             model_name=base_model, revision=revision, token=settings.HF_TOKEN.get_secret_value(), test_tokenizer=True
          )
          if not base_model_on_hub:
              return styled_error(f'Base model "{base_model}" {error}')

      if not weight_type == "Adapter":
          model_on_hub, error, _ = is_model_on_hub(
-             model_name=model, revision=revision, token=settings.TOKEN, test_tokenizer=True
+             model_name=model, revision=revision, token=settings.HF_TOKEN.get_secret_value(), test_tokenizer=True
          )
          if not model_on_hub:
              return styled_error(f'Model "{model}" {error}')
@@ -117,7 +117,7 @@ def add_new_eval(
      API.upload_file(
          path_or_fileobj=out_path,
          path_in_repo=out_path.split("eval-queue/")[1],
-         repo_id=settings.QUEUE_REPO,
+         repo_id=settings.QUEUE_REPO_ID,
          repo_type="dataset",
          commit_message=f"Add {model} to eval queue",
      )
uv.lock CHANGED
@@ -356,6 +356,76 @@ wheels = [
356
  { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" },
357
  ]
358
 
 
359
  [[package]]
360
  name = "exceptiongroup"
361
  version = "1.3.0"
@@ -667,68 +737,6 @@ wheels = [
667
  { url = "https://files.pythonhosted.org/packages/f9/1c/5d4d468fb16f8410e596ed0eac02d2c68752aa7dc92997fe9d60a7147665/kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb", size = 73744, upload-time = "2025-08-10T21:27:42.254Z" },
668
  ]
669
 
670
- [[package]]
671
- name = "leaderboard"
672
- version = "0.1.0"
673
- source = { virtual = "." }
674
- dependencies = [
675
- { name = "apscheduler" },
676
- { name = "datasets" },
677
- { name = "fastapi" },
678
- { name = "gradio", extra = ["oauth"] },
679
- { name = "gradio-client" },
680
- { name = "gradio-leaderboard" },
681
- { name = "huggingface-hub" },
682
- { name = "loguru" },
683
- { name = "matplotlib" },
684
- { name = "numpy" },
685
- { name = "pandas" },
686
- { name = "pydantic" },
687
- { name = "pydantic-settings" },
688
- { name = "python-dateutil" },
689
- { name = "python-dotenv" },
690
- { name = "rich" },
691
- { name = "sentencepiece" },
692
- { name = "tokenizers" },
693
- { name = "tqdm" },
694
- { name = "transformers" },
695
- { name = "uvicorn" },
696
- ]
697
-
698
- [package.dev-dependencies]
699
- dev = [
700
- { name = "ruff" },
701
- ]
702
-
703
- [package.metadata]
704
- requires-dist = [
705
- { name = "apscheduler" },
706
- { name = "datasets" },
707
- { name = "fastapi", specifier = ">=0.120.0" },
708
- { name = "gradio" },
709
- { name = "gradio", extras = ["oauth"] },
710
- { name = "gradio-client" },
711
- { name = "gradio-leaderboard", specifier = "==0.0.13" },
712
- { name = "huggingface-hub", specifier = ">=0.18.0" },
713
- { name = "loguru", specifier = ">=0.7.3" },
714
- { name = "matplotlib" },
715
- { name = "numpy" },
716
- { name = "pandas" },
717
- { name = "pydantic", specifier = ">=2.11.10" },
718
- { name = "pydantic-settings", specifier = ">=2.11.0" },
719
- { name = "python-dateutil" },
720
- { name = "python-dotenv", specifier = ">=1.2.1" },
721
- { name = "rich", specifier = ">=14.2.0" },
722
- { name = "sentencepiece" },
723
- { name = "tokenizers", specifier = ">=0.15.0" },
724
- { name = "tqdm" },
725
- { name = "transformers" },
726
- { name = "uvicorn", specifier = ">=0.38.0" },
727
- ]
728
-
729
- [package.metadata.requires-dev]
730
- dev = [{ name = "ruff", specifier = ">=0.14.0,<0.15.0" }]
731
-
732
  [[package]]
733
  name = "loguru"
734
  version = "0.7.3"
@@ -1324,6 +1332,15 @@ wheels = [
1324
  { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
1325
  ]
1326
 
 
 
 
 
 
 
 
 
 
1327
  [[package]]
1328
  name = "tokenizers"
1329
  version = "0.22.1"
@@ -1349,6 +1366,15 @@ wheels = [
1349
  { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" },
1350
  ]
1351
 
 
 
 
 
 
 
 
 
 
1352
  [[package]]
1353
  name = "tomlkit"
1354
  version = "0.13.3"
 
356
  { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" },
357
  ]
358
 
359
+ [[package]]
360
+ name = "easi-leaderboard"
361
+ version = "0.1.0"
362
+ source = { virtual = "." }
363
+ dependencies = [
364
+ { name = "apscheduler" },
365
+ { name = "datasets" },
366
+ { name = "fastapi" },
367
+ { name = "gradio", extra = ["oauth"] },
368
+ { name = "gradio-client" },
369
+ { name = "gradio-leaderboard" },
370
+ { name = "huggingface-hub" },
371
+ { name = "loguru" },
372
+ { name = "matplotlib" },
373
+ { name = "numpy" },
374
+ { name = "pandas" },
375
+ { name = "pydantic" },
376
+ { name = "pydantic-settings" },
377
+ { name = "python-dateutil" },
378
+ { name = "python-dotenv" },
379
+ { name = "rich" },
380
+ { name = "sentencepiece" },
381
+ { name = "tokenizers" },
382
+ { name = "tomli" },
383
+ { name = "tqdm" },
384
+ { name = "transformers" },
385
+ { name = "typing-extensions" },
386
+ { name = "uvicorn" },
387
+ ]
388
+
389
+ [package.dev-dependencies]
390
+ dev = [
391
+ { name = "ruff" },
392
+ { name = "tabulate" },
393
+ ]
394
+
395
+ [package.metadata]
396
+ requires-dist = [
397
+ { name = "apscheduler" },
398
+ { name = "datasets" },
399
+ { name = "fastapi", specifier = ">=0.120.0" },
400
+ { name = "gradio" },
401
+ { name = "gradio", extras = ["oauth"] },
402
+ { name = "gradio-client" },
403
+ { name = "gradio-leaderboard", specifier = "==0.0.13" },
404
+ { name = "huggingface-hub", specifier = ">=0.18.0" },
405
+ { name = "loguru", specifier = ">=0.7.3" },
406
+ { name = "matplotlib" },
407
+ { name = "numpy" },
408
+ { name = "pandas" },
409
+ { name = "pydantic", specifier = ">=2.11.10" },
410
+ { name = "pydantic-settings", specifier = ">=2.11.0" },
411
+ { name = "python-dateutil" },
412
+ { name = "python-dotenv", specifier = ">=1.2.1" },
413
+ { name = "rich", specifier = ">=14.2.0" },
414
+ { name = "sentencepiece" },
415
+ { name = "tokenizers", specifier = ">=0.15.0" },
416
+ { name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.3.0" },
417
+ { name = "tqdm" },
418
+ { name = "transformers" },
419
+ { name = "typing-extensions", specifier = ">=4.15.0" },
420
+ { name = "uvicorn", specifier = ">=0.38.0" },
421
+ ]
422
+
423
+ [package.metadata.requires-dev]
424
+ dev = [
425
+ { name = "ruff", specifier = ">=0.14.0,<0.15.0" },
426
+ { name = "tabulate" },
427
+ ]
428
+
429
  [[package]]
430
  name = "exceptiongroup"
431
  version = "1.3.0"
 
737
  { url = "https://files.pythonhosted.org/packages/f9/1c/5d4d468fb16f8410e596ed0eac02d2c68752aa7dc92997fe9d60a7147665/kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb", size = 73744, upload-time = "2025-08-10T21:27:42.254Z" },
738
  ]
739
 
 
740
  [[package]]
741
  name = "loguru"
742
  version = "0.7.3"
 
1332
  { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
1333
  ]
1334
 
1335
+ [[package]]
1336
+ name = "tabulate"
1337
+ version = "0.9.0"
1338
+ source = { registry = "https://pypi.org/simple" }
1339
+ sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
1340
+ wheels = [
1341
+ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
1342
+ ]
1343
+
1344
  [[package]]
1345
  name = "tokenizers"
1346
  version = "0.22.1"
 
1366
  { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" },
1367
  ]
1368
 
1369
+ [[package]]
1370
+ name = "tomli"
1371
+ version = "2.3.0"
1372
+ source = { registry = "https://pypi.org/simple" }
1373
+ sdist = { url = "https://files.pythonhosted.org/packages/52/ed/3f73f72945444548f33eba9a87fc7a6e969915e7b1acc8260b30e1f76a2f/tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549", size = 17392, upload-time = "2025-10-08T22:01:47.119Z" }
1374
+ wheels = [
1375
+ { url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408, upload-time = "2025-10-08T22:01:46.04Z" },
1376
+ ]
1377
+
1378
  [[package]]
1379
  name = "tomlkit"
1380
  version = "0.13.3"