Commit c85dcc4 · committed by yangzhitao · 1 Parent(s): d1fd905

feat: enhance leaderboard functionality and refactor app structure; update token handling and project metadata; update environment settings

.env.example CHANGED
@@ -1,2 +1,6 @@
  HF_TOKEN=changethis
  HF_HOME=.
+ HF_OWNER=lmms-lab
+ HF_REPO_NAME=EASI-Leaderboard
+ HF_RESULTS_REPO_NAME=EASI-Leaderboard-Results
+ HF_REQUESTS_REPO_NAME=EASI-Leaderboard-Requests
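
The four new variables map one-to-one onto the fields added to `Settings` in src/envs.py below. A minimal sketch of how they resolve into repo IDs, assuming `HF_TOKEN` and the other variables are set in `.env` (the names shown are this commit's defaults):

```python
from src.envs import settings  # pydantic-settings reads .env on import

# Derived repo IDs combine HF_OWNER with the three repo-name fields.
print(settings.REPO_ID)          # "lmms-lab/EASI-Leaderboard"
print(settings.RESULTS_REPO_ID)  # "lmms-lab/EASI-Leaderboard-Results"
print(settings.QUEUE_REPO_ID)    # "lmms-lab/EASI-Leaderboard-Requests"
```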
.vscode/cspell.json CHANGED
@@ -3,8 +3,10 @@
  "accs",
  "changethis",
  "checkboxgroup",
+ "EASI",
  "evals",
  "initialisation",
+ "lmms",
  "modelcard",
  "sentencepiece"
  ]
app.py CHANGED
@@ -6,17 +6,16 @@ import pandas as pd
  import requests
  import uvicorn
  from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download
  from rich import print

  from src.about import (
-     BENCHMARKS,
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
      EVALUATION_QUEUE_TEXT,
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
+     get_benchmarks,
  )
  from src.backend.app import create_app
  from src.display.css_html_js import (
@@ -28,9 +27,9 @@ from src.display.css_html_js import (
  from src.display.utils import (
      BASE_COLS,
      BENCHMARK_COLS,
-     COLS,
      EVAL_COLS,
      EVAL_TYPES,
+     NOT_SUPPORTED_COLS,
      AutoEvalColumn,
      ModelType,
      Precision,
@@ -38,8 +37,12 @@ from src.display.utils import (
  )
  from src.envs import API, settings
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.prepare import prepare_space
  from src.submission.submit import add_new_eval

+ prepare_space()
+ BENCHMARKS = get_benchmarks()
+

  def restart_space():
      API.restart_space(repo_id=settings.REPO_ID)
@@ -47,37 +50,13 @@ def restart_space():

  print("///// --- Settings --- /////", settings.model_dump())

- # Space initialisation
- try:
-     snapshot_download(
-         repo_id=settings.QUEUE_REPO,
-         local_dir=settings.EVAL_REQUESTS_PATH,
-         repo_type="dataset",
-         tqdm_class=None,
-         etag_timeout=30,
-         token=settings.TOKEN,
-     )
- except Exception:
-     restart_space()
- try:
-     snapshot_download(
-         repo_id=settings.RESULTS_REPO,
-         local_dir=settings.EVAL_RESULTS_PATH,
-         repo_type="dataset",
-         tqdm_class=None,
-         etag_timeout=30,
-         token=settings.TOKEN,
-     )
- except Exception:
-     restart_space()
-
- LEADERBOARD_DF = get_leaderboard_df(
-     settings.EVAL_RESULTS_PATH,
-     settings.EVAL_REQUESTS_PATH,
-     COLS,
-     BENCHMARK_COLS,
- )
+ # LEADERBOARD_DF = get_leaderboard_df(
+ #     settings.EVAL_RESULTS_PATH,
+ #     settings.EVAL_REQUESTS_PATH,
+ #     COLS,
+ #     BENCHMARK_COLS,
+ # )

  (
      finished_eval_queue_df,
@@ -90,8 +69,9 @@ def filter_dataframe_by_columns(selected_cols: list[str], original_df: pd.DataFrame):
      """
      Filter the DataFrame by the selected columns
      """
-     # Always include the base columns 'T' and 'Model'
-     base_cols = ['T', 'Model']
+     # # Always include the base columns 'T' and 'Model'
+     # base_cols = ['T', 'Model']
+     base_cols = ['Model']
      all_selected_cols = [col for col in base_cols if col in original_df.columns]

      # Add the user-selected columns (excluding base columns already present)
@@ -175,7 +155,11 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFrame:
      return filtered_df


- def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
+ def init_leaderboard_tabs(
+     dataframe: pd.DataFrame,
+     cols: list[str],
+     not_supported_cols: list[str],
+ ):
      # Keep the original DataFrame for later filtering (captured in a closure)
      original_df = dataframe.copy()

@@ -187,7 +171,9 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
      )

      # Initialize the displayed columns (base columns plus the default selection)
-     default_selected = [col for col in dataframe.columns if col in cols] + ['Average ⬆️']
+     default_selected = [col for col in dataframe.columns if col in cols and col not in not_supported_cols] + [
+         'Average ⬆️'
+     ]

      # First filter original_df by precision
      precision_filtered_df = filter_dataframe_by_precision(default_precision, original_df)
@@ -197,8 +183,13 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
      with gr.Row():
          with gr.Column(scale=1):
              search = gr.Textbox(label="Search", placeholder="Separate multiple queries with commas")
+             column_choices = [
+                 col
+                 for col in dataframe.columns
+                 if col not in ['T', 'Model'] and (not not_supported_cols or col not in not_supported_cols)
+             ]
              show_columns = gr.CheckboxGroup(
-                 choices=[col for col in dataframe.columns if col not in ['T', 'Model']],
+                 choices=column_choices,
                  label="Select Columns to Display",
                  value=default_selected,
                  interactive=True,
@@ -271,24 +262,37 @@ def main():
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as _tabs:
-         for i, benchmark in enumerate[str](sorted(BENCHMARKS)):
-             with gr.TabItem(f"🏅 {benchmark}", elem_id="llm-benchmark-tab-table", id=i):
-                 benchmark_cols = [
-                     BENCHMARK_COL for BENCHMARK_COL in BENCHMARK_COLS if BENCHMARK_COL.startswith(benchmark)
-                 ]
-                 cols = BASE_COLS + benchmark_cols
-                 BENCHMARK_DF = get_leaderboard_df(
-                     settings.EVAL_RESULTS_PATH,
-                     settings.EVAL_REQUESTS_PATH,
-                     cols,
-                     benchmark_cols,
-                 )
-                 _leaderboard = init_leaderboard_tabs(BENCHMARK_DF, benchmark_cols)
+         with gr.TabItem("📝 Overview", elem_id="benchmark-overview-tab", id=0):
+             benchmark_cols = BENCHMARK_COLS.copy()
+             print("benchmark_cols:", benchmark_cols)
+             cols = BASE_COLS + benchmark_cols
+             benchmark_df = get_leaderboard_df(
+                 settings.EVAL_RESULTS_PATH,
+                 settings.EVAL_REQUESTS_PATH,
+                 cols,
+                 benchmark_cols,
+             )
+             _leaderboard = init_leaderboard_tabs(benchmark_df, benchmark_cols, NOT_SUPPORTED_COLS)
+
+         i_bench = 1
+         if False:
+             for i_bench, benchmark in enumerate(sorted(BENCHMARKS), start=1):
+                 with gr.TabItem(f"🏅 {benchmark.title}", elem_id="llm-benchmark-tab-table", id=i_bench):
+                     print(f"benchmark.title: {benchmark.title!r}")
+                     benchmark_cols = [col for col in BENCHMARK_COLS if col.startswith(benchmark.title)]
+                     cols = BASE_COLS + benchmark_cols
+                     benchmark_df = get_leaderboard_df(
+                         settings.EVAL_RESULTS_PATH,
+                         settings.EVAL_REQUESTS_PATH,
+                         cols,
+                         benchmark_cols,
+                     )
+                     _leaderboard = init_leaderboard_tabs(benchmark_df, benchmark_cols, NOT_SUPPORTED_COLS)

-         with gr.TabItem("📝 About", elem_id="about-tab", id=len(BENCHMARKS)):
+         with gr.TabItem("📝 About", elem_id="about-tab", id=i_bench + 1):
              gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-         with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=len(BENCHMARKS) + 1):
+         with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=i_bench + 2):
              with gr.Column():
                  with gr.Row():
                      gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -405,6 +409,28 @@ def main():
          submission_result,
      )

+     # Backend status indicator
+     backend_status = gr.HTML(
+         value=get_backend_status_undefined_html(),
+         elem_id="backend-status-container",
+     )
+     # trigger button to bind the click event
+     status_trigger = gr.Button(elem_id="backend-status-trigger-btn", visible=False)
+     status_trigger.click(
+         fn=lambda: check_backend_health()[1],
+         inputs=None,
+         outputs=backend_status,
+     )
+     # load external JavaScript file
+     js_content = backend_status_js()
+     status_trigger_js_html = f'<script>{js_content}</script>'
+     gr.HTML(status_trigger_js_html, visible=False)
+     demo.load(
+         fn=lambda: check_backend_health()[1],
+         inputs=None,
+         outputs=backend_status,
+     )
+
      with gr.Row():
          with gr.Accordion("📙 Citation", open=False):
              _citation_button = gr.Textbox(
@@ -414,28 +440,6 @@ def main():
                  elem_id="citation-button",
                  show_copy_button=True,
              )
-
-     # Backend status indicator
-     backend_status = gr.HTML(
-         value=get_backend_status_undefined_html(),
-         elem_id="backend-status-container",
-     )
-     # trigger button to bind the click event
-     status_trigger = gr.Button(elem_id="backend-status-trigger-btn", visible=False)
-     status_trigger.click(
-         fn=lambda: check_backend_health()[1],
-         inputs=None,
-         outputs=backend_status,
-     )
-     # load external JavaScript file
-     js_content = backend_status_js()
-     status_trigger_js_html = f'<script>{js_content}</script>'
-     gr.HTML(status_trigger_js_html, visible=False)
-     demo.load(
-         fn=lambda: check_backend_health()[1],
-         inputs=None,
-         outputs=backend_status,
-     )
      return demo


@@ -480,7 +484,7 @@ if __name__ == "__main__":
      def run_fastapi():
          host = settings.BACKEND_HOST
          port = settings.BACKEND_PORT
-         print(f"Starting FastAPI server on http://{host}:{port}")
+         print("Starting FastAPI server:")
          uvicorn.run(
              app,
              host=host,
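
The column plumbing in `init_leaderboard_tabs` is easier to see in isolation. A toy sketch of how `default_selected` and `column_choices` are derived, with made-up column names standing in for the real `BENCHMARK_COLS` and `NOT_SUPPORTED_COLS` from src/display/utils.py:

```python
import pandas as pd

# Stand-ins for the values app.py imports from src.display.utils.
BENCHMARK_COLS = ["MindCube(acc)", "MMSI(acc)"]
NOT_SUPPORTED_COLS = ["Architecture", "Precision"]

df = pd.DataFrame(columns=["T", "Model", "Average ⬆️", *BENCHMARK_COLS, *NOT_SUPPORTED_COLS])

# Default selection: supported benchmark columns that exist in the frame, plus the average.
default_selected = [c for c in df.columns if c in BENCHMARK_COLS and c not in NOT_SUPPORTED_COLS] + ["Average ⬆️"]

# CheckboxGroup choices: everything except the pinned base columns and unsupported columns.
column_choices = [
    c for c in df.columns
    if c not in ["T", "Model"] and (not NOT_SUPPORTED_COLS or c not in NOT_SUPPORTED_COLS)
]

print(default_selected)  # ['MindCube(acc)', 'MMSI(acc)', 'Average ⬆️']
print(column_choices)    # ['Average ⬆️', 'MindCube(acc)', 'MMSI(acc)']
```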
pyproject.toml CHANGED
@@ -1,7 +1,7 @@
  [project]
- name = "leaderboard"
+ name = "easi-leaderboard"
  version = "0.1.0"
- description = "Leaderboard for Benchmarking LLMs"
+ description = "Leaderboard for EASI: Holistic Evaluation and Analysis for Spatial Intelligence Made Easy"
  readme = "README.md"
  requires-python = ">=3.10,<3.11"

@@ -28,7 +28,9 @@ dependencies = [
  "fastapi>=0.120.0",
  "loguru>=0.7.3",
  "uvicorn>=0.38.0",
+ "tomli>=2.3.0 ; python_full_version < '3.11'",
+ "typing-extensions>=4.15.0",
  ]

  [dependency-groups]
- dev = ["ruff>=0.14.0,<0.15.0"]
+ dev = ["ruff>=0.14.0,<0.15.0", "tabulate"]
requirements.txt CHANGED
@@ -21,3 +21,5 @@ rich>=14.2.0
  fastapi>=0.120.0
  loguru>=0.7.3
  uvicorn>=0.38.0
+ tomli>=2.3.0; python_version < '3.11'
+ typing_extensions>=4.15.0
src/about.py CHANGED
@@ -1,10 +1,15 @@
  from enum import Enum
+ from functools import lru_cache
  from typing import Annotated

  from pydantic import BaseModel, Field

+ from src.prepare import load_meta_toml, prepare_space
+
+ prepare_space()
+
+
- class Task(BaseModel):
+ class _Task(BaseModel):
      benchmark: Annotated[str, Field(description="The benchmark name")]
      metric: Annotated[str, Field(description="The metric name")]
      col_name: Annotated[str, Field(description="The column name")]
@@ -12,59 +17,64 @@ class Task(BaseModel):

  # Select your tasks here
  # ---------------------------------------------------
- class Tasks(Enum):
+ class _Tasks(Enum):
      # task_key in the json file, metric_key in the json file, name to display in the leaderboard

      # acc
-     task1_1 = Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
-     task2_1 = Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
-     task3_1 = Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
-     task4_1 = Task(benchmark="Core", metric="acc", col_name="Core(acc)")
-     task5_1 = Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
-     task6_1 = Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
-     task7_1 = Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
-     task8_1 = Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
+     task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
+     task2_1 = _Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
+     task3_1 = _Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
+     task4_1 = _Task(benchmark="Core", metric="acc", col_name="Core(acc)")
+     task5_1 = _Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
+     task6_1 = _Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
+     task7_1 = _Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
+     task8_1 = _Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")

      # caa
-     task1_2 = Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
-     task2_2 = Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
-     task3_2 = Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
-     task4_2 = Task(benchmark="Core", metric="caa", col_name="Core(caa)")
-     task5_2 = Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
-     task6_2 = Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
-     task7_2 = Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
-     task8_2 = Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
+     task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
+     task2_2 = _Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
+     task3_2 = _Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
+     task4_2 = _Task(benchmark="Core", metric="caa", col_name="Core(caa)")
+     task5_2 = _Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
+     task6_2 = _Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
+     task7_2 = _Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
+     task8_2 = _Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")

      # rand
-     task1_3 = Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
-     task2_3 = Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
-     task3_3 = Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
-     task4_3 = Task(benchmark="Core", metric="rand", col_name="Core(rand)")
-     task5_3 = Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
-     task6_3 = Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
-     task7_3 = Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
-     task8_3 = Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
+     task1_3 = _Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
+     task2_3 = _Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
+     task3_3 = _Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
+     task4_3 = _Task(benchmark="Core", metric="rand", col_name="Core(rand)")
+     task5_3 = _Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
+     task6_3 = _Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
+     task7_3 = _Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
+     task8_3 = _Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
+

- BENCHMARKS = {m.value.benchmark for m in Tasks}
- METRICS = {m.value.metric for m in Tasks}
- COL_NAMES = {m.value.col_name for m in Tasks}
+ # BENCHMARKS = {m.value.benchmark for m in Tasks}
+ # METRICS = {m.value.metric for m in Tasks}
+ # COL_NAMES = {m.value.col_name for m in Tasks}
+ @lru_cache(maxsize=1)
+ def get_benchmarks():
+     meta_toml = load_meta_toml()
+     return sorted(meta_toml.benchmarks)

  NUM_FEWSHOT = 0  # Change with your few shot
  # ---------------------------------------------------


  # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+ TITLE = """<h1 align="center" id="space-title">EASI Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
- Intro text
+ EASI: Holistic Evaluation and Analysis for Spatial Intelligence Made Easy
  """

  # Which evaluations are you running? how can people reproduce what you have?
  LLM_BENCHMARKS_TEXT = """
- ## How it works
+ ## Leaderboard

  ## Reproducibility
  To reproduce our results, here is the commands you can run:
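
Because `get_benchmarks()` is wrapped in `lru_cache(maxsize=1)`, `meta.toml` is parsed once per process and every caller (app.py, src/display/utils.py, src/leaderboard/read_evals.py) receives the same sorted list. A hypothetical call site:

```python
from src.about import get_benchmarks

benchmarks = get_benchmarks()           # first call loads and caches meta.toml
assert get_benchmarks() is benchmarks   # later calls return the cached list object
for b in benchmarks:                    # entries carry at least .key and .title
    print(b.key, b.title)
```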
src/backend/config.py CHANGED
@@ -1,8 +1,9 @@
  from functools import cached_property
+ from pathlib import Path
  from typing import Annotated

  from dotenv import load_dotenv
- from pydantic import Field, SecretStr
+ from pydantic import Field, SecretStr, computed_field
  from pydantic_settings import BaseSettings, SettingsConfigDict

  load_dotenv()
@@ -31,8 +32,26 @@ class Settings(BaseSettings):
          token=token,
      )

-     REQUESTS_REPO_ID: str = "y-playground/requests"
-     RESULTS_REPO_ID: str = "y-playground/results"
+     # Settings for Hugging Face repos
+     HF_OWNER: str = "lmms-lab"
+     HF_REPO_NAME: Annotated[str, Field(description="Name of leaderboard repo")] = "EASI-Leaderboard"
+     HF_RESULTS_REPO_NAME: Annotated[str, Field(description="Name of results repo")] = "EASI-Leaderboard-Results"
+     HF_REQUESTS_REPO_NAME: Annotated[str, Field(description="Name of requests repo")] = "EASI-Leaderboard-Requests"
+
+     @computed_field
+     @cached_property
+     def REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_REPO_NAME).as_posix()
+
+     @computed_field
+     @cached_property
+     def RESULTS_REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_RESULTS_REPO_NAME).as_posix()
+
+     @computed_field
+     @cached_property
+     def QUEUE_REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_REQUESTS_REPO_NAME).as_posix()


  settings = Settings()
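
The repo IDs are built by joining owner and repo name with `pathlib.Path` and serialising with `as_posix()`, which guarantees a forward slash on every platform. The same pattern in isolation, using this commit's default values:

```python
from pathlib import Path

owner, repo_name = "lmms-lab", "EASI-Leaderboard-Requests"
repo_id = (Path(owner) / repo_name).as_posix()
print(repo_id)  # "lmms-lab/EASI-Leaderboard-Requests", with "/" even on Windows
```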
src/backend/routes/hf.py CHANGED
@@ -52,7 +52,7 @@ async def upload_file_content(
      path_or_fileobj=file_obj,
      path_in_repo=params.path_in_repo,
      commit_message=params.commit_message,
-     repo_id=settings.REQUESTS_REPO_ID,
+     repo_id=settings.QUEUE_REPO_ID,
      repo_type="dataset",
  )
  return ResponseData(data=data)
@@ -104,7 +104,7 @@ async def community_submit(
      path_or_fileobj=file_obj,
      path_in_repo=path_in_repo,
      commit_message=params.commit_message,
-     repo_id=settings.REQUESTS_REPO_ID,
+     repo_id=settings.QUEUE_REPO_ID,
      repo_type="dataset",
  )
  return ResponseData(data=data)
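
Both routes now write into the requests dataset identified by `settings.QUEUE_REPO_ID`. Outside FastAPI, the equivalent upload is a single `huggingface_hub` call; a rough sketch with made-up file content and paths:

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # a write token for the owning org
api.upload_file(
    path_or_fileobj=b'{"model": "org/model"}',     # bytes, a path, or a file object
    path_in_repo="org/model_eval_request.json",    # hypothetical path in the dataset
    repo_id="lmms-lab/EASI-Leaderboard-Requests",  # i.e. settings.QUEUE_REPO_ID
    repo_type="dataset",
    commit_message="Add eval request",
)
```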
src/display/formatting.py CHANGED
@@ -9,6 +9,9 @@ def model_hyperlink(link: str, model_name: str) -> str:


  def make_clickable_model(model_name: str) -> str:
+     if "/" not in model_name:
+         # not a full model name, cannot be clicked
+         return model_name
      link = f"https://huggingface.co/{model_name}"
      return model_hyperlink(link, model_name)

@@ -26,7 +29,7 @@ def styled_message(message: str) -> str:


  def has_no_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
-     return df.loc[:, columns].notna().all(axis=1)
+     return df.loc[:, columns].notna().any(axis=1)


  def has_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
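
Two behavioural changes in this file: `make_clickable_model` now leaves bare names without an "org/" prefix unlinked, and `has_no_nan_values` switched from `.all(axis=1)` to `.any(axis=1)`, so a row now survives if it has at least one benchmark score instead of requiring all of them. A small check of the new mask semantics:

```python
import pandas as pd

df = pd.DataFrame({"MindCube(acc)": [55.0, None], "MMSI(acc)": [None, None]})

old_mask = df.notna().all(axis=1)  # previous behaviour: every score required
new_mask = df.notna().any(axis=1)  # new behaviour: one score is enough

print(old_mask.tolist())  # [False, False]  -> partially scored rows were dropped
print(new_mask.tolist())  # [True, False]   -> row 0 is kept with its single score
```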
src/display/utils.py CHANGED
@@ -9,7 +9,7 @@ from typing import Literal, Union
  from pydantic import BaseModel, ConfigDict, create_model
  from typing_extensions import Self

- from src.about import Tasks
+ from src.about import get_benchmarks


  def fields(
@@ -33,6 +33,8 @@ class ColumnContent(BaseModel):
      hidden: bool = False
      never_hidden: bool = False

+     not_supported: bool = False  # for not supported columns, should not be displayed
+
      @classmethod
      def new(
          cls,
@@ -42,6 +44,7 @@ class ColumnContent(BaseModel):
          *,
          hidden: bool = False,
          never_hidden: bool = False,
+         not_supported: bool = False,
      ) -> Self:
          return cls(
              name=name,
@@ -49,6 +52,7 @@ class ColumnContent(BaseModel):
              displayed_by_default=displayed_by_default,
              hidden=hidden,
              never_hidden=never_hidden,
+             not_supported=not_supported,
          )


@@ -56,29 +60,34 @@ class _AutoEvalColumnBase(BaseModel):
      model_config: ConfigDict = ConfigDict(extra="forbid", frozen=True)

      model_type_symbol: ColumnContent = ColumnContent(
-         name="T", type="str", displayed_by_default=True, never_hidden=True
+         name="T",
+         type="str",
+         displayed_by_default=True,
+         # never_hidden=True,
      )
      model: ColumnContent = ColumnContent.new("Model", "markdown", True, never_hidden=True)
      average: ColumnContent = ColumnContent.new("Average ⬆️", "number", True)

      model_type: ColumnContent = ColumnContent.new("Type", "str")
-     architecture: ColumnContent = ColumnContent.new("Architecture", "str")
+     architecture: ColumnContent = ColumnContent.new("Architecture", "str", not_supported=True)
      weight_type: ColumnContent = ColumnContent.new("Weight type", "str", hidden=True)
-     precision: ColumnContent = ColumnContent.new("Precision", "str")
-     license: ColumnContent = ColumnContent.new("Hub License", "str")
-     params: ColumnContent = ColumnContent.new("#Params (B)", "number")
-     likes: ColumnContent = ColumnContent.new("Hub ❤️", "number")
-     still_on_hub: ColumnContent = ColumnContent.new("Available on the hub", "bool")
-     revision: ColumnContent = ColumnContent.new("Model sha", "str")
+     precision: ColumnContent = ColumnContent.new("Precision", "str", not_supported=True)
+     license: ColumnContent = ColumnContent.new("Hub License", "str", not_supported=True)
+     params: ColumnContent = ColumnContent.new("#Params (B)", "number", not_supported=True)
+     likes: ColumnContent = ColumnContent.new("Hub ❤️", "number", not_supported=True)
+     still_on_hub: ColumnContent = ColumnContent.new("Available on the hub", "bool", not_supported=True)
+     revision: ColumnContent = ColumnContent.new("Model sha", "str", not_supported=True)
+

+ BENCHMARKS = get_benchmarks()

  # We use create_model to dynamically fill the scores from Tasks
  field_definitions = {
-     task.name: (
+     task.key: (
          ColumnContent,
-         ColumnContent.new(task.value.col_name, "number", True),
+         ColumnContent.new(task.title, "number", True),
      )
-     for task in Tasks
+     for task in BENCHMARKS
  }
  AutoEvalColumnCls: type[_AutoEvalColumnBase] = create_model(  # pyright: ignore[reportCallIssue]
      '_AutoEvalColumnCls',
@@ -156,9 +165,11 @@ class Precision(Enum):


  # Column selection
- COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
+ # COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
  BASE_COLS: list[str] = [c.name for c in fields(_AutoEvalColumnBase) if not c.hidden]
  EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
  EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
+ NOT_SUPPORTED_COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if c.not_supported]

- BENCHMARK_COLS: list[str] = [t.value.col_name for t in Tasks]
+ # BENCHMARK_COLS: list[str] = [t.value.col_name for t in Tasks]
+ BENCHMARK_COLS: list[str] = [t.title for t in BENCHMARKS]
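
The dynamic column model is the least obvious part of this file: `create_model` stamps one `ColumnContent` field per benchmark onto the base class, and `NOT_SUPPORTED_COLS` is then just a filter over the declared fields. A reduced sketch of the same pattern with two hypothetical benchmarks (the real code uses the `fields()` helper rather than `model_fields` directly):

```python
from pydantic import BaseModel, create_model


class ColumnContent(BaseModel):
    name: str
    type: str = "number"
    not_supported: bool = False


class Base(BaseModel):
    model: ColumnContent = ColumnContent(name="Model", type="markdown")
    precision: ColumnContent = ColumnContent(name="Precision", type="str", not_supported=True)


# One extra field per benchmark, mirroring field_definitions above.
Auto = create_model(
    "Auto",
    __base__=Base,
    mindcube=(ColumnContent, ColumnContent(name="MindCube(acc)")),
    mmsi=(ColumnContent, ColumnContent(name="MMSI(acc)")),
)

cols = [f.default for f in Auto.model_fields.values()]
print([c.name for c in cols if c.not_supported])  # ['Precision']
```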
src/envs.py CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
  from typing import Annotated

  from huggingface_hub import HfApi
- from pydantic import Field, computed_field
+ from pydantic import Field, SecretStr, computed_field
  from pydantic_settings import BaseSettings, SettingsConfigDict

  # ----------------------------------
@@ -14,64 +14,64 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
  class Settings(BaseSettings):
      model_config = SettingsConfigDict(env_file=".env")

-     TOKEN: Annotated[str, Field(..., alias="HF_TOKEN", description="A read/write token for your org")]
+     HF_TOKEN: Annotated[SecretStr, Field(..., description="A read/write token for your org")]

-     # Change to your org - don't forget to create a results and request dataset, with the correct format!
-     OWNER: Annotated[
-         str,
-         Field("y-playground"),
-     ]
-
-     BACKEND_HOST: Annotated[str, Field("127.0.0.1", description="Backend host")]
-     BACKEND_PORT: Annotated[int, Field(8000, description="Backend port")]
+     # Settings for Hugging Face repos
+     HF_OWNER: str = "lmms-lab"
+     HF_REPO_NAME: Annotated[str, Field(description="Name of leaderboard repo")] = "EASI-Leaderboard"
+     HF_RESULTS_REPO_NAME: Annotated[str, Field(description="Name of results repo")] = "EASI-Leaderboard-Results"
+     HF_REQUESTS_REPO_NAME: Annotated[str, Field(description="Name of requests repo")] = "EASI-Leaderboard-Requests"

      @computed_field
      @cached_property
      def REPO_ID(self) -> str:
-         return (Path(self.OWNER) / "leaderboard").as_posix()
+         return (Path(self.HF_OWNER) / self.HF_REPO_NAME).as_posix()

      @computed_field
      @cached_property
-     def QUEUE_REPO(self) -> str:
-         return (Path(self.OWNER) / "requests").as_posix()
+     def RESULTS_REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_RESULTS_REPO_NAME).as_posix()

      @computed_field
      @cached_property
-     def RESULTS_REPO(self) -> str:
-         return (Path(self.OWNER) / "results").as_posix()
-
-     CACHE_PATH: Annotated[
-         str,
-         Field(".", alias="HF_HOME", description="If you setup a cache later, just change `HF_HOME`"),
+     def QUEUE_REPO_ID(self) -> str:
+         return (Path(self.HF_OWNER) / self.HF_REQUESTS_REPO_NAME).as_posix()
+
+     HF_HOME: Annotated[
+         Path,
+         Field(
+             default_factory=lambda: Path(".").resolve(),
+             description="If you setup a cache later, just change `HF_HOME`",
+         ),
      ]

+     # Backend settings
+
+     BACKEND_HOST: Annotated[str, Field("127.0.0.1", description="Backend host")]
+     BACKEND_PORT: Annotated[int, Field(8000, description="Backend port")]
+
      # Local caches

      @computed_field
      @cached_property
      def EVAL_REQUESTS_PATH(self) -> str:
-         return (Path(self.CACHE_PATH) / "eval-queue").as_posix()
+         return (self.HF_HOME / "eval-queue").as_posix()

      @computed_field
      @cached_property
      def EVAL_RESULTS_PATH(self) -> str:
-         return (Path(self.CACHE_PATH) / "eval-results").as_posix()
-
-     @computed_field
-     @cached_property
-     def EVAL_REQUESTS_PATH_BACKEND(self) -> str:
-         return (Path(self.CACHE_PATH) / "eval-queue-bk").as_posix()
+         return (self.HF_HOME / "eval-results").as_posix()

      @computed_field
      @cached_property
-     def EVAL_RESULTS_PATH_BACKEND(self) -> str:
-         return (Path(self.CACHE_PATH) / "eval-results-bk").as_posix()
+     def EVAL_REQUESTS_PATH_BACKUP(self) -> str:
+         return (self.HF_HOME / "eval-queue-bk").as_posix()

      @computed_field
      @cached_property
-     def API(self) -> HfApi:
-         return HfApi(token=self.TOKEN)
+     def EVAL_RESULTS_PATH_BACKUP(self) -> str:
+         return (self.HF_HOME / "eval-results-bk").as_posix()


  settings = Settings()  # pyright: ignore[reportCallIssue]
- API = settings.API
+ API = HfApi(token=settings.HF_TOKEN.get_secret_value())
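
`HF_TOKEN` is now a `SecretStr`, so it prints masked in the `settings.model_dump()` log line in app.py and has to be unwrapped explicitly where the raw token is needed, as the `HfApi` construction above does. A small illustration:

```python
from pydantic import SecretStr

token = SecretStr("hf_example_token")
print(token)                     # ********** (safe to log)
print(token.get_secret_value())  # hf_example_token (only where an API call needs it)
```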
src/leaderboard/read_evals.py CHANGED
@@ -15,10 +15,14 @@ import numpy as np
  from pydantic import BaseModel, ConfigDict, Field
  from typing_extensions import Self

+ from src.about import get_benchmarks
  from src.display.formatting import make_clickable_model
- from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+ from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType
+ from src.prepare import load_meta_toml
  from src.submission.check_validity import is_model_on_hub

+ BENCHMARKS = get_benchmarks()
+

  class EvalResultJson(BaseModel):
      """Model of the eval result json file."""
@@ -34,8 +38,8 @@ class EvalResultJson_Config(BaseModel):

      model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)

-     model_dtype: Annotated[str, Field(..., description="The model precision. e.g. torch.bfloat16")]
      model_name: Annotated[str, Field(..., description="The model name. e.g. Qwen/Qwen2.5-3B")]
+     model_dtype: Annotated[str | None, Field(description="The model precision. e.g. torch.bfloat16")] = None
      model_sha: Annotated[str, Field(description="The model sha. e.g. 3aab1f1954e9cc14eb9509a215f9e5ca08227a9b")] = ""
      model_args: Annotated[str | None, Field(description="The model args.")] = None

@@ -70,6 +74,7 @@ class EvalResult(BaseModel):
          precision = Precision.from_str(config.model_dtype)

          # Get model and org
+
          org_and_model = config.model_name or config.model_args or ""
          org_and_model = org_and_model.split("/", 1)

@@ -83,6 +88,10 @@ class EvalResult(BaseModel):
          result_key = f"{org}_{model}_{precision.value.name}"
          full_model = "/".join(org_and_model)

+         meta_toml = load_meta_toml()
+         # update full_model from meta_toml if it exists
+         full_model = meta_toml.model_title_to_repo_id.get(full_model, full_model)
+
          still_on_hub, _, model_config = is_model_on_hub(
              full_model, config.model_sha or "main", trust_remote_code=True, test_tokenizer=False
          )
@@ -94,16 +103,15 @@ class EvalResult(BaseModel):

          # Extract results available in this file (some results are split in several files)
          results: dict[str, float] = {}
-         for t in Tasks:
-             task = t.value
-
+         for task in BENCHMARKS:
              # We average all scores of a given metric (not all metrics are present in all files)
-             accs = np.array([v.get(task.metric, None) for k, v in data.results.items() if task.benchmark == k])
+             # TODO: support multiple metrics
+             accs = np.array([v.get("acc", None) for k, v in data.results.items() if task.key == k])
              if accs.size == 0 or any(acc is None for acc in accs):
                  continue

              mean_acc = np.mean(accs) * 100.0
-             results[task.benchmark] = float(mean_acc)
+             results[task.title] = float(mean_acc)

          return cls.model_validate({
              "eval_name": result_key,
@@ -119,6 +127,8 @@ class EvalResult(BaseModel):

      def update_with_request_file(self, requests_path: str) -> None:
          """Finds the relevant request file for the current model and updates info with it"""
+         # TODO: do nothing for now
+         return
          request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

          try:
@@ -137,7 +147,7 @@ class EvalResult(BaseModel):

      def to_dict(self) -> dict:
          """Converts the Eval Result to a dict compatible with our dataframe display"""
-         average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
+         average = sum(v for v in self.results.values() if v is not None) / len(BENCHMARKS)
          data_dict = {
              "eval_name": self.eval_name,  # not a column, just a save name,
              AutoEvalColumn.precision.name: self.precision.value.name,
@@ -154,8 +164,8 @@ class EvalResult(BaseModel):
              AutoEvalColumn.still_on_hub.name: self.still_on_hub,
          }

-         for task in Tasks:
-             data_dict[task.value.col_name] = self.results[task.value.benchmark]
+         for task in BENCHMARKS:
+             data_dict[task.title] = self.results.get(task.title, None)

          return data_dict

@@ -181,8 +191,6 @@ def get_request_file_for_model(requests_path, model_name, precision) -> str:

  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
      """From the path of the results folder root, extract all needed info for results"""
-     from rich import print as rprint  # FIXME: DEBUG
-
      model_result_filepaths: list[str] = []

      for root, _, files in os.walk(results_path):
@@ -208,7 +216,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
      # Store results of same eval together
      eval_name = eval_result.eval_name
      if eval_name in eval_results.keys():
-         eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+         results_loaded = {k: v for k, v in eval_result.results.items() if v is not None}
+         eval_results[eval_name].results.update(results_loaded)
      else:
          eval_results[eval_name] = eval_result
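
The score extraction now matches `task.key` against the keys of the raw results dict and stores the percentage under `task.title`; only the "acc" metric is read for now (see the TODO above). A toy walk-through of that loop with made-up data:

```python
import numpy as np

# Made-up shapes: result keys are benchmark keys, values are metric dicts.
raw_results = {"mindcube": {"acc": 0.5}, "mmsi": {"acc": 0.25}}
benchmarks = [("mindcube", "MindCube(acc)"), ("mmsi", "MMSI(acc)"), ("stare", "STARE(acc)")]

results: dict[str, float] = {}
for key, title in benchmarks:
    accs = np.array([v.get("acc", None) for k, v in raw_results.items() if key == k])
    if accs.size == 0 or any(a is None for a in accs):
        continue  # benchmark missing from this file; another result file may fill it in
    results[title] = float(np.mean(accs) * 100.0)

print(results)  # {'MindCube(acc)': 50.0, 'MMSI(acc)': 25.0}; STARE is skipped
```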
src/populate.py CHANGED
@@ -60,7 +60,7 @@ def get_leaderboard_df(
      df = df.loc[:, cols].round(decimals=2)

      # filter out if any of the benchmarks have not been produced
-     df = df[has_no_nan_values(df, benchmark_cols)]
+     df = df.loc[has_no_nan_values(df, benchmark_cols), :]
      return df

src/prepare.py ADDED
@@ -0,0 +1,137 @@
+ import os
+ import sys
+ from functools import cached_property, lru_cache
+ from pathlib import Path
+
+ from huggingface_hub import snapshot_download
+ from loguru import logger
+ from pydantic import BaseModel, ConfigDict
+ from typing_extensions import Self
+
+ from src.envs import API, settings
+
+ if sys.version_info < (3, 11):
+     from tomli import load as toml_load
+ else:
+     from tomllib import load as toml_load
+
+ PREPARED_FLAG: bool = os.getenv("NO_DOWNLOAD", 0) == 1
+
+
+ def prepare_space():
+     """Space initialisation"""
+
+     def _restart_space():
+         API.restart_space(repo_id=settings.REPO_ID)
+
+     global PREPARED_FLAG
+     if not PREPARED_FLAG:
+         try:
+             snapshot_download(
+                 repo_id=settings.QUEUE_REPO_ID,
+                 local_dir=settings.EVAL_REQUESTS_PATH,
+                 repo_type="dataset",
+                 tqdm_class=None,
+                 etag_timeout=30,
+                 token=settings.HF_TOKEN.get_secret_value(),
+             )
+         except Exception as e:
+             logger.error(f"Error downloading eval queue: {e!s}")
+             _restart_space()
+         try:
+             snapshot_download(
+                 repo_id=settings.RESULTS_REPO_ID,
+                 local_dir=settings.EVAL_RESULTS_PATH,
+                 repo_type="dataset",
+                 tqdm_class=None,
+                 etag_timeout=30,
+                 allow_patterns=["leaderboard/*.toml", "leaderboard/**/*.json"],
+                 token=settings.HF_TOKEN.get_secret_value(),
+             )
+         except Exception as e:
+             logger.error(f"Error downloading eval queue: {e!s}")
+             _restart_space()
+         PREPARED_FLAG = True
+
+     load_meta_toml()
+
+
+ class MetaToml(BaseModel):
+     model_config = ConfigDict(extra="allow", frozen=True)
+
+     models: list["MetaToml_Model"]
+     benchmarks: list["MetaToml_Benchmark"]
+     model_repos: list["MetaToml_ModelRepo"]
+
+     @cached_property
+     def model_title_to_key(self) -> dict[str, str]:
+         return {model.title: model.key for model in self.models}
+
+     @cached_property
+     def benchmark_title_to_key(self) -> dict[str, str]:
+         return {benchmark.title: benchmark.key for benchmark in self.benchmarks}
+
+     @cached_property
+     def model_key_to_repo_id(self) -> dict[str, str]:
+         return {model.key: model.repo_id for model in self.model_repos if model.repo_id is not None}
+
+     @cached_property
+     def model_title_to_repo_id(self) -> dict[str, str]:
+         mapping: dict[str, str] = {}
+         for model in self.models:
+             model_key = self.model_title_to_key.get(model.title)
+             if model_key:
+                 model_repo_id = self.model_key_to_repo_id.get(model_key)
+                 if model_repo_id:
+                     mapping[model.title] = model_repo_id
+         return mapping
+
+
+ class _HashableComparableMixin(BaseModel):
+     model_config = ConfigDict(extra="allow", frozen=True)
+
+     key: str
+     title: str
+
+     def __hash__(self) -> int:
+         return hash(self.key)
+
+     def __eq__(self, other: Self) -> bool:
+         return (self.key, self.title) == (other.key, other.title)
+
+     def __lt__(self, other: Self) -> bool:
+         return (self.key, self.title) < (other.key, other.title)
+
+     def __gt__(self, other: Self) -> bool:
+         return (self.key, self.title) > (other.key, other.title)
+
+     def __le__(self, other: Self) -> bool:
+         return (self.key, self.title) <= (other.key, other.title)
+
+     def __ge__(self, other: Self) -> bool:
+         return (self.key, self.title) >= (other.key, other.title)
+
+
+ class MetaToml_Benchmark(_HashableComparableMixin): ...
+
+
+ class MetaToml_Model(_HashableComparableMixin): ...
+
+
+ class MetaToml_ModelRepo(BaseModel):
+     model_config = ConfigDict(extra="allow", frozen=True)
+
+     key: str
+     repo_id: str | None = None
+
+
+ @lru_cache(maxsize=1)
+ def load_meta_toml() -> MetaToml:
+     meta_toml_path = Path(settings.EVAL_RESULTS_PATH) / "leaderboard" / "meta.toml"
+     logger.info(f'Loading meta.toml from: {meta_toml_path.as_posix()!r}')
+     with meta_toml_path.open("rb") as f:
+         data = toml_load(f)
+     meta_toml = MetaToml.model_validate(data)
+     logger.info("Loaded meta.toml")
+     assert meta_toml is not None, f"Failed to load meta.toml: {meta_toml_path.as_posix()!r}"
+     return meta_toml
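
`load_meta_toml` expects `leaderboard/meta.toml` inside the results snapshot to provide `models`, `benchmarks`, and `model_repos` arrays whose entries carry `key`/`title` (and, for repos, an optional `repo_id`). The actual file is not part of this commit, so the shape below is only an illustrative guess validated against the `MetaToml` schema; note that importing `src.prepare` pulls in `src.envs`, which needs `HF_TOKEN` to be resolvable:

```python
import tomllib  # Python 3.11+; src/prepare.py itself falls back to tomli on 3.10

from src.prepare import MetaToml

SAMPLE = """
[[benchmarks]]
key = "mindcube"
title = "MindCube(acc)"

[[models]]
key = "qwen2_5_vl_3b"
title = "Qwen2.5-VL-3B"

[[model_repos]]
key = "qwen2_5_vl_3b"
repo_id = "Qwen/Qwen2.5-VL-3B-Instruct"
"""

meta = MetaToml.model_validate(tomllib.loads(SAMPLE))
print(meta.model_title_to_repo_id)  # {'Qwen2.5-VL-3B': 'Qwen/Qwen2.5-VL-3B-Instruct'}
```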
src/submission/submit.py CHANGED
@@ -53,14 +53,14 @@ def add_new_eval(
      # Is the model on the hub?
      if weight_type in ["Delta", "Adapter"]:
          base_model_on_hub, error, _ = is_model_on_hub(
-             model_name=base_model, revision=revision, token=settings.TOKEN, test_tokenizer=True
+             model_name=base_model, revision=revision, token=settings.HF_TOKEN.get_secret_value(), test_tokenizer=True
          )
          if not base_model_on_hub:
              return styled_error(f'Base model "{base_model}" {error}')

      if not weight_type == "Adapter":
          model_on_hub, error, _ = is_model_on_hub(
-             model_name=model, revision=revision, token=settings.TOKEN, test_tokenizer=True
+             model_name=model, revision=revision, token=settings.HF_TOKEN.get_secret_value(), test_tokenizer=True
          )
          if not model_on_hub:
              return styled_error(f'Model "{model}" {error}')
@@ -117,7 +117,7 @@ def add_new_eval(
      API.upload_file(
          path_or_fileobj=out_path,
          path_in_repo=out_path.split("eval-queue/")[1],
-         repo_id=settings.QUEUE_REPO,
+         repo_id=settings.QUEUE_REPO_ID,
          repo_type="dataset",
          commit_message=f"Add {model} to eval queue",
      )
uv.lock CHANGED
@@ -356,6 +356,76 @@ wheels = [
356
  { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" },
357
  ]
358
 
 
359
  [[package]]
360
  name = "exceptiongroup"
361
  version = "1.3.0"
@@ -667,68 +737,6 @@ wheels = [
667
  { url = "https://files.pythonhosted.org/packages/f9/1c/5d4d468fb16f8410e596ed0eac02d2c68752aa7dc92997fe9d60a7147665/kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb", size = 73744, upload-time = "2025-08-10T21:27:42.254Z" },
668
  ]
669
 
670
- [[package]]
671
- name = "leaderboard"
672
- version = "0.1.0"
673
- source = { virtual = "." }
674
- dependencies = [
675
- { name = "apscheduler" },
676
- { name = "datasets" },
677
- { name = "fastapi" },
678
- { name = "gradio", extra = ["oauth"] },
679
- { name = "gradio-client" },
680
- { name = "gradio-leaderboard" },
681
- { name = "huggingface-hub" },
682
- { name = "loguru" },
683
- { name = "matplotlib" },
684
- { name = "numpy" },
685
- { name = "pandas" },
686
- { name = "pydantic" },
687
- { name = "pydantic-settings" },
688
- { name = "python-dateutil" },
689
- { name = "python-dotenv" },
690
- { name = "rich" },
691
- { name = "sentencepiece" },
692
- { name = "tokenizers" },
693
- { name = "tqdm" },
694
- { name = "transformers" },
695
- { name = "uvicorn" },
696
- ]
697
-
698
- [package.dev-dependencies]
699
- dev = [
700
- { name = "ruff" },
701
- ]
702
-
703
- [package.metadata]
704
- requires-dist = [
705
- { name = "apscheduler" },
706
- { name = "datasets" },
707
- { name = "fastapi", specifier = ">=0.120.0" },
708
- { name = "gradio" },
709
- { name = "gradio", extras = ["oauth"] },
710
- { name = "gradio-client" },
711
- { name = "gradio-leaderboard", specifier = "==0.0.13" },
712
- { name = "huggingface-hub", specifier = ">=0.18.0" },
713
- { name = "loguru", specifier = ">=0.7.3" },
714
- { name = "matplotlib" },
715
- { name = "numpy" },
716
- { name = "pandas" },
717
- { name = "pydantic", specifier = ">=2.11.10" },
718
- { name = "pydantic-settings", specifier = ">=2.11.0" },
719
- { name = "python-dateutil" },
720
- { name = "python-dotenv", specifier = ">=1.2.1" },
721
- { name = "rich", specifier = ">=14.2.0" },
722
- { name = "sentencepiece" },
723
- { name = "tokenizers", specifier = ">=0.15.0" },
724
- { name = "tqdm" },
725
- { name = "transformers" },
726
- { name = "uvicorn", specifier = ">=0.38.0" },
727
- ]
728
-
729
- [package.metadata.requires-dev]
730
- dev = [{ name = "ruff", specifier = ">=0.14.0,<0.15.0" }]
731
-
732
  [[package]]
733
  name = "loguru"
734
  version = "0.7.3"
@@ -1324,6 +1332,15 @@ wheels = [
1324
  { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
1325
  ]
1326
 
 
 
 
 
 
 
 
 
 
1327
  [[package]]
1328
  name = "tokenizers"
1329
  version = "0.22.1"
@@ -1349,6 +1366,15 @@ wheels = [
1349
  { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" },
1350
  ]
1351
 
 
 
 
 
 
 
 
 
 
1352
  [[package]]
1353
  name = "tomlkit"
1354
  version = "0.13.3"
 
356
  { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" },
357
  ]
358
 
359
+ [[package]]
360
+ name = "easi-leaderboard"
361
+ version = "0.1.0"
362
+ source = { virtual = "." }
363
+ dependencies = [
364
+ { name = "apscheduler" },
365
+ { name = "datasets" },
366
+ { name = "fastapi" },
367
+ { name = "gradio", extra = ["oauth"] },
368
+ { name = "gradio-client" },
369
+ { name = "gradio-leaderboard" },
370
+ { name = "huggingface-hub" },
371
+ { name = "loguru" },
372
+ { name = "matplotlib" },
373
+ { name = "numpy" },
374
+ { name = "pandas" },
375
+ { name = "pydantic" },
376
+ { name = "pydantic-settings" },
377
+ { name = "python-dateutil" },
378
+ { name = "python-dotenv" },
379
+ { name = "rich" },
380
+ { name = "sentencepiece" },
381
+ { name = "tokenizers" },
382
+ { name = "tomli" },
383
+ { name = "tqdm" },
384
+ { name = "transformers" },
385
+ { name = "typing-extensions" },
386
+ { name = "uvicorn" },
387
+ ]
388
+
389
+ [package.dev-dependencies]
390
+ dev = [
391
+ { name = "ruff" },
392
+ { name = "tabulate" },
393
+ ]
394
+
395
+ [package.metadata]
396
+ requires-dist = [
397
+ { name = "apscheduler" },
398
+ { name = "datasets" },
399
+ { name = "fastapi", specifier = ">=0.120.0" },
400
+ { name = "gradio" },
401
+ { name = "gradio", extras = ["oauth"] },
402
+ { name = "gradio-client" },
403
+ { name = "gradio-leaderboard", specifier = "==0.0.13" },
404
+ { name = "huggingface-hub", specifier = ">=0.18.0" },
405
+ { name = "loguru", specifier = ">=0.7.3" },
406
+ { name = "matplotlib" },
407
+ { name = "numpy" },
408
+ { name = "pandas" },
409
+ { name = "pydantic", specifier = ">=2.11.10" },
410
+ { name = "pydantic-settings", specifier = ">=2.11.0" },
411
+ { name = "python-dateutil" },
412
+ { name = "python-dotenv", specifier = ">=1.2.1" },
413
+ { name = "rich", specifier = ">=14.2.0" },
414
+ { name = "sentencepiece" },
415
+ { name = "tokenizers", specifier = ">=0.15.0" },
416
+ { name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.3.0" },
417
+ { name = "tqdm" },
418
+ { name = "transformers" },
419
+ { name = "typing-extensions", specifier = ">=4.15.0" },
420
+ { name = "uvicorn", specifier = ">=0.38.0" },
421
+ ]
422
+
423
+ [package.metadata.requires-dev]
424
+ dev = [
425
+ { name = "ruff", specifier = ">=0.14.0,<0.15.0" },
426
+ { name = "tabulate" },
427
+ ]
428
+
429
  [[package]]
430
  name = "exceptiongroup"
431
  version = "1.3.0"
 
737
  { url = "https://files.pythonhosted.org/packages/f9/1c/5d4d468fb16f8410e596ed0eac02d2c68752aa7dc92997fe9d60a7147665/kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb", size = 73744, upload-time = "2025-08-10T21:27:42.254Z" },
738
  ]
739
 
 
740
  [[package]]
741
  name = "loguru"
742
  version = "0.7.3"
 
1332
  { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
1333
  ]
1334
 
1335
+ [[package]]
1336
+ name = "tabulate"
1337
+ version = "0.9.0"
1338
+ source = { registry = "https://pypi.org/simple" }
1339
+ sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
1340
+ wheels = [
1341
+ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
1342
+ ]
1343
+
1344
  [[package]]
1345
  name = "tokenizers"
1346
  version = "0.22.1"
 
1366
  { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" },
1367
  ]
1368
 
1369
+ [[package]]
1370
+ name = "tomli"
1371
+ version = "2.3.0"
1372
+ source = { registry = "https://pypi.org/simple" }
1373
+ sdist = { url = "https://files.pythonhosted.org/packages/52/ed/3f73f72945444548f33eba9a87fc7a6e969915e7b1acc8260b30e1f76a2f/tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549", size = 17392, upload-time = "2025-10-08T22:01:47.119Z" }
1374
+ wheels = [
1375
+ { url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408, upload-time = "2025-10-08T22:01:46.04Z" },
1376
+ ]
1377
+
1378
  [[package]]
1379
  name = "tomlkit"
1380
  version = "0.13.3"