anonymoussssssss committed
Commit 672654b · verified · 1 Parent(s): b10a768

Initial commit
Makefile ADDED
@@ -0,0 +1,13 @@
+.PHONY: style quality
+
+
+style:
+	python -m black --line-length 119 .
+	python -m isort .
+	ruff check --fix .
+
+
+quality:
+	python -m black --check --line-length 119 .
+	python -m isort --check-only .
+	ruff check .
README.md CHANGED
@@ -1,14 +1,46 @@
 ---
-title: ThaiSafetyBench Leaderboard
-emoji: 🌖
-colorFrom: indigo
-colorTo: pink
+title: ThaiSafetyBench
+emoji: 🥇
+colorFrom: green
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.38.2
 app_file: app.py
-pinned: false
+pinned: true
 license: mit
-short_description: Safety leaderboard tailored to Thai language and culture
+short_description: ThaiSafetyBench Leaderboard
+sdk_version: 5.19.0
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Start the configuration
+
+Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the repository paths with your leaderboard's) and `src/about.py` (for tasks).
+
+Results files should have the following format and be stored as JSON files:
+```json
+{
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
+}
+```
+
+Request files are created automatically by this tool.
+
+If you encounter a problem on the Space, don't hesitate to restart it to remove the created `eval-queue`, `eval-queue-bk`, `eval-results`, and `eval-results-bk` folders.
+
+# Code logic for more complex edits
+
+You'll find:
+- the main table's column names and properties in `src/display/utils.py`
+- the logic to read all results and request files and convert them into dataframe rows in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
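To make the expected schema concrete, here is a minimal sketch of producing such a results file in Python. The model name, scores, and file name are hypothetical; note that `src/leaderboard/read_evals.py` below also reads optional `params`, `developer`, and `model_type` fields from the `config` block, and the task keys must match the benchmarks defined in `src/about.py`.

```python
import json

# Hypothetical example values; replace with your own model metadata and scores.
result = {
    "config": {
        "model_dtype": "torch.bfloat16",
        "model_name": "my-org/my-model",   # path of the model on the Hub
        "model_sha": "main",               # revision on the Hub
        "params": 7,                       # also read by src/leaderboard/read_evals.py
        "developer": "my-org",
        "model_type": "Base",
    },
    "results": {
        "overall": {"asr": 12.3},          # task keys must match src/about.py benchmarks
        "Malicious Uses": {"asr": 8.7},
    },
}

with open("my-org_my-model_results.json", "w", encoding="utf-8") as fp:
    json.dump(result, fp, ensure_ascii=False, indent=2)
```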
app.py ADDED
@@ -0,0 +1,134 @@
+import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+    SUBMIT_TEXT
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    AutoEvalColumn,
+    fields,
+)
+from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_leaderboard_df
+
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+
+### Space initialisation
+try:
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
+
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+
+def init_leaderboard(dataframe, css):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    with gr.Blocks(css=css) as app:
+        # Title
+        gr.Markdown("# Leaderboard")
+
+        # Select Columns - Full width, as CheckboxGroup
+        select_columns = gr.CheckboxGroup(
+            label="Select Columns to Display:",
+            choices=[c.name for c in fields(AutoEvalColumn)],
+            value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            elem_id="select-columns"
+        )
+
+        # Search Columns - Full width
+        search_columns = gr.Textbox(
+            label="Search",
+            placeholder=f"Search in {', '.join([AutoEvalColumn.model_name.name])}...",
+            lines=1,
+            elem_id="search-columns"
+        )
+
+        # Initialize DataFrame with only default-selected columns
+        default_columns = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]
+        initial_dataframe = dataframe[default_columns].copy()
+
+        # Leaderboard Component
+        leaderboard = gr.Dataframe(
+            value=initial_dataframe,
+            datatype=[c.type for c in fields(AutoEvalColumn) if c.name in default_columns],
+            headers=default_columns,
+            wrap=True,
+            interactive=False,
+            max_height=800
+        )
+
+        # Update function
+        def update_leaderboard(search, selected_cols):
+            df = dataframe.copy()
+            # Apply search
+            if search:
+                df = df[df[AutoEvalColumn.model_name.name].str.contains(search, case=False, na=False)]
+            # Filter columns to display
+            visible_cols = [col for col in selected_cols if col in df.columns]
+            df = df[visible_cols]
+            return df
+
+        # Connect inputs to update leaderboard
+        search_columns.change(
+            fn=update_leaderboard,
+            inputs=[search_columns, select_columns],
+            outputs=leaderboard
+        )
+        select_columns.change(
+            fn=update_leaderboard,
+            inputs=[search_columns, select_columns],
+            outputs=leaderboard
+        )
+
+    return app
+
+
+demo = gr.Blocks(fill_height=False, css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF, css=custom_css)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                gr.Markdown(SUBMIT_TEXT, elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Accordion("📙 Citation", open=True):
+                    citation_button = gr.Textbox(
+                        value=CITATION_BUTTON_TEXT,
+                        label=CITATION_BUTTON_LABEL,
+                        lines=10,
+                        elem_id="citation-button",
+                        show_copy_button=True,
+                    )
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
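The interactive behaviour above hinges on the `update_leaderboard` callback: every change to the search box or the column checkboxes re-filters the full DataFrame and returns the slice to display. A standalone sketch of that filtering logic, with toy data and illustrative column names, behaves like this:

```python
import pandas as pd

# Toy data standing in for LEADERBOARD_DF; rows and column names are illustrative only.
df = pd.DataFrame(
    {
        "Model": ["org-a/model-x", "org-b/model-y"],
        "🥇 Overall ASR ⬇️": [12.3, 25.1],
    }
)

def update_leaderboard(search: str, selected_cols: list[str]) -> pd.DataFrame:
    out = df.copy()
    if search:  # case-insensitive substring match on the model column
        out = out[out["Model"].str.contains(search, case=False, na=False)]
    visible = [c for c in selected_cols if c in out.columns]  # keep only selected columns
    return out[visible]

# Returns the single matching row with both columns visible
print(update_leaderboard("model-x", ["Model", "🥇 Overall ASR ⬇️"]))
```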
pyproject.toml ADDED
@@ -0,0 +1,13 @@
+[tool.ruff]
+# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+select = ["E", "F"]
+ignore = ["E501"] # line too long (black is taking care of this)
+line-length = 119
+fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+
+[tool.isort]
+profile = "black"
+line_length = 119
+
+[tool.black]
+line-length = 119
requirements.txt ADDED
@@ -0,0 +1,12 @@
+APScheduler
+black
+datasets
+gradio
+gradio[oauth]
+gradio_leaderboard==0.0.13
+gradio_client
+huggingface-hub>=0.18.0
+matplotlib
+numpy
+pandas
+python-dateutil
src/about.py ADDED
@@ -0,0 +1,80 @@
+from dataclasses import dataclass
+from enum import Enum
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+
+# Select your tasks here
+# ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("overall", "asr", "🥇 Overall ASR ⬇️")
+    task1 = Task("Discrimination, Exclusion, Toxicity, Hateful, Offensive", "asr", "👉 Discrimination, Exclusion, Toxicity, Hateful, Offensive ASR ⬇️")
+    task2 = Task("Human-Chatbot Interaction Harms", "asr", "👉 Human-Chatbot Interaction Harm ASR ⬇️")
+    task3 = Task("Information Hazards", "asr", "👉 Information Hazards ASR ⬇️")
+    task4 = Task("Malicious Uses", "asr", "👉 Malicious Uses ASR ⬇️")
+    task5 = Task("Misinformation Harms", "asr", "👉 Misinformation Harms ASR ⬇️")
+    task6 = Task("Thai Socio-Cultural Harm", "asr", "👉 Thai Socio-Cultural Harms ASR ⬇️")
+    task7 = Task("Thai culture related attack", "asr", "🔶 Thai Culture Related Attack ASR ⬇️")
+    task8 = Task("General prompt attack", "asr", "🔶 General Prompt Attack ⬇️")
+
+NUM_FEWSHOT = 0  # Change with your few-shot setting
+# ---------------------------------------------------
+
+
+
+# Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">ThaiSafetyBench Leaderboard 🥇</h1>"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+ThaiSafetyBench is a safety benchmark tailored to the Thai language and culture.
+"""
+
+# Which evaluations are you running? How can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## How it works
+
+We evaluate models on the ThaiSafetyBench benchmark, which consists of various tasks related to safety and
+harmful content in the Thai language and culture. The evaluation is performed using the ThaiSafetyBench dataset,
+which includes a range of scenarios designed to assess the model's ability to handle sensitive topics,
+discrimination, misinformation, and other harmful content. The automatic evaluation is conducted using the GPT-4o model as a judge.
+We report the Attack Success Rate (ASR) for each task, which indicates the model's vulnerability to harmful content.
+We categorize the tasks into two groups: Thai Culture-Related Attacks, which evaluate the model's ability to handle content specific to Thai culture, including its norms, values, and sensitivities, and General Prompt Attacks, which assess the model's capacity to manage broadly harmful content that, while not unique to Thai culture, remains relevant in a wider context.
+
+## Reproducibility
+
+To reproduce our results, we provide the automatic evaluation code in our GitHub repository. You can run the evaluation on your own models by following these steps:
+
+1. Generate your model's responses on the ThaiSafetyBench dataset with the temperature set to 0.1
+2. Use the provided evaluation script to score the responses with the GPT-4o model as a judge
+
+## Developers and Maintainers
+
+<Anonymous due to paper submission policy>
+"""
+
+SUBMIT_TEXT = """
+We welcome submissions of new models to the ThaiSafetyBench leaderboard via email; however, due to the paper submission anonymity policy, we cannot accept submissions at this time.
+```
+Subject: [Your Model Name] ThaiSafetyBench Model Submission
+Content:
+- Model name
+- Developer
+- Parameters (in billions)
+- Model type (Base or CPT)
+- Base model name (if the model is a CPT; otherwise leave empty)
+- Release date (YYYY-MM)
+- How to run the model (Python code to generate responses if the model is on the Hugging Face Hub; otherwise, a code snippet to run the model and generate responses)
+- Contact email (for us to contact you about the evaluation results)
+```
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
+coming soon...
+"""
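`LLM_BENCHMARKS_TEXT` describes the reported metric, the Attack Success Rate (ASR). The leaderboard only displays precomputed scores, but as a rough sketch of how such a rate is derived — assuming one binary judge verdict per prompt, which is my simplification rather than the repository's actual evaluation code:

```python
def attack_success_rate(judge_verdicts: list[bool]) -> float:
    """ASR = percentage of prompts for which the judge deems the attack successful.

    `judge_verdicts` is a hypothetical list of per-prompt booleans
    (True = the model produced harmful content).
    """
    if not judge_verdicts:
        return 0.0
    return 100.0 * sum(judge_verdicts) / len(judge_verdicts)


# e.g. 2 successful attacks out of 8 prompts -> 25.0
print(attack_success_rate([True, False, False, True, False, False, False, False]))
```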
src/display/css_html_js.py ADDED
@@ -0,0 +1,89 @@
+custom_css = """
+
+.markdown-text {
+    font-size: 16px !important;
+}
+
+#models-to-add-text {
+    font-size: 18px !important;
+}
+
+#citation-button span {
+    font-size: 16px !important;
+}
+
+#citation-button textarea {
+    font-size: 16px !important;
+}
+
+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
+}
+
+#leaderboard-table {
+    margin-top: 15px
+}
+
+#leaderboard-table-lite {
+    margin-top: 15px
+}
+
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+
+#search-bar {
+    padding: 0px;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+
+#scale-logo .download {
+    display: none;
+}
+
+#select-columns, #search-columns {
+    width: 100% !important;
+    max-width: 100% !important;
+}
+.gr-box {
+    padding: 10px;
+    margin-bottom: 10px;
+}
+/* Smaller font and wrapping for table headers */
+.gradio-container table th {
+    white-space: normal !important; /* Allows text to wrap */
+    overflow-wrap: break-word !important; /* Breaks long words if needed */
+}
+/* Ensure text wrapping for table cells */
+.gradio-container table td {
+    white-space: normal !important; /* Allows text to wrap */
+    overflow-wrap: break-word !important; /* Breaks long words if needed */
+}
+#select-columns, #search-columns {
+    white-space: normal !important;
+    word-wrap: break-word !important;
+    overflow-wrap: break-word !important;
+}
+"""
+
+get_window_url_params = """
+function(url_params) {
+    const params = new URLSearchParams(window.location.search);
+    url_params = Object.fromEntries(params);
+    return url_params;
+}
+"""
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+    return model_hyperlink(link, model_name)
+
+
+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
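A quick usage sketch of these helpers, assuming it is run from the repository root (the model name is illustrative):

```python
from src.display.formatting import make_clickable_model, styled_message

# Renders an <a> tag linking to https://huggingface.co/org/model
print(make_clickable_model("org/model"))
# Renders a green, centred status message
print(styled_message("Submission received!"))
```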
src/display/utils.py ADDED
@@ -0,0 +1,77 @@
+from dataclasses import dataclass, field, make_dataclass
+from enum import Enum
+
+from src.about import Tasks
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+# These classes are for user-facing column names,
+# to avoid having to change them all around the code
+# when a modification is needed
+@dataclass
+class ColumnContent:
+    name: str
+    type: str
+    displayed_by_default: bool
+    hidden: bool = False
+    never_hidden: bool = False
+
+
+## Leaderboard columns
+auto_eval_column_dict = []
+# model_name
+auto_eval_column_dict.append((
+    "model_name",
+    ColumnContent,
+    ColumnContent("Model", "markdown", True, never_hidden=True)
+))
+# Scores
+for task in Tasks:
+    if task.value.benchmark in ["Thai culture related attack", "General prompt attack"]:
+        show = False
+    else:
+        show = True
+    auto_eval_column_dict.append((
+        task.name,
+        ColumnContent,
+        ColumnContent(task.value.col_name, "number", show)
+    ))
+# Model information
+auto_eval_column_dict.append((
+    "model_type",
+    ColumnContent,
+    ColumnContent("Type", "str", False)
+))
+auto_eval_column_dict.append((
+    "params",
+    ColumnContent,
+    ColumnContent("#Params (B)", "number", False)
+))
+
+# We use make_dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
+## For the queue columns in the submission tab
+@dataclass(frozen=True)
+class EvalQueueColumn:  # Queue column
+    model = ColumnContent("model_name", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    precision = ColumnContent("precision", "str", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
+    status = ColumnContent("status", "str", True)
+
+## All the model information that we might need
+@dataclass
+class ModelDetails:
+    name: str
+    display_name: str = ""
+
+# Column selection
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
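Since `AutoEvalColumn` is assembled at import time with `make_dataclass`, the rest of the code reads column metadata through its class attributes; a small sketch of that access pattern (run from the repository root) looks like this:

```python
from src.display.utils import AutoEvalColumn, BENCHMARK_COLS, COLS, fields

print(AutoEvalColumn.model_name.name)  # "Model"
print(AutoEvalColumn.task0.name)       # "🥇 Overall ASR ⬇️"
print(COLS)                            # every non-hidden column name
print(BENCHMARK_COLS)                  # one display name per Task
print(len(fields(AutoEvalColumn)))     # total number of declared columns
```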
src/envs.py ADDED
@@ -0,0 +1,26 @@
+import os
+
+from huggingface_hub import HfApi
+
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+
+OWNER = "anonymoussssssss"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+
+# Configure Git identity
+os.system('git config --global user.name "anonymoussssssss"')
+os.system('git config --global user.email "[email protected]"')
+
+# ----------------------------------
+
+REPO_ID = f"{OWNER}/ThaiSafetyBench-Leaderboard"
+RESULTS_REPO = f"{OWNER}/ThaiSafetyBench-Results"
+
+# If you set up a cache later, just change HF_HOME
+CACHE_PATH = os.getenv("HF_HOME", ".")
+
+# Local caches
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+
+API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,94 @@
+import json
+import os
+from dataclasses import dataclass
+
+import numpy as np
+
+from src.display.utils import Tasks
+
+
+@dataclass
+class EvalResult:
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
+    """
+    model_name: str
+    org: str
+    results: dict
+    model_type: str  # Pretrained, fine-tuned, ...
+    num_params: int = 0
+    date: str = ""  # submission date of request file
+    still_on_hub: bool = False
+
+    @classmethod
+    def init_from_json_file(cls, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        config = data.get("config")
+
+        num_params = config.get("params")
+        org = config.get("developer", "Unknown")
+        model_type = config.get("model_type", "Unknown")
+        model_name = config.get("model_name")
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for task in Tasks:
+            task = task.value
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs)
+            results[task.benchmark] = mean_acc
+
+        return cls(
+            model_name=model_name,
+            org=org,
+            results=results,
+            num_params=num_params,
+            model_type=model_type
+        )
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        data_dict = {
+            "Model": self.model_name,
+            "Type": self.model_type,
+            "#Params (B)": self.num_params
+        }
+
+        for task in Tasks:
+            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+
+        return data_dict
+
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for fn in os.listdir(results_path):
+        # We should only have json files in model results
+        if fn.endswith(".json"):
+            model_result_filepaths.append(os.path.join(results_path, fn))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        # Store results of same eval together
+        model_name = eval_result.model_name
+        eval_results[model_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
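A usage sketch, assuming a local `eval-results` folder holding result files in the JSON format documented in the README (this folder name matches `EVAL_RESULTS_PATH` in `src/envs.py` under the default cache location):

```python
from src.leaderboard.read_evals import get_raw_eval_results

# One EvalResult per model, deduplicated by model name; incomplete results are skipped
eval_results = get_raw_eval_results("eval-results")
for result in eval_results:
    print(result.model_name, result.to_dict())
```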
src/populate.py ADDED
@@ -0,0 +1,21 @@
+import pandas as pd
+
+from src.display.formatting import has_no_nan_values
+from src.display.utils import AutoEvalColumn
+from src.leaderboard.read_evals import get_raw_eval_results
+
+
+def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_eval_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+
+    df = df.sort_values(by=[AutoEvalColumn.task0.name], ascending=True)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+
+    return df
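This is the function `app.py` calls at start-up to build `LEADERBOARD_DF`; the same call can be reproduced outside Gradio, assuming the results dataset has already been downloaded to `EVAL_RESULTS_PATH`:

```python
from src.display.utils import BENCHMARK_COLS, COLS
from src.envs import EVAL_RESULTS_PATH
from src.populate import get_leaderboard_df

# Lowest overall ASR first, rounded to two decimals, incomplete rows dropped
df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
print(df.head())
```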