Initial commit

- Makefile +13 -0
- README.md +40 -8
- app.py +134 -0
- pyproject.toml +13 -0
- requirements.txt +12 -0
- src/about.py +80 -0
- src/display/css_html_js.py +89 -0
- src/display/formatting.py +27 -0
- src/display/utils.py +77 -0
- src/envs.py +26 -0
- src/leaderboard/read_evals.py +94 -0
- src/populate.py +21 -0
Makefile
ADDED
@@ -0,0 +1,13 @@
```makefile
.PHONY: style quality


style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
```
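Running `make style` applies black, isort, and ruff fixes in place; `make quality` runs the same tools in check-only mode, which is the variant you would typically wire into CI.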
README.md
CHANGED
@@ -1,14 +1,46 @@
````diff
 ---
-title: ThaiSafetyBench
-emoji:
-colorFrom:
-colorTo:
+title: ThaiSafetyBench
+emoji: 🥇
+colorFrom: green
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.38.2
 app_file: app.py
-pinned:
+pinned: true
 license: mit
-short_description:
+short_description: ThaiSafetyBench Leaderboard
+sdk_version: 5.19.0
 ---
 
-
+# Start the configuration
+
+Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
+
+Results files should have the following format and be stored as JSON files:
+```json
+{
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
+}
+```
+
+Request files are created automatically by this tool.
+
+If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+# Code logic for more complex edits
+
+You'll find
+- the main table's column names and properties in `src/display/utils.py`
+- the logic to read all results and request files, then convert them into dataframe rows, in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
````
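For concreteness, here is a sketch of writing a well-formed results file using this repo's actual task keys from `src/about.py` ("overall", "Information Hazards"); the model name, revision, scores, and file name are invented for illustration:

```python
import json

# Hypothetical example of a valid results file matching the schema above.
example = {
    "config": {
        "model_dtype": "torch.float16",
        "model_name": "my-org/my-model",  # hypothetical hub path
        "model_sha": "main",              # hypothetical revision
    },
    "results": {
        "overall": {"asr": 12.3},              # made-up score
        "Information Hazards": {"asr": 8.7},   # made-up score
    },
}

# Write it where read_evals.py will pick it up (hypothetical file name).
with open("my-org__my-model.json", "w") as fp:
    json.dump(example, fp, indent=2)
```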
app.py
ADDED
@@ -0,0 +1,134 @@
```python
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
    SUBMIT_TEXT
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    AutoEvalColumn,
    fields,
)
from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_leaderboard_df


def restart_space():
    API.restart_space(repo_id=REPO_ID)

### Space initialisation
try:
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()


LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)

def init_leaderboard(dataframe, css):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    with gr.Blocks(css=css) as app:
        # Title
        gr.Markdown("# Leaderboard")

        # Select Columns - Full width, as CheckboxGroup
        select_columns = gr.CheckboxGroup(
            label="Select Columns to Display:",
            choices=[c.name for c in fields(AutoEvalColumn)],
            value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            elem_id="select-columns"
        )

        # Search Columns - Full width
        search_columns = gr.Textbox(
            label="Search",
            placeholder=f"Search in {', '.join([AutoEvalColumn.model_name.name])}...",
            lines=1,
            elem_id="search-columns"
        )

        # Initialize DataFrame with only default-selected columns
        default_columns = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]
        initial_dataframe = dataframe[default_columns].copy()

        # Leaderboard Component
        leaderboard = gr.Dataframe(
            value=initial_dataframe,
            datatype=[c.type for c in fields(AutoEvalColumn) if c.name in default_columns],
            headers=default_columns,
            wrap=True,
            interactive=False,
            max_height=800
        )

        # Update function
        def update_leaderboard(search, selected_cols):
            df = dataframe.copy()
            # Apply search
            if search:
                df = df[df[AutoEvalColumn.model_name.name].str.contains(search, case=False, na=False)]
            # Filter columns to display
            visible_cols = [col for col in selected_cols if col in df.columns]
            df = df[visible_cols]
            return df

        # Connect inputs to update leaderboard
        search_columns.change(
            fn=update_leaderboard,
            inputs=[search_columns, select_columns],
            outputs=leaderboard
        )
        select_columns.change(
            fn=update_leaderboard,
            inputs=[search_columns, select_columns],
            outputs=leaderboard
        )

    return app


demo = gr.Blocks(fill_height=False, css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF, css=custom_css)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown(SUBMIT_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=True):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
```
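The heart of the UI is `update_leaderboard`, which is plain pandas. A self-contained sketch of the same search-then-project logic on toy data (column names borrowed from `src/about.py`; model names and scores are invented):

```python
import pandas as pd

# Toy stand-in for LEADERBOARD_DF; values are invented.
df = pd.DataFrame({
    "Model": ["org-a/model-x", "org-b/model-y"],
    "🥇 Overall ASR ⬇️": [12.3, 45.6],
})

search, selected_cols = "model-x", ["Model", "🥇 Overall ASR ⬇️"]

# Case-insensitive substring match on the model column, as in update_leaderboard.
out = df[df["Model"].str.contains(search, case=False, na=False)]
# Then keep only the columns the user ticked.
out = out[[c for c in selected_cols if c in out.columns]]
print(out)  # one row: org-a/model-x
```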
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
```toml
[tool.ruff]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F"]
ignore = ["E501"] # line too long (black is taking care of this)
line-length = 119
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]

[tool.isort]
profile = "black"
line_length = 119

[tool.black]
line-length = 119
```
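All three tools are pinned to the same 119-character line length, so black, isort, and ruff won't fight each other over wrapping; ruff additionally ignores E501 because black owns line length.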
requirements.txt
ADDED
@@ -0,0 +1,12 @@
```text
APScheduler
black
datasets
gradio
gradio[oauth]
gradio_leaderboard==0.0.13
gradio_client
huggingface-hub>=0.18.0
matplotlib
numpy
pandas
python-dateutil
```
src/about.py
ADDED
@@ -0,0 +1,80 @@
````python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("overall", "asr", "🥇 Overall ASR ⬇️")
    task1 = Task("Discrimination, Exclusion, Toxicity, Hateful, Offensive", "asr", "👉 Discrimination, Exclusion, Toxicity, Hateful, Offensive ASR ⬇️")
    task2 = Task("Human-Chatbot Interaction Harms", "asr", "👉 Human-Chatbot Interaction Harm ASR ⬇️")
    task3 = Task("Information Hazards", "asr", "👉 Information Hazards ASR ⬇️")
    task4 = Task("Malicious Uses", "asr", "👉 Malicious Uses ASR ⬇️")
    task5 = Task("Misinformation Harms", "asr", "👉 Misinformation Harms ASR ⬇️")
    task6 = Task("Thai Socio-Cultural Harm", "asr", "👉 Thai Socio-Cultural Harms ASR ⬇️")
    task7 = Task("Thai culture related attack", "asr", "🔶 Thai Culture Related Attack ASR ⬇️")
    task8 = Task("General prompt attack", "asr", "🔶 General Prompt Attack ASR ⬇️")

NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------



# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">ThaiSafetyBench Leaderboard 🥇</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
ThaiSafetyBench is a safety benchmark tailored to the Thai language and culture.
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works

We evaluate models on the ThaiSafetyBench benchmark, which consists of various tasks related to safety and
harmful content in the Thai language and culture. The evaluation is performed using the ThaiSafetyBench dataset,
which includes a range of scenarios designed to assess the model's ability to handle sensitive topics,
discrimination, misinformation, and other harmful content. The automatic evaluation is conducted using the GPT-4o model as a judge.
We report the Attack Success Rate (ASR) for each task, which indicates the model's vulnerability to harmful content.
We categorize the tasks into two groups: Thai Culture-Related Attacks, which evaluate the model's ability to handle content specific to Thai culture, including its norms, values, and sensitivities, and General Prompt Attacks, which assess the model's capacity to manage broadly harmful content that, while not unique to Thai culture, remains relevant in a wider context.

## Reproducibility

To reproduce our results, we provide the automatic evaluation code in our GitHub repository. You can run the evaluation on your own models by following these steps:

1. Generate your model's responses on the ThaiSafetyBench dataset with temperature set to 0.1
2. Use the provided evaluation script to score the responses using the GPT-4o model as a judge

## Developers and Maintainers

<Anonymous due to paper submission policy>
"""

SUBMIT_TEXT = """
We welcome submissions of new models to the ThaiSafetyBench leaderboard via email; however, due to the paper submission anonymity policy, we cannot accept submissions at this time. The submission email should follow this format:
```
Subject: [Your Model Name] ThaiSafetyBench Model Submission
Content:
- Model name
- Developer
- Parameters (in billions)
- Model type (Base or CPT)
- Base model name (if the model is a CPT, otherwise leave empty)
- Release date (YYYY-MM)
- How to run the model (Python code to generate responses, if the model is on Hugging Face Hub, otherwise provide a code snippet to run the model and generate responses)
- Contact email (for us to contact you about the evaluation results)
```
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
coming soon...
"""
````
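The About text reports ASR per category but does not spell out the formula. Under the usual definition (an assumption here; the paper's judging protocol may differ in detail), ASR is the share of adversarial prompts for which the judge flags the model's response as unsafe:

```python
def attack_success_rate(judge_labels: list[bool]) -> float:
    """judge_labels[i] is True when the judge flags response i as unsafe.

    Standard ASR definition, assumed here; lower is better (hence the ⬇️ arrows).
    """
    return 100.0 * sum(judge_labels) / len(judge_labels)


print(attack_success_rate([True, False, False, False]))  # 25.0
```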
src/display/css_html_js.py
ADDED
@@ -0,0 +1,89 @@
```python
custom_css = """

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}

#select-columns, #search-columns {
    width: 100% !important;
    max-width: 100% !important;
}
.gr-box {
    padding: 10px;
    margin-bottom: 10px;
}
/* Smaller font and wrapping for table headers */
.gradio-container table th {
    white-space: normal !important;  /* Allows text to wrap */
    overflow-wrap: break-word !important;  /* Breaks long words if needed */
}
/* Ensure text wrapping for table cells */
.gradio-container table td {
    white-space: normal !important;  /* Allows text to wrap */
    overflow-wrap: break-word !important;  /* Breaks long words if needed */
}
#select-columns, #search-columns {
    white-space: normal !important;
    word-wrap: break-word !important;
    overflow-wrap: break-word !important;
}
"""

get_window_url_params = """
    function(url_params) {
        const params = new URLSearchParams(window.location.search);
        url_params = Object.fromEntries(params);
        return url_params;
    }
    """
```
src/display/formatting.py
ADDED
@@ -0,0 +1,27 @@
```python
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def has_no_nan_values(df, columns):
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)
```
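`has_no_nan_values` returns a per-row boolean mask, which is how `src/populate.py` later drops models with incomplete benchmark coverage. A quick self-contained check (the helper is repeated here so the snippet runs on its own):

```python
import pandas as pd

def has_no_nan_values(df, columns):  # same helper as above
    return df[columns].notna().all(axis=1)

df = pd.DataFrame({"a": [1.0, None], "b": [2.0, 3.0]})
print(has_no_nan_values(df, ["a", "b"]).tolist())  # [True, False]
```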
src/display/utils.py
ADDED
@@ -0,0 +1,77 @@
```python
from dataclasses import dataclass, field, make_dataclass
from enum import Enum

from src.about import Tasks

def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

# These classes are for user facing column names,
# to avoid having to change them all around the code
# when a modif is needed
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


## Leaderboard columns
auto_eval_column_dict = []
# model_name
auto_eval_column_dict.append((
    "model_name",
    ColumnContent,
    ColumnContent("Model", "markdown", True, never_hidden=True)
))
# Scores
for task in Tasks:
    if task.value.benchmark in ["Thai culture related attack", "General prompt attack"]:
        show = False
    else:
        show = True
    auto_eval_column_dict.append((
        task.name,
        ColumnContent,
        ColumnContent(task.value.col_name, "number", show)
    ))
# Model information
auto_eval_column_dict.append((
    "model_type",
    ColumnContent,
    ColumnContent("Type", "str", False)
))
auto_eval_column_dict.append((
    "params",
    ColumnContent,
    ColumnContent("#Params (B)", "number", False)
))

# We use make_dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model_name", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    weight_type = ColumnContent("weight_type", "str", "Original")
    status = ColumnContent("status", "str", True)

## All the model information that we might need
@dataclass
class ModelDetails:
    name: str
    display_name: str = ""

# Column selection
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

BENCHMARK_COLS = [t.value.col_name for t in Tasks]
```
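Note that `fields` here is not `dataclasses.fields`: it walks the class `__dict__` and returns the `ColumnContent` default values, so `AutoEvalColumn.model_name` is itself a `ColumnContent`. A quick sketch of what the module exposes, assuming the repo root is on `PYTHONPATH`:

```python
from src.display.utils import AutoEvalColumn, COLS, BENCHMARK_COLS

print(AutoEvalColumn.model_name.name)   # 'Model'
print(COLS[0], len(BENCHMARK_COLS))     # 'Model', and 9 (one entry per Task in src/about.py)
```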
src/envs.py
ADDED
@@ -0,0 +1,26 @@
```python
import os

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

OWNER = "anonymoussssssss"  # Change to your org - don't forget to create a results and request dataset, with the correct format!

# Configure Git identity
os.system('git config --global user.name "anonymoussssssss"')
os.system('git config --global user.email "[email protected]"')

# ----------------------------------

REPO_ID = f"{OWNER}/ThaiSafetyBench-Leaderboard"
RESULTS_REPO = f"{OWNER}/ThaiSafetyBench-Results"

# If you setup a cache later, just change HF_HOME
CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")

API = HfApi(token=TOKEN)
```
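For the Space to run, `HF_TOKEN` must be provided (typically as a Space secret): `snapshot_download` needs it to read the results dataset if it is private, and `API.restart_space` needs write access to the Space itself.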
src/leaderboard/read_evals.py
ADDED
@@ -0,0 +1,94 @@
```python
import json
import os
from dataclasses import dataclass

import numpy as np

from src.display.utils import Tasks


@dataclass
class EvalResult:
    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
    """
    model_name: str
    org: str
    results: dict
    model_type: str  # Pretrained, fine tuned, ...
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Inits the result from the specific model result file"""
        with open(json_filepath) as fp:
            data = json.load(fp)

        config = data.get("config")

        num_params = config.get("params")
        org = config.get("developer", "Unknown")
        model_type = config.get("model_type", "Unknown")
        model_name = config.get("model_name")

        # Extract results available in this file (some results are split in several files)
        results = {}
        for task in Tasks:
            task = task.value

            # We average all scores of a given metric (not all metrics are present in all files)
            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
            if accs.size == 0 or any([acc is None for acc in accs]):
                continue

            mean_acc = np.mean(accs)
            results[task.benchmark] = mean_acc

        return cls(
            model_name=model_name,
            org=org,
            results=results,
            num_params=num_params,
            model_type=model_type
        )

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
        data_dict = {
            "Model": self.model_name,
            "Type": self.model_type,
            "#Params (B)": self.num_params
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict


def get_raw_eval_results(results_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    model_result_filepaths = []

    for fn in os.listdir(results_path):
        # We should only have json files in model results
        if fn.endswith(".json"):
            model_result_filepaths.append(os.path.join(results_path, fn))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
        # Store results of same eval together
        model_name = eval_result.model_name
        eval_results[model_name] = eval_result

    results = []
    for v in eval_results.values():
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError:  # not all eval values present
            continue

    return results
```
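Since `task.benchmark == k` is an exact key match and JSON keys are unique, the `np.mean` typically averages a single value per task; the averaging machinery is inherited from the upstream leaderboard template, which averages across multiple subtasks. To sanity-check one results file by hand (hypothetical file name; assumes the repo root is on `PYTHONPATH`):

```python
from src.leaderboard.read_evals import EvalResult

# Hypothetical path: one model's results file inside the synced eval-results folder.
res = EvalResult.init_from_json_file("eval-results/my-org__my-model.json")
print(res.model_name, res.to_dict())
```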
src/populate.py
ADDED
@@ -0,0 +1,21 @@
```python
import pandas as pd

from src.display.formatting import has_no_nan_values
from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    raw_data = get_raw_eval_results(results_path)
    all_data_json = [v.to_dict() for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)

    df = df.sort_values(by=[AutoEvalColumn.task0.name], ascending=True)
    df = df[cols].round(decimals=2)

    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]

    return df
```
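Note the ascending sort on the overall ASR column: lower ASR is better (the ⬇️ arrows in the column names), so the leaderboard lists the safest models first, and any model missing a benchmark score is dropped by the `has_no_nan_values` mask.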