leaderboard v1
- app.py +312 -203
- docs.md +48 -0
- leaderboard/data/leaderboard.csv +16 -0
- static/kluster-color.png +0 -0
app.py
CHANGED
@@ -1,204 +1,313 @@
[previous 203-line version of app.py removed; only fragments such as "import gradio as gr" and "from src. ..." are legible in this view, and the closing line below is unchanged context]
demo.queue(default_concurrency_limit=40).launch()

import gradio as gr
import pandas as pd
from pathlib import Path
import plotly.express as px
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
import base64


def restart_space():
    API.restart_space(repo_id=REPO_ID)


def make_rate_chart(df: pd.DataFrame):
    """Return a Plotly bar chart of hallucination rates."""
    # long-form dataframe for grouped bars
    df_long = df.melt(
        id_vars="Models",
        value_vars=["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"],
        var_name="Benchmark",
        value_name="Rate",
    )
    fig = px.bar(
        df_long,
        x="Models",
        y="Rate",
        color="Benchmark",
        barmode="group",
        title="Hallucination Rates by Model",
        height=400,
    )
    fig.update_layout(xaxis_title="", yaxis_title="%")
    return fig


def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """
    Return a horizontal bar chart sorted by `col`.
    Lowest value (best) at the top.
    """
    df_sorted = df.sort_values(col, ascending=False)  # worst first, so the lowest (best) bar lands on top
    fig = px.bar(
        df_sorted,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        text_auto=".2f",
        height=400,
        color_discrete_sequence=[bar_color],
    )
    fig.update_traces(textposition="outside", cliponaxis=False)

    fig.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),  # ensure every model is shown
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return fig


def color_scale(s, cmap):
    """
    Return background-colour styles for a numeric Series (lower = greener,
    higher = redder). Works with any palette length.
    """
    colours = px.colors.sequential.__dict__[cmap]
    n = len(colours) - 1  # max valid index

    rng = s.max() - s.min()
    norm = (s - s.min()) / (rng if rng else 1)

    return [f"background-color:{colours[int(v * n)]}" for v in 1 - norm]


### Space initialisation
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as e:
    # restart_space()
    print(f"[WARN] Skipping REQUESTS sync: {e}")
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as e:
    # restart_space()
    print(f"[WARN] Skipping RESULTS sync: {e}")

# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")

# (
#     finished_eval_queue_df,
#     running_eval_queue_df,
#     pending_eval_queue_df,
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard(df: pd.DataFrame):
    if df is None or df.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    return Leaderboard(
        value=df,
        datatype=["markdown", "markdown", "number", "number", "number"],
        select_columns=SelectColumns(
            default_selection=[
                "Rank", "Models",
                "Average Hallucination Rate (%)",
                "RAG Hallucination Rate (%)",
                "Non-RAG Hallucination Rate (%)",
            ],
            cant_deselect=["Models", "Rank"],
            label="Select Columns to Display:",
        ),
        search_columns=["Models"],
        # column_widths=["3%"],
        bool_checkboxgroup_label=None,
        interactive=False,
    )


image_path = "static/kluster-color.png"
with open(image_path, "rb") as img_file:
    b64_string = base64.b64encode(img_file.read()).decode("utf-8")


# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(f"""
    <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
        <img src="data:image/png;base64,{b64_string}" alt="KlusterAI logo" style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />
        <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em;">
            LLM Hallucination Detection <span style="color: #0057ff;">Leaderboard</span>
        </div>
        <div style="font-size: 1.5em; color: #444; margin-top: 0.5em;">
            Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with
            <a href="https://platform.kluster.ai/verify" target="_blank" style="color: #0057ff; text-decoration: none;">
                Verify
            </a> by
            <a href="https://platform.kluster.ai/" target="_blank" style="color: #0057ff; text-decoration: none;">
                KlusterAI
            </a>
        </div>
    </div>
    """)

    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Hallucination Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            # ---------- Chart ----------
            with gr.Row():
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "RAG Hallucination Rate (%)",
                        "RAG Hallucination Rate (lower is better)",
                        bar_color="#4CAF50",
                    ),
                    show_label=False,
                )
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "Non-RAG Hallucination Rate (%)",
                        "Non-RAG Hallucination Rate (lower is better)",
                        bar_color="#FF7043",
                    ),
                    show_label=False,
                )

            # ---------- Leaderboard ----------
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("📝 Document", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown((Path(__file__).parent / "docs.md").read_text())

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown((Path(__file__).parent / "submit.md").read_text())

            # with gr.Column():
            #     with gr.Row():
            #         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            #     with gr.Column():
            #         with gr.Accordion(
            #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 finished_eval_table = gr.components.Dataframe(
            #                     value=finished_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            #         with gr.Accordion(
            #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 running_eval_table = gr.components.Dataframe(
            #                     value=running_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )

            #         with gr.Accordion(
            #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 pending_eval_table = gr.components.Dataframe(
            #                     value=pending_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            # with gr.Row():
            #     gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            # with gr.Row():
            #     with gr.Column():
            #         model_name_textbox = gr.Textbox(label="Model name")
            #         revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
            #         model_type = gr.Dropdown(
            #             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
            #             label="Model type",
            #             multiselect=False,
            #             value=None,
            #             interactive=True,
            #         )

            #     with gr.Column():
            #         precision = gr.Dropdown(
            #             choices=[i.value.name for i in Precision if i != Precision.Unknown],
            #             label="Precision",
            #             multiselect=False,
            #             value="float16",
            #             interactive=True,
            #         )
            #         weight_type = gr.Dropdown(
            #             choices=[i.value.name for i in WeightType],
            #             label="Weights type",
            #             multiselect=False,
            #             value="Original",
            #             interactive=True,
            #         )
            #         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            #     submit_button = gr.Button("Submit Eval")
            #     submission_result = gr.Markdown()
            #     submit_button.click(
            #         add_new_eval,
            #         [
            #             model_name_textbox,
            #             base_model_name_textbox,
            #             revision_name_textbox,
            #             precision,
            #             weight_type,
            #             model_type,
            #         ],
            #         submission_result,
            #     )

    # with gr.Row():
    #     with gr.Accordion("📙 Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
docs.md
ADDED
@@ -0,0 +1,48 @@
# About

As large language models (LLMs) continue to improve, evaluating how well they avoid hallucinations (producing information that is unfaithful or factually incorrect) has become increasingly important. While many models claim to be reliable, their factual grounding can vary significantly across tasks and settings.

This leaderboard provides a standardised evaluation of how different LLMs perform on hallucination detection tasks. Our goal is to help researchers and developers understand which models are more trustworthy in both grounded (context-based) and open-ended (real-world knowledge) settings. We use [Verify](https://platform.kluster.ai/verify) by [KlusterAI](https://platform.kluster.ai/), an automated hallucination detection tool, to evaluate the factual consistency of model outputs.

---

# Tasks

We evaluate each model using two benchmarks:

## Retrieval-Augmented Generation (RAG setting)

RAG evaluates how well a model stays faithful to a provided context when answering a question. The input consists of a synthetic or real context paired with a relevant question. Models are expected to generate answers using **only the information given**, without adding external knowledge or contradicting the context.

- **Source**: [HaluEval QA](https://huggingface.co/datasets/pminervini/HaluEval/viewer/qa?views%5B%5D=qa)
- **Dataset Size**: 10,000 question-context pairs
- **Prompt Format**: Prompt with relevant context document
- **Temperature**: 0 (to enforce deterministic, grounded outputs)
- **System Prompt**: Instructs the model to only use the document and avoid guessing.
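
The exact prompt wording and request code are not part of this commit; as a rough sketch of the grounded setup described above, assuming an OpenAI-compatible client, an api.kluster.ai endpoint, and HaluEval-style `knowledge`/`question` fields (all names here are illustrative):

```python
from openai import OpenAI

# Assumed endpoint and key; not taken from the committed code.
client = OpenAI(base_url="https://api.kluster.ai/v1", api_key="YOUR_KEY")

RAG_SYSTEM_PROMPT = (
    "Answer using only the information in the provided document. "
    "If the document does not contain the answer, say so; do not guess."
)

def ask_grounded(model: str, knowledge: str, question: str) -> str:
    """Send one HaluEval-style QA pair in the grounded (RAG) setting."""
    response = client.chat.completions.create(
        model=model,
        temperature=0,  # deterministic, grounded outputs
        messages=[
            {"role": "system", "content": RAG_SYSTEM_PROMPT},
            {"role": "user", "content": f"Document:\n{knowledge}\n\nQuestion: {question}"},
        ],
    )
    return response.choices[0].message.content
```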

## Real-World Knowledge (Non-RAG setting)

This setting evaluates how factually accurate a model is when **no context is provided**. The model must rely solely on its internal knowledge to answer a broad range of user questions across many topics. The answers are then verified using web search to determine factual correctness.

- **Source**: Filtered from [UltraChat](https://huggingface.co/datasets/stingning/ultrachat) prompts
- **Dataset Size**: 11,746 single-turn user queries
- **Prompt Format**: Single user prompt without additional context
- **Temperature**: 1 (to reflect natural, fluent generation)
- **System Prompt**: Encourages helpfulness, accuracy, and honesty when unsure.
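
The filtering code is likewise not shown in this commit; one plausible way to pull single-turn user queries from UltraChat, assuming each record's `data` field holds alternating user/assistant utterances, is:

```python
from datasets import load_dataset

# Stream UltraChat and keep only the opening user turn of each conversation.
ultrachat = load_dataset("stingning/ultrachat", split="train", streaming=True)

prompts = []
for row in ultrachat:
    turns = row["data"]            # assumed schema: list of alternating utterances
    if turns:
        prompts.append(turns[0])   # single-turn user query, no context attached
    if len(prompts) >= 11_746:     # dataset size quoted above
        break
```

Each prompt would then be sent at temperature 1 with the helpfulness/honesty system prompt, i.e. the same request shape as the RAG sketch above but without a document.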

---

# Evaluation Method

We use **Verify**, a hallucination detection tool built by KlusterAI, to classify model outputs:

- In the **RAG setting**, Verify checks if the output contradicts, fabricates, or strays from the input document.
- In the **real-world knowledge setting**, Verify uses search queries to fact-check the answer based on current, public information.

Each model's hallucination rate is computed as:

### Hallucination Rate = (Number of hallucinated outputs) / (Total number of prompts)

A **lower** hallucination rate indicates **better** performance.
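
Spelled out on toy numbers (illustrative labels, not real Verify output):

```python
# Per-prompt verdicts as a detector might label them (illustrative values only)
verdicts = ["supported", "hallucinated", "supported", "supported", "hallucinated"]

hallucination_rate = 100 * verdicts.count("hallucinated") / len(verdicts)
print(f"{hallucination_rate:.1f}%")  # 2 hallucinated out of 5 prompts -> 40.0%
```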
leaderboard/data/leaderboard.csv
ADDED
@@ -0,0 +1,16 @@
Models,ha_rag_rate,ha_non_rag_rate
klusterai/Meta-Llama-3.1-8B-Instruct-Turbo,8.1,12.5
Qwen/Qwen2.5-VL-7B-Instruct,9.35,4.55
mistralai/Mistral-Nemo-Instruct-2407,10.63,8.74
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,3.34,0.69
meta-llama/Llama-4-Scout-17B-16E-Instruct,4.23,2.48
mistralai/Mistral-Small-24B-Instruct-2501,4.74,7.85
mistralai/Magistral-Small-2506,8.62,28.07
google/gemma-3-27b-it,3.71,0.48
klusterai/Meta-Llama-3.3-70B-Instruct-Turbo,2.12,1.09
deepseek-ai/DeepSeek-V3-0324,4.66,0.91
Qwen/Qwen3-235B-A22B-FP8,5.04,0.88
deepseek-ai/DeepSeek-R1-0528,2.26,0.78
openai/gpt-4o,6.05,
anthropic/claude-sonnet-4,2.21,
google/gemini-2.5-pro,,
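
src/populate.py is not part of this diff, so how get_leaderboard_df("leaderboard/data/leaderboard.csv") turns these raw columns into the columns the app displays ("Average Hallucination Rate (%)", "Rank", and the two per-benchmark rates) is not visible here. A minimal sketch of what it might do, purely as an assumption:

```python
import pandas as pd

def get_leaderboard_df(csv_path: str) -> pd.DataFrame:
    """Hypothetical reconstruction: load the raw CSV and derive the displayed columns."""
    df = pd.read_csv(csv_path)
    df = df.rename(columns={
        "ha_rag_rate": "RAG Hallucination Rate (%)",
        "ha_non_rag_rate": "Non-RAG Hallucination Rate (%)",
    })
    # Rows with a missing rate average over whatever values are present.
    df["Average Hallucination Rate (%)"] = df[
        ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]
    ].mean(axis=1)
    df = df.sort_values("Average Hallucination Rate (%)").reset_index(drop=True)
    df.insert(0, "Rank", df.index + 1)  # 1 = lowest average hallucination rate
    return df
```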
static/kluster-color.png
ADDED