Spaces:

kluster-ai
/

LLM-Hallucination-Detection-Leaderboard

Running

App Files Files Community

rymc committed on Jul 7

Commit

c9518d2

verified ·

1 Parent(s): e35f81a

Update app.py

Browse files

Files changed (1) hide show

app.py +316 -316

app.py CHANGED Viewed

@@ -1,317 +1,317 @@
-import gradio as gr
-import pandas as pd
-from pathlib import Path
-import plotly.express as px
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-import base64
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-def make_rate_chart(df: pd.DataFrame):
-    """Return a Plotly bar chart of hallucination rates."""
-    # long-form dataframe for grouped bars
-    df_long = df.melt(
-        id_vars="Models",
-        value_vars=["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"],
-        var_name="Benchmark",
-        value_name="Rate",
-    )
-    fig = px.bar(
-        df_long,
-        x="Models",
-        y="Rate",
-        color="Benchmark",
-        barmode="group",
-        title="Hallucination Rates by Model",
-        height=400,
-    )
-    fig.update_layout(xaxis_title="", yaxis_title="%")
-    return fig
-def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
-    """
-    Return a horizontal bar chart sorted ascending by `col`.
-    Lowest value (best) at the top.
-    """
-    df_sorted = df.sort_values(col, ascending=False)           # best → worst
-    fig = px.bar(
-        df_sorted,
-        x=col,
-        y="Models",
-        orientation="h",
-        title=title,
-        text_auto=".2f",
-        height=400,
-        color_discrete_sequence=[bar_color],
-    )
-    fig.update_traces(textposition="outside", cliponaxis=False)
-    fig.update_layout(
-        xaxis_title="Hallucination Rate (%)",
-        yaxis_title="",
-        yaxis=dict(dtick=1),   # ensure every model shown
-        margin=dict(l=140, r=60, t=60, b=40)
-    )
-    fig.update_traces(textposition="outside")
-    return fig
-def color_scale(s, cmap):
-    """
-    Return background-colour styles for a numeric Series (lower = greener,
-    higher = redder). Works with any palette length.
-    """
-    colours = px.colors.sequential.__dict__[cmap]
-    n = len(colours) - 1                     # max valid index
-    rng = s.max() - s.min()
-    norm = (s - s.min()) / (rng if rng else 1)
-    return [f"background-color:{colours[int(v * n)]}" for v in 1 - norm]
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    # restart_space()
-    print(f"[WARN] Skipping RESULTS sync: {Exception}")
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    # restart_space()
-    print(f"[WARN] Skipping RESULTS sync: {Exception}")
-# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-def init_leaderboard(df: pd.DataFrame):
-    if df is None or df.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=df,
-        datatype=["markdown", "markdown", "number", "number", "number"],
-        select_columns=SelectColumns(
-            default_selection=[
-                "Rank", "Models",
-                "Average Hallucination Rate (%)",
-                "RAG Hallucination Rate (%)",
-                "Non-RAG Hallucination Rate (%)"
-            ],
-            cant_deselect=["Models", "Rank"],
-            label="Select Columns to Display:",
-        ),
-        search_columns=["Models"],
-        # column_widths=["3%"],
-        bool_checkboxgroup_label=None,
-        interactive=False,
-    )
-image_path = "static/kluster-color.png"
-with open(image_path, "rb") as img_file:
-    b64_string = base64.b64encode(img_file.read()).decode("utf-8")
-# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(f"""
-        <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
-            <img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"
-                style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />
-            <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">
-                LLM Hallucination Detection Leaderboard
-            </div>
-            <div style="font-size: 1.5em; margin-top: 0.5em;">
-                Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with
-                <a href="https://platform.kluster.ai/verify" target="_blank">
-                    Verify
-                </a> by
-                <a href="https://platform.kluster.ai/" target="_blank">
-                    kluster.ai
-                </a>
-            </div>
-        </div>
-        """)
-    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
-            # ----------  Chart  ----------
-            with gr.Row():
-                gr.Plot(
-                    make_leaderboard_plot(
-                        LEADERBOARD_DF,
-                        "RAG Hallucination Rate (%)",
-                        "RAG Hallucination Rate (lower is better)",
-                        bar_color="#4CAF50",
-                    ),
-                    show_label=False,
-                )
-                gr.Plot(
-                    make_leaderboard_plot(
-                        LEADERBOARD_DF,
-                        "Non-RAG Hallucination Rate (%)",
-                        "Non-RAG Hallucination Rate (lower is better)",
-                        bar_color="#FF7043",
-                    ),
-                    show_label=False,
-                )
-            # ----------  Leaderboard  ----------
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-        with gr.TabItem("📝 Details", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown((Path(__file__).parent / "docs.md").read_text())
-        with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=3):
-            gr.Markdown((Path(__file__).parent / "submit.md").read_text())
-            # with gr.Column():
-            #     with gr.Row():
-            #         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-            #     with gr.Column():
-            #         with gr.Accordion(
-            #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-            #             open=False,
-            #         ):
-            #             with gr.Row():
-            #                 finished_eval_table = gr.components.Dataframe(
-            #                     value=finished_eval_queue_df,
-            #                     headers=EVAL_COLS,
-            #                     datatype=EVAL_TYPES,
-            #                     row_count=5,
-            #                 )
-            #         with gr.Accordion(
-            #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-            #             open=False,
-            #         ):
-            #             with gr.Row():
-            #                 running_eval_table = gr.components.Dataframe(
-            #                     value=running_eval_queue_df,
-            #                     headers=EVAL_COLS,
-            #                     datatype=EVAL_TYPES,
-            #                     row_count=5,
-            #                 )
-            #         with gr.Accordion(
-            #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-            #             open=False,
-            #         ):
-            #             with gr.Row():
-            #                 pending_eval_table = gr.components.Dataframe(
-            #                     value=pending_eval_queue_df,
-            #                     headers=EVAL_COLS,
-            #                     datatype=EVAL_TYPES,
-            #                     row_count=5,
-            #                 )
-            # with gr.Row():
-            #     gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-            # with gr.Row():
-            #     with gr.Column():
-            #         model_name_textbox = gr.Textbox(label="Model name")
-            #         revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-            #         model_type = gr.Dropdown(
-            #             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-            #             label="Model type",
-            #             multiselect=False,
-            #             value=None,
-            #             interactive=True,
-            #         )
-            #     with gr.Column():
-            #         precision = gr.Dropdown(
-            #             choices=[i.value.name for i in Precision if i != Precision.Unknown],
-            #             label="Precision",
-            #             multiselect=False,
-            #             value="float16",
-            #             interactive=True,
-            #         )
-            #         weight_type = gr.Dropdown(
-            #             choices=[i.value.name for i in WeightType],
-            #             label="Weights type",
-            #             multiselect=False,
-            #             value="Original",
-            #             interactive=True,
-            #         )
-            #         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-            # submit_button = gr.Button("Submit Eval")
-            # submission_result = gr.Markdown()
-            # submit_button.click(
-            #     add_new_eval,
-            #     [
-            #         model_name_textbox,
-            #         base_model_name_textbox,
-            #         revision_name_textbox,
-            #         precision,
-            #         weight_type,
-            #         model_type,
-            #     ],
-            #     submission_result,
-            # )
-    # with gr.Row():
-    #     with gr.Accordion("📙 Citation", open=False):
-    #         citation_button = gr.Textbox(
-    #             value=CITATION_BUTTON_TEXT,
-    #             label=CITATION_BUTTON_LABEL,
-    #             lines=20,
-    #             elem_id="citation-button",
-    #             show_copy_button=True,
-    #         )
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()

+import gradio as gr
+import pandas as pd
+from pathlib import Path
+import plotly.express as px
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+import base64
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+def make_rate_chart(df: pd.DataFrame):
+    """Return a Plotly bar chart of hallucination rates."""
+    # long-form dataframe for grouped bars
+    df_long = df.melt(
+        id_vars="Models",
+        value_vars=["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"],
+        var_name="Benchmark",
+        value_name="Rate",
+    )
+    fig = px.bar(
+        df_long,
+        x="Models",
+        y="Rate",
+        color="Benchmark",
+        barmode="group",
+        title="Hallucination Rates by Model",
+        height=400,
+    )
+    fig.update_layout(xaxis_title="", yaxis_title="%")
+    return fig
+def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
+    """
+    Return a horizontal bar chart sorted ascending by `col`.
+    Lowest value (best) at the top.
+    """
+    df_sorted = df.sort_values(col, ascending=False)           # best → worst
+    fig = px.bar(
+        df_sorted,
+        x=col,
+        y="Models",
+        orientation="h",
+        title=title,
+        text_auto=".2f",
+        height=400,
+        color_discrete_sequence=[bar_color],
+    )
+    fig.update_traces(textposition="outside", cliponaxis=False)
+    fig.update_layout(
+        xaxis_title="Hallucination Rate (%)",
+        yaxis_title="",
+        yaxis=dict(dtick=1),   # ensure every model shown
+        margin=dict(l=140, r=60, t=60, b=40)
+    )
+    fig.update_traces(textposition="outside")
+    return fig
+def color_scale(s, cmap):
+    """
+    Return background-colour styles for a numeric Series (lower = greener,
+    higher = redder). Works with any palette length.
+    """
+    colours = px.colors.sequential.__dict__[cmap]
+    n = len(colours) - 1                     # max valid index
+    rng = s.max() - s.min()
+    norm = (s - s.min()) / (rng if rng else 1)
+    return [f"background-color:{colours[int(v * n)]}" for v in 1 - norm]
+### Space initialisation
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    # restart_space()
+    print(f"[WARN] Skipping RESULTS sync: {Exception}")
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    # restart_space()
+    print(f"[WARN] Skipping RESULTS sync: {Exception}")
+# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+def init_leaderboard(df: pd.DataFrame):
+    if df is None or df.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=df,
+        datatype=["markdown", "markdown", "number", "number", "number"],
+        select_columns=SelectColumns(
+            default_selection=[
+                "Rank", "Models",
+                "Average Hallucination Rate (%)",
+                "RAG Hallucination Rate (%)",
+                "Non-RAG Hallucination Rate (%)"
+            ],
+            cant_deselect=["Models", "Rank"],
+            label="Select Columns to Display:",
+        ),
+        search_columns=["Models"],
+        # column_widths=["3%"],
+        bool_checkboxgroup_label=None,
+        interactive=False,
+    )
+image_path = "static/kluster-color.png"
+with open(image_path, "rb") as img_file:
+    b64_string = base64.b64encode(img_file.read()).decode("utf-8")
+# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(f"""
+        <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
+            <img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"
+                style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />
+            <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">
+                LLM Hallucination Detection Leaderboard
+            </div>
+            <div style="font-size: 1.5em; margin-top: 0.5em;">
+                Evaluating factual accuracy and faithfulness of LLMs in both RAG and non-RAG settings with
+                <a href="https://platform.kluster.ai/verify" target="_blank">
+                    Verify
+                </a> by
+                <a href="https://platform.kluster.ai/" target="_blank">
+                    kluster.ai
+                </a>
+            </div>
+        </div>
+        """)
+    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
+            # ----------  Chart  ----------
+            with gr.Row():
+                gr.Plot(
+                    make_leaderboard_plot(
+                        LEADERBOARD_DF,
+                        "RAG Hallucination Rate (%)",
+                        "RAG Hallucination Rate (lower is better)",
+                        bar_color="#4CAF50",
+                    ),
+                    show_label=False,
+                )
+                gr.Plot(
+                    make_leaderboard_plot(
+                        LEADERBOARD_DF,
+                        "Non-RAG Hallucination Rate (%)",
+                        "Non-RAG Hallucination Rate (lower is better)",
+                        bar_color="#FF7043",
+                    ),
+                    show_label=False,
+                )
+            # ----------  Leaderboard  ----------
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+        with gr.TabItem("📝 Details", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown((Path(__file__).parent / "docs.md").read_text())
+        with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=3):
+            gr.Markdown((Path(__file__).parent / "submit.md").read_text())
+            # with gr.Column():
+            #     with gr.Row():
+            #         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+            #     with gr.Column():
+            #         with gr.Accordion(
+            #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 finished_eval_table = gr.components.Dataframe(
+            #                     value=finished_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
+            #         with gr.Accordion(
+            #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 running_eval_table = gr.components.Dataframe(
+            #                     value=running_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
+            #         with gr.Accordion(
+            #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 pending_eval_table = gr.components.Dataframe(
+            #                     value=pending_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
+            # with gr.Row():
+            #     gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+            # with gr.Row():
+            #     with gr.Column():
+            #         model_name_textbox = gr.Textbox(label="Model name")
+            #         revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+            #         model_type = gr.Dropdown(
+            #             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+            #             label="Model type",
+            #             multiselect=False,
+            #             value=None,
+            #             interactive=True,
+            #         )
+            #     with gr.Column():
+            #         precision = gr.Dropdown(
+            #             choices=[i.value.name for i in Precision if i != Precision.Unknown],
+            #             label="Precision",
+            #             multiselect=False,
+            #             value="float16",
+            #             interactive=True,
+            #         )
+            #         weight_type = gr.Dropdown(
+            #             choices=[i.value.name for i in WeightType],
+            #             label="Weights type",
+            #             multiselect=False,
+            #             value="Original",
+            #             interactive=True,
+            #         )
+            #         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+            # submit_button = gr.Button("Submit Eval")
+            # submission_result = gr.Markdown()
+            # submit_button.click(
+            #     add_new_eval,
+            #     [
+            #         model_name_textbox,
+            #         base_model_name_textbox,
+            #         revision_name_textbox,
+            #         precision,
+            #         weight_type,
+            #         model_type,
+            #     ],
+            #     submission_result,
+            # )
+    # with gr.Row():
+    #     with gr.Accordion("📙 Citation", open=False):
+    #         citation_button = gr.Textbox(
+    #             value=CITATION_BUTTON_TEXT,
+    #             label=CITATION_BUTTON_LABEL,
+    #             lines=20,
+    #             elem_id="citation-button",
+    #             show_copy_button=True,
+    #         )
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()