Spaces:

kluster-ai
/

LLM-Hallucination-Detection-Leaderboard

Running

App Files Files Community

LLM-Hallucination-Detection-Leaderboard / app.py

aloe-vera

leaderboard v1

73adc36 verified about 1 month ago

raw

history blame

12 kB

	import gradio as gr
	import pandas as pd
	from pathlib import Path
	import plotly.express as px
	from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
	from apscheduler.schedulers.background import BackgroundScheduler
	from huggingface_hub import snapshot_download

	from src.about import (
	CITATION_BUTTON_LABEL,
	CITATION_BUTTON_TEXT,
	EVALUATION_QUEUE_TEXT,
	INTRODUCTION_TEXT,
	LLM_BENCHMARKS_TEXT,
	TITLE,
	)
	from src.display.css_html_js import custom_css
	from src.display.utils import (
	BENCHMARK_COLS,
	COLS,
	EVAL_COLS,
	EVAL_TYPES,
	AutoEvalColumn,
	ModelType,
	fields,
	WeightType,
	Precision
	)
	from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
	from src.populate import get_evaluation_queue_df, get_leaderboard_df
	from src.submission.submit import add_new_eval
	import base64


	def restart_space():
	    """Ask the Hub API client to restart the Space identified by ``REPO_ID``."""
	    target_repo = REPO_ID
	    API.restart_space(repo_id=target_repo)



	def make_rate_chart(df: pd.DataFrame):
	    """Build a grouped Plotly bar chart comparing both hallucination rates.

	    ``df`` must provide a "Models" column plus the
	    "RAG Hallucination Rate (%)" and "Non-RAG Hallucination Rate (%)" columns.
	    Returns the resulting Plotly figure.
	    """
	    rate_columns = ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]

	    # Reshape to one row per (model, benchmark) pair so bars can be grouped.
	    melted = pd.melt(
	        df,
	        id_vars="Models",
	        value_vars=rate_columns,
	        var_name="Benchmark",
	        value_name="Rate",
	    )

	    chart = px.bar(
	        melted,
	        x="Models",
	        y="Rate",
	        color="Benchmark",
	        barmode="group",
	        title="Hallucination Rates by Model",
	        height=400,
	    )
	    chart.update_layout(xaxis_title="", yaxis_title="%")
	    return chart

	def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str, height: int = 400):
	    """Return a horizontal bar chart of ``col`` per model, lowest value on top.

	    The rows are sorted in *descending* order of ``col``: Plotly places the
	    first row of a horizontal bar chart at the bottom of the y-axis, so the
	    lowest (best) value ends up at the top of the figure.

	    Args:
	        df: Table with a "Models" column and the numeric column ``col``.
	        col: Name of the column to plot on the x-axis.
	        title: Figure title.
	        bar_color: Single colour applied to every bar.
	        height: Figure height in pixels (defaults to the previous fixed 400).

	    Returns:
	        The configured Plotly figure.
	    """
	    # Row order is worst → best, which renders best → worst from top to bottom.
	    df_sorted = df.sort_values(col, ascending=False)
	    fig = px.bar(
	        df_sorted,
	        x=col,
	        y="Models",
	        orientation="h",
	        title=title,
	        text_auto=".2f",
	        height=height,
	        color_discrete_sequence=[bar_color],
	    )
	    # Value labels go outside the bars; cliponaxis=False is set alongside so
	    # the labels are not clipped to the plotting area.
	    fig.update_traces(textposition="outside", cliponaxis=False)

	    fig.update_layout(
	        xaxis_title="Hallucination Rate (%)",
	        yaxis_title="",
	        yaxis=dict(dtick=1),  # ensure every model shown
	        margin=dict(l=140, r=60, t=60, b=40),
	    )
	    return fig


	def color_scale(s, cmap):
	    """Return one ``background-color`` style string per entry of ``s``.

	    Values are min-max normalised and mapped onto the named
	    ``px.colors.sequential`` palette in reverse: the smallest value gets the
	    last palette colour and the largest gets the first. Works with any
	    palette length.

	    Args:
	        s: Numeric pandas Series.
	        cmap: Name of a palette in ``px.colors.sequential``.

	    Returns:
	        A list of CSS strings, one per entry; missing (NaN) entries get an
	        empty string instead of raising.

	    Raises:
	        KeyError: If ``cmap`` is not a name in ``px.colors.sequential``.
	    """
	    colours = vars(px.colors.sequential)[cmap]
	    n = len(colours) - 1  # max valid index

	    lowest = s.min()
	    rng = s.max() - lowest
	    # A constant series has zero range; divide by 1 to avoid 0/0.
	    norm = (s - lowest) / (rng if rng else 1)

	    styles = []
	    for v in 1 - norm:
	        if pd.isna(v):
	            # int(NaN) would raise ValueError; leave missing cells unstyled.
	            styles.append("")
	        else:
	            styles.append(f"background-color:{colours[int(v * n)]}")
	    return styles


	### Space initialisation
	# Best-effort download of the queue and results datasets. A failure is logged
	# and start-up continues; the restart_space() fallback stays disabled.
	try:
	    print(EVAL_REQUESTS_PATH)
	    snapshot_download(
	        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
	    )
	except Exception as e:
	    # restart_space()
	    # `e` must be bound here: the message below formats it.
	    print(f"[WARN] Skipping QUEUE sync: {e}")
	try:
	    print(EVAL_RESULTS_PATH)
	    snapshot_download(
	        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
	    )
	except Exception as e:
	    # restart_space()
	    print(f"[WARN] Skipping RESULTS sync: {e}")


	# Earlier call form, based on the synced results/requests folders (kept for reference):
	# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
	# Current call form: a single relative CSV path is handed to get_leaderboard_df.
	# NOTE(review): presumably the helper reads the table from that file — confirm in src.populate.
	LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")

	# (
	# finished_eval_queue_df,
	# running_eval_queue_df,
	# pending_eval_queue_df,
	# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

	def init_leaderboard(df: pd.DataFrame):
	    """Wrap ``df`` in a non-interactive ``Leaderboard`` component.

	    Raises ``ValueError`` when ``df`` is ``None`` or empty. The component is
	    searchable by "Models"; "Models" and "Rank" cannot be deselected.
	    """
	    if df is None or df.empty:
	        raise ValueError("Leaderboard DataFrame is empty or None.")

	    locked_columns = ["Models", "Rank"]
	    shown_by_default = [
	        "Rank",
	        "Models",
	        "Average Hallucination Rate (%)",
	        "RAG Hallucination Rate (%)",
	        "Non-RAG Hallucination Rate (%)",
	    ]
	    column_picker = SelectColumns(
	        default_selection=shown_by_default,
	        cant_deselect=locked_columns,
	        label="Select Columns to Display:",
	    )
	    # One datatype per default column, in the same order as above.
	    column_types = ["markdown", "markdown", "number", "number", "number"]

	    return Leaderboard(
	        value=df,
	        datatype=column_types,
	        select_columns=column_picker,
	        search_columns=["Models"],
	        # column_widths=["3%"],
	        bool_checkboxgroup_label=None,
	        interactive=False,
	    )

	image_path = "static/kluster-color.png"
	# Base64-encode the logo bytes so the page header can embed the image inline.
	b64_string = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")


	# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
	# Build the Gradio UI: an HTML header followed by three tabs
	# (benchmark charts + leaderboard table, documentation, submission notes).
	demo = gr.Blocks(css=custom_css)
	with demo:
	# Header: logo embedded as a base64 data URI, page title, and a tagline
	# linking to the Verify and KlusterAI pages.
	gr.HTML(f"""
	<div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
	<img src="data:image/png;base64,{b64_string}" alt="KlusterAI logo" style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />
	<div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em;">
	LLM Hallucination Detection <span style="color: #0057ff;">Leaderboard</span>
	</div>
	<div style="font-size: 1.5em; color: #444; margin-top: 0.5em;">
	Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with
	<a href="https://platform.kluster.ai/verify" target="_blank" style="color: #0057ff; text-decoration: none;">
	Verify
	</a> by
	<a href="https://platform.kluster.ai/" target="_blank" style="color: #0057ff; text-decoration: none;">
	KlusterAI
	</a>
	</div>
	</div>
	""")

	# gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

	# NOTE(review): all three tabs below pass the same
	# elem_id="llm-benchmark-tab-table" — confirm the duplicate id is intended.
	with gr.Tabs(elem_classes="tab-buttons") as tabs:
	with gr.TabItem("🏅 Hallucination Benchmark", elem_id="llm-benchmark-tab-table", id=0):
	# ---------- Chart ----------
	# Two horizontal bar charts built from LEADERBOARD_DF, one per rate column.
	with gr.Row():
	gr.Plot(
	make_leaderboard_plot(
	LEADERBOARD_DF,
	"RAG Hallucination Rate (%)",
	"RAG Hallucination Rate (lower is better)",
	bar_color="#4CAF50",
	),
	show_label=False,
	)
	gr.Plot(
	make_leaderboard_plot(
	LEADERBOARD_DF,
	"Non-RAG Hallucination Rate (%)",
	"Non-RAG Hallucination Rate (lower is better)",
	bar_color="#FF7043",
	),
	show_label=False,
	)

	# ---------- Leaderboard ----------
	# The table component is built from the same dataframe as the charts.
	leaderboard = init_leaderboard(LEADERBOARD_DF)

	# Tab content is rendered from docs.md located next to this file.
	with gr.TabItem("📝 Document", elem_id="llm-benchmark-tab-table", id=2):
	gr.Markdown((Path(__file__).parent / "docs.md").read_text())

	# Tab content is rendered from submit.md located next to this file.
	with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
	gr.Markdown((Path(__file__).parent / "submit.md").read_text())

	# with gr.Column():
	# with gr.Row():
	# gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

	# with gr.Column():
	# with gr.Accordion(
	# f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
	# open=False,
	# ):
	# with gr.Row():
	# finished_eval_table = gr.components.Dataframe(
	# value=finished_eval_queue_df,
	# headers=EVAL_COLS,
	# datatype=EVAL_TYPES,
	# row_count=5,
	# )
	# with gr.Accordion(
	# f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
	# open=False,
	# ):
	# with gr.Row():
	# running_eval_table = gr.components.Dataframe(
	# value=running_eval_queue_df,
	# headers=EVAL_COLS,
	# datatype=EVAL_TYPES,
	# row_count=5,
	# )

	# with gr.Accordion(
	# f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
	# open=False,
	# ):
	# with gr.Row():
	# pending_eval_table = gr.components.Dataframe(
	# value=pending_eval_queue_df,
	# headers=EVAL_COLS,
	# datatype=EVAL_TYPES,
	# row_count=5,
	# )
	# with gr.Row():
	# gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

	# with gr.Row():
	# with gr.Column():
	# model_name_textbox = gr.Textbox(label="Model name")
	# revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
	# model_type = gr.Dropdown(
	# choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
	# label="Model type",
	# multiselect=False,
	# value=None,
	# interactive=True,
	# )

	# with gr.Column():
	# precision = gr.Dropdown(
	# choices=[i.value.name for i in Precision if i != Precision.Unknown],
	# label="Precision",
	# multiselect=False,
	# value="float16",
	# interactive=True,
	# )
	# weight_type = gr.Dropdown(
	# choices=[i.value.name for i in WeightType],
	# label="Weights type",
	# multiselect=False,
	# value="Original",
	# interactive=True,
	# )
	# base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

	# submit_button = gr.Button("Submit Eval")
	# submission_result = gr.Markdown()
	# submit_button.click(
	# add_new_eval,
	# [
	# model_name_textbox,
	# base_model_name_textbox,
	# revision_name_textbox,
	# precision,
	# weight_type,
	# model_type,
	# ],
	# submission_result,
	# )

	# with gr.Row():
	# with gr.Accordion("📙 Citation", open=False):
	# citation_button = gr.Textbox(
	# value=CITATION_BUTTON_TEXT,
	# label=CITATION_BUTTON_LABEL,
	# lines=20,
	# elem_id="citation-button",
	# show_copy_button=True,
	# )

	scheduler = BackgroundScheduler()
	# Run restart_space on a fixed interval of 30 minutes (1800 seconds).
	scheduler.add_job(restart_space, trigger="interval", seconds=30 * 60)
	scheduler.start()
	# Enable the request queue (up to 40 concurrent runs by default), then serve.
	demo.queue(default_concurrency_limit=40).launch()