Ryan McConville committed · Commit db0d8a2 · Parent(s): 6a61a18

tweaks to layout and text
Files changed:
- app.py +139 -196
- docs.md +4 -0
- introduction.md +27 -0
- leaderboard/data/leaderboard.csv +15 -15
- leaderboard/data/rag_methods_compare.csv +12 -12
- rag_techniques.md +86 -0
- rag_techniques_details.md +64 -0
- rag_techniques_intro.md +23 -0
- src/about.py +2 -26
- src/populate.py +9 -3
- submit.md +4 -0
- verify.md +13 -0
app.py
CHANGED
@@ -60,11 +60,8 @@ def make_rate_chart(df: pd.DataFrame):
     return fig

 def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
-
-
-    Lowest value (best) at the top.
-    """
-    df_sorted = df.sort_values(col, ascending=False)  # best → worst
+
+    df_sorted = df.sort_values(col, ascending=False)
     fig = px.bar(
         df_sorted,
         x=col,
@@ -87,6 +84,100 @@ def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str
     return fig


+def make_rag_average_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
+    rag_cols = [
+        "Context in System Prompt (%)",
+        "Context and Question Single-Turn (%)",
+        "Context and Question Two-Turns (%)",
+    ]
+
+    df_plot = df.copy()
+    if col not in df_plot.columns:
+        df_plot[col] = df_plot[rag_cols].mean(axis=1, skipna=True).round(2)
+
+    df_plot["Std Dev"] = df_plot[rag_cols].std(axis=1, skipna=True).round(2)
+
+    df_sorted = df_plot.sort_values(col, ascending=False)
+
+    fig = px.bar(
+        df_sorted,
+        x=col,
+        y="Models",
+        orientation="h",
+        title=title,
+        height=400,
+        color_discrete_sequence=[bar_color],
+        error_x="Std Dev",
+    )
+    fig.update_traces(
+        texttemplate="%{x:.2f}",
+        textposition="inside",
+        insidetextanchor="middle",
+        cliponaxis=False,
+    )
+    fig.update_layout(
+        xaxis_title="Hallucination Rate (%)",
+        yaxis_title="",
+        yaxis=dict(dtick=1),
+        margin=dict(l=140, r=60, t=60, b=40),
+    )
+    return fig
+
+
+def make_rag_method_average_plot(df: pd.DataFrame, title: str, bar_color: str):
+    method_cols = [
+        "Context in System Prompt (%)",
+        "Context and Question Single-Turn (%)",
+        "Context and Question Two-Turns (%)",
+    ]
+
+    averages = df[method_cols].mean().round(2)
+    stds = df[method_cols].std().round(2)
+
+    avg_df = pd.DataFrame(
+        {
+            "RAG Method": averages.index,
+            "Average Hallucination Rate (%)": averages.values,
+            "Std Dev": stds.values,
+        }
+    )
+
+    fig = px.bar(
+        avg_df,
+        x="RAG Method",
+        y="Average Hallucination Rate (%)",
+        error_y="Std Dev",
+        title=title,
+        height=400,
+        color_discrete_sequence=[bar_color],
+    )
+    fig.update_traces(
+        texttemplate="%{y:.2f}" if 'orientation' not in fig.data[0] or fig.data[0].orientation == 'v' else "%{x:.2f}",
+        textposition="inside",
+        insidetextanchor="start",
+        cliponaxis=False,
+        textfont_color="white",
+    )
+    labels_map = {
+        "Context in System Prompt (%)": "Context in<br>System Prompt",
+        "Context and Question Single-Turn (%)": "Context & Question<br>Single-Turn",
+        "Context and Question Two-Turns (%)": "Context & Question<br>Two-Turns",
+    }
+    fig.update_xaxes(
+        tickmode="array",
+        tickvals=list(labels_map.keys()),
+        ticktext=list(labels_map.values()),
+        tickangle=0,
+        automargin=True,
+    )
+    fig.update_layout(
+        xaxis_title="",
+        yaxis_title="Hallucination Rate (%)",
+        margin=dict(l=40, r=100, t=60, b=120),
+    )
+    return fig
+
+
 def color_scale(s, cmap):
     """
     Return background-colour styles for a numeric Series (lower = greener,
@@ -120,15 +211,9 @@ except Exception:
     print(f"[WARN] Skipping RESULTS sync: {Exception}")


-# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
 RAG_DF = get_rag_leaderboard_df("leaderboard/data/rag_methods_compare.csv")

-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

 def init_leaderboard(df: pd.DataFrame):
     if df is None or df.empty:
@@ -199,7 +284,7 @@ with demo:
                 ),
                 show_label=False,
             )
-            gr.Markdown("*HaluEval-QA benchmark (RAG): The model receives a question plus supporting context. We report the % of answers that introduce facts not found in that context – lower is better. See the **
+            gr.Markdown("*HaluEval-QA benchmark (RAG): The model receives a question plus supporting context. We report the % of answers that introduce facts not found in that context – lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption")
         with gr.Column():
             gr.Plot(
                 make_leaderboard_plot(
@@ -210,25 +295,13 @@ with demo:
                 ),
                 show_label=False,
             )
-            gr.Markdown("*UltraChat benchmark (~11k prompts, non-RAG): Evaluates open-domain answers when only the question is given. Score is the % of hallucinated responses – lower is better. See the **
+            gr.Markdown("*UltraChat benchmark (~11k prompts, non-RAG): Evaluates open-domain answers when only the question is given. Score is the % of hallucinated responses – lower is better. See the **Methodology** section below for more information.*", elem_classes="plot-caption")

         # ---------- Leaderboard ----------
         leaderboard = init_leaderboard(LEADERBOARD_DF)

         # ---------- Get Started with Verify ----------
-        verify_markdown = (
-            """
-## Get started with Verify by kluster.ai
-
-Verify is an intelligent agent that validates LLM outputs in real-time.
-
-- **Blog post:** [Introducing Verify by kluster.ai](https://www.kluster.ai/blog/introducing-verify-by-kluster-ai-the-missing-trust-layer-in-your-ai-stack)
-- **Documentation:** [Verify overview & API reference](https://docs.kluster.ai/get-started/verify/overview/)
-- **Try it out in your browser:** [kluster.ai platform](https://platform.kluster.ai/verify)
-
-### Quick API examples
-"""
-        )
+        verify_markdown = (Path(__file__).parent / "verify.md").read_text()

         gr.Markdown(verify_markdown, elem_classes="markdown-text")

@@ -260,79 +333,44 @@ with demo:

             gr.Code(code_example_chat, language="shell")

+            with gr.Accordion("📖 Methodology & Benchmark Details", open=True):
+                gr.Markdown((Path(__file__).parent / "docs.md").read_text(), elem_classes="markdown-text")
+

         with gr.TabItem("🧪 RAG Techniques and Hallucinations", elem_id="llm-benchmark-tab-table", id=2):
-
-
-## Comparison of Different RAG Techniques and Hallucinations
-
-Many LLMs can generate fluent answers but still hallucinate facts – especially in RAG settings. This experiment aims to understand how different prompting strategies impact hallucination rates across models. It helps answer: Which prompt format is most reliable? Which models are more sensitive to prompt structure? The goal is to inform better design of RAG pipelines for reducing factual errors in downstream tasks.
-
-We presents hallucination rates for various LLMs under three different RAG prompting strategies. Each method delivers the same document context and question, but differs in how the information is structured during the prompt.
-
-### RAG Techniques Evaluated
-
-**1. Two-Turn Explicit RAG**
-The document and question are sent in separate user messages:
-```
-[System]: You are an assistant for question-answering tasks.
-Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
-You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
-The ANSWER also must not contradict information provided in the DOCUMENT.
-If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
-You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
-Use three sentences maximum and keep the answer concise.
-
-[User]: DOCUMENT: <context>
-[User]: QUESTION: <prompt>
-```
-This method creates a multi-turn format, which allows the model to treat the context and question independently.
-*Note: This method does not work on Gemma 3 27B due to its restriction on consecutive user messages without an intervening assistant response.*
-
-**2. System-Prompt Injection RAG**
-The document is embedded inside the system prompt, and the user sends only the question:
-```
-[System]: You are an assistant for question-answering tasks.
-Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
-You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
-The ANSWER also must not contradict information provided in the DOCUMENT.
-If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
-You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
-Use three sentences maximum and keep the answer concise.
-DOCUMENT: <context>
-
-[User]: <prompt>
-```
-This approach places the grounding context within the model's instruction space.
-
-**3. Single-Turn Concatenated RAG**
-Both the document and question are concatenated in a single user message:
-```
-[System]: You are an assistant for question-answering tasks.
-Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
-You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
-The ANSWER also must not contradict information provided in the DOCUMENT.
-If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
-You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
-Use three sentences maximum and keep the answer concise.
-
-[User]:
-DOCUMENT: <context>
-QUESTION: <prompt>
-
-```
-This is the most compact format, sending everything as one prompt input.
-
-### Metric
-
-The values in the table indicate the **hallucination rate (%)** of answers deemed factually incorrect or ungrounded given the provided context.
-
-"""
+            rag_intro_markdown = (Path(__file__).parent / "rag_techniques_intro.md").read_text()
+            rag_details_markdown = (Path(__file__).parent / "rag_techniques_details.md").read_text()

-
-        )
+            gr.Markdown(rag_intro_markdown, elem_classes="markdown-text")

-            gr.
+            with gr.Row():
+                with gr.Column():
+                    gr.Plot(
+                        make_rag_method_average_plot(
+                            RAG_DF,
+                            "Average Hallucination Rate by RAG Method (lower is better)",
+                            bar_color="#4CAF50",
+                        ),
+                        show_label=False,
+                    )
+                    gr.Markdown(
+                        "*Mean hallucination rate for each RAG prompting strategy across all models on the HaluEval-QA benchmark. Error bars represent ±1 SD; lower is better.*",
+                        elem_classes="plot-caption",
+                    )
+                with gr.Column():
+                    gr.Plot(
+                        make_rag_average_plot(
+                            RAG_DF,
+                            "Average Hallucination Rate (%)",
+                            "Average Hallucination Rate per Model (lower is better)",
+                            bar_color="#2196F3",
+                        ),
+                        show_label=False,
+                    )
+                    gr.Markdown(
+                        "*Mean hallucination rate across the three RAG prompting settings for each individual model. Error bars show ±1 SD across the three strategies; lower is better.*",
+                        elem_classes="plot-caption",
+                    )


         rag_leaderboard = Leaderboard(
@@ -341,9 +379,9 @@ with demo:
             select_columns=SelectColumns(
                 default_selection=[
                     "Models",
-                    "
-                    "
-                    "
+                    "Context in System Prompt (%)",
+                    "Context and Question Single-Turn (%)",
+                    "Context and Question Two-Turns (%)",
                 ],
                 cant_deselect=["Models"],
                 label="Select RAG Method Columns:",
@@ -354,111 +392,16 @@ with demo:
             height=700
         )

+        with gr.Accordion("📖 RAG Techniques & Benchmark Details", open=True):
+            gr.Markdown(rag_details_markdown, elem_classes="markdown-text")



-        with gr.TabItem("📝 Details", elem_id="llm-benchmark-tab-table", id=3):
-            gr.Markdown((Path(__file__).parent / "docs.md").read_text(), elem_classes="markdown-text")

         with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown((Path(__file__).parent / "submit.md").read_text(), elem_classes="markdown-text")

-
-        # with gr.Row():
-        #     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-        #     with gr.Column():
-        #         with gr.Accordion(
-        #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-        #             open=False,
-        #         ):
-        #             with gr.Row():
-        #                 finished_eval_table = gr.components.Dataframe(
-        #                     value=finished_eval_queue_df,
-        #                     headers=EVAL_COLS,
-        #                     datatype=EVAL_TYPES,
-        #                     row_count=5,
-        #                 )
-        #         with gr.Accordion(
-        #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-        #             open=False,
-        #         ):
-        #             with gr.Row():
-        #                 running_eval_table = gr.components.Dataframe(
-        #                     value=running_eval_queue_df,
-        #                     headers=EVAL_COLS,
-        #                     datatype=EVAL_TYPES,
-        #                     row_count=5,
-        #                 )
-
-        #         with gr.Accordion(
-        #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-        #             open=False,
-        #         ):
-        #             with gr.Row():
-        #                 pending_eval_table = gr.components.Dataframe(
-        #                     value=pending_eval_queue_df,
-        #                     headers=EVAL_COLS,
-        #                     datatype=EVAL_TYPES,
-        #                     row_count=5,
-        #                 )
-        # with gr.Row():
-        #     gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-        # with gr.Row():
-        #     with gr.Column():
-        #         model_name_textbox = gr.Textbox(label="Model name")
-        #         revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-        #         model_type = gr.Dropdown(
-        #             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-        #             label="Model type",
-        #             multiselect=False,
-        #             value=None,
-        #             interactive=True,
-        #         )
-
-        #     with gr.Column():
-        #         precision = gr.Dropdown(
-        #             choices=[i.value.name for i in Precision if i != Precision.Unknown],
-        #             label="Precision",
-        #             multiselect=False,
-        #             value="float16",
-        #             interactive=True,
-        #         )
-        #         weight_type = gr.Dropdown(
-        #             choices=[i.value.name for i in WeightType],
-        #             label="Weights type",
-        #             multiselect=False,
-        #             value="Original",
-        #             interactive=True,
-        #         )
-        #         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-        #     submit_button = gr.Button("Submit Eval")
-        #     submission_result = gr.Markdown()
-        #     submit_button.click(
-        #         add_new_eval,
-        #         [
-        #             model_name_textbox,
-        #             base_model_name_textbox,
-        #             revision_name_textbox,
-        #             precision,
-        #             weight_type,
-        #             model_type,
-        #         ],
-        #         submission_result,
-        #     )
-
-        # with gr.Row():
-        #     with gr.Accordion("📙 Citation", open=False):
-        #         citation_button = gr.Textbox(
-        #             value=CITATION_BUTTON_TEXT,
-        #             label=CITATION_BUTTON_LABEL,
-        #             lines=20,
-        #             elem_id="citation-button",
-        #             show_copy_button=True,
-        #         )
-
+
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
docs.md
CHANGED
@@ -1,3 +1,7 @@
+<!--
+keywords: hallucination detection documentation, LLM hallucination benchmark, RAG evaluation guide, Verify API, kluster.ai, retrieval-augmented generation evaluation, large language model accuracy
+-->
+
 # About

 As large language models (LLMs) continue to improve, evaluating how well they avoid hallucinations (producing information that is unfaithful or factually incorrect) has become increasingly important. While many models claim to be reliable, their factual grounding can vary significantly across tasks and settings.
introduction.md
ADDED
@@ -0,0 +1,27 @@
+<!--
+keywords: LLM hallucination detection, hallucination leaderboard, RAG hallucination benchmark, UltraChat hallucination rate, Verify API, kluster.ai, factual accuracy of language models, large language model evaluation
+-->
+
+The **LLM Hallucination Detection Leaderboard** is a public, continuously updated comparison of how well popular Large Language Models (LLMs) avoid *hallucinations*, responses that are factually incorrect, fabricated, or unsupported by evidence. By surfacing transparent metrics across tasks, we help practitioners choose models that they can trust in production.
+
+### Why does hallucination detection matter?
+
+* **User Trust & Safety** – Hallucinations undermine confidence and can damage reputation.
+* **Retrieval-Augmented Generation (RAG) Quality** – In enterprise workflows, LLMs must remain faithful to supplied context. Measuring hallucinations highlights which models respect that constraint.
+* **Regulatory & Compliance Pressure** – Upcoming AI regulations require demonstrable accuracy standards. Reliable hallucination metrics can help you meet these requirements.
+
+### How we measure hallucinations
+
+We evaluate each model on two complementary benchmarks and compute a *hallucination rate* (lower = better):
+
+1. **HaluEval-QA (RAG setting)** – Given a question *and* a supporting document, the model must answer *only* using the provided context.
+2. **UltraChat Filtered (Non-RAG setting)** – Open-domain questions with **no** extra context test the model's internal knowledge.
+
+Outputs are automatically verified by [Verify](https://platform.kluster.ai/verify) from [kluster.ai](https://kluster.ai/), which cross-checks claims against the source document or web results.
+
+> **Note:** Full experiment details, including prompt templates, dataset description, and evaluation methodology, are provided at the end of this page for reference.
+
+---
+
+Stay informed as we add new models and tasks, and follow us on [X](https://x.com/klusterai) or join Discord [here](https://discord.com/invite/klusterai) for the latest updates on trustworthy LLMs.
+
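Editor's note: the *hallucination rate* described above reduces to flagged responses divided by total responses. A minimal sketch of that arithmetic, assuming a hypothetical `is_hallucination` column rather than the leaderboard's actual schema:

```python
import pandas as pd

def hallucination_rate(results: pd.DataFrame, flag_col: str = "is_hallucination") -> float:
    """Percentage of responses flagged as hallucinated; lower is better."""
    return round(100.0 * results[flag_col].mean(), 2)

# Four responses, one flagged by the judge -> 25.0% hallucination rate.
demo = pd.DataFrame({"is_hallucination": [False, True, False, False]})
print(hallucination_rate(demo))
```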
leaderboard/data/leaderboard.csv
CHANGED
@@ -1,16 +1,16 @@
 Models,ha_rag_rate,ha_non_rag_rate
-klusterai
-
-
-
-
-
-
-
-klusterai
-
-
-
-
-
-
+klusterai-Meta-Llama-3.1-8B-Instruct-Turbo,8.1,12.5
+Qwen2.5-VL-7B-Instruct,9.35,4.55
+Mistral-Nemo-Instruct-2407,10.63,8.74
+Llama-4-Maverick-17B-128E-Instruct-FP8,3.34,0.69
+Llama-4-Scout-17B-16E-Instruct,4.23,2.48
+Mistral-Small-24B-Instruct-2501,4.74,7.85
+Magistral-Small-2506,8.62,28.07
+gemma-3-27b-it,3.71,0.48
+klusterai-Meta-Llama-3.3-70B-Instruct-Turbo,2.12,1.09
+DeepSeek-V3-0324,4.66,0.91
+Qwen3-235B-A22B-FP8,5.04,0.88
+DeepSeek-R1-0528,2.26,0.78
+gpt-4o,6.05,0.64
+claude-sonnet-4,2.21,0.6
+gemini-2.5-pro,1.57,0.36
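For orientation, the two value columns map to the plot captions in app.py: `ha_rag_rate` is the HaluEval-QA (RAG) hallucination % and `ha_non_rag_rate` is the UltraChat (non-RAG) %. A quick sketch for inspecting the refreshed data:

```python
import pandas as pd

df = pd.read_csv("leaderboard/data/leaderboard.csv")
# Lower is better on both benchmarks; sort to surface the most grounded models.
print(df.sort_values("ha_rag_rate").head(3))
```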
leaderboard/data/rag_methods_compare.csv
CHANGED
@@ -1,13 +1,13 @@
 Models,rag1,rag2,rag3
-Meta
-Qwen2.5-VL
-Mistral
-
-
-Mistral
-Magistral
-
-Meta
-DeepSeek-V3-0324,7.
-Qwen3-235B-A22B,6.
-DeepSeek-R1-0528,3.
+klusterai-Meta-Llama-3.1-8B-Instruct-Turbo,13.09,8.1,11.92
+Qwen2.5-VL-7B-Instruct,13.39,9.35,13.24
+Mistral-Nemo-Instruct-2407,13.99,10.63,14.42
+Llama-4-Maverick-17B-128E-Instruct-FP8,6.27,3.34,5.72
+Llama-4-Scout-17B-16E-Instruct,7.17,4.23,6.98
+Mistral-Small-24B-Instruct-2501,7.5,4.74,7.09
+Magistral-Small-2506,12.09,8.62,11.87
+gemma-3-27b-it,6.09,3.71,-
+klusterai-Meta-Llama-3.3-70B-Instruct-Turbo,4.63,2.12,4.65
+DeepSeek-V3-0324,7.71,4.66,7.09
+Qwen3-235B-A22B-FP8,6.8,5.04,6.63
+DeepSeek-R1-0528,3.52,2.26,3.58
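Note the `-` in gemma-3-27b-it's `rag3` cell: the two-turn format its chat template rejects. A hedged sketch of loading this file so the missing cell stays out of the averages; the `na_values` argument is an assumption here, not something `src/populate.py` currently passes:

```python
import pandas as pd

# Read "-" as missing so rag3 stays numeric; pandas then skips NaN in means.
df = pd.read_csv("leaderboard/data/rag_methods_compare.csv", na_values=["-"])
df = df.rename(columns={
    "rag1": "Context in System Prompt (%)",
    "rag2": "Context and Question Single-Turn (%)",
    "rag3": "Context and Question Two-Turns (%)",
})
print(df.iloc[:, 1:].mean().round(2))  # per-method averages across models
```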
rag_techniques.md
ADDED
@@ -0,0 +1,86 @@
+<!--
+keywords: RAG techniques, Retrieval-Augmented Generation prompt engineering, hallucination detection, LLM hallucination rate, kluster.ai, Verify API, prompt design comparison, large language model evaluation
+-->
+# Comparison of Retrieval-Augmented Generation (RAG) Prompting Techniques and Hallucinations
+
+LLMs can generate fluent answers but still hallucinate facts, especially in Retrieval-Augmented Generation (RAG) workflows. This leaderboard aims to understand how different prompt engineering strategies impact hallucination rates across models. In other words: Which prompt format is most reliable? Which models are more sensitive to prompt structure? The goal is to inform better design of RAG pipelines so you can reduce factual errors in downstream applications.
+
+We present hallucination rates for various LLMs under three RAG request strategies. Each method delivers the same document context and question, but differs in how the information is structured during the request.
+
+## Overview
+
+- **What we measure**: Hallucination rate (%) across three RAG request patterns.
+- **RAG patterns compared**:
+  1) **System Prompt**: context is placed in the system message; user sends only the question.
+  2) **Single-Turn**: one user message that includes both the context *and* the question.
+  3) **Two-Turn**: first user message provides the context, a second user message provides the question.
+- **Why it matters**: Request structure can change reliability significantly. Knowing the safest default helps you ship trustworthy RAG systems faster.
+- **Detect & reduce hallucinations**: The same [Verify](https://platform.kluster.ai/verify) API used for these evaluations can be plugged into your pipeline to flag and filter ungrounded answers in real time.
+- **How to read the charts**: Lower bars = fewer hallucinations. Error bars show ±1 SD across models.
+- **Experiment summary**: 10,000 HaluEval-QA examples, temperature 0, judged with [Verify](https://docs.kluster.ai/get-started/verify/overview/).
+
+### RAG Techniques Evaluated
+
+**1. RAG with Context in System Prompt**
+The document is embedded inside the system prompt, and the user sends only the question:
+```text
+[System]: You are an assistant for question-answering tasks.
+Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
+You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
+The ANSWER also must not contradict information provided in the DOCUMENT.
+If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
+You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
+Use three sentences maximum and keep the answer concise.
+DOCUMENT: <context>
+
+[User]: <prompt>
+```
+
+**2. RAG with Context and Question in Single-Turn**
+Both the document and question are concatenated in a single user message:
+```text
+[System]: You are an assistant for question-answering tasks.
+Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
+You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
+The ANSWER also must not contradict information provided in the DOCUMENT.
+If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
+You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
+Use three sentences maximum and keep the answer concise.
+
+[User]:
+DOCUMENT: <context>
+QUESTION: <prompt>
+
+```
+
+**3. RAG with Context and Question in Two-Turns**
+The document and question are sent in separate user messages:
+```text
+[System]: You are an assistant for question-answering tasks.
+Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
+You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
+The ANSWER also must not contradict information provided in the DOCUMENT.
+If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
+You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
+Use three sentences maximum and keep the answer concise.
+
+[User]: DOCUMENT: <context>
+[User]: QUESTION: <prompt>
+```
+*Note: This method did **not** work on Gemma 3 27B with the default chat template due to its restriction on consecutive user messages without an intervening assistant response.*
+
+### Dataset
+We evaluate all three prompting strategies on the **HaluEval QA** benchmark, a large-scale collection of RAG question-answer examples.
+- **Source**: [HaluEval QA](https://huggingface.co/datasets/pminervini/HaluEval/viewer/qa?views%5B%5D=qa)
+- **Size**: 10,000 question-document pairs
+- **Content**: Each example contains a short passage (extracted primarily from Wikipedia-style articles) and an accompanying question that can be answered **only** from that passage.
+- **Use case**: Designed to measure whether an LLM can remain faithful to supplied context without inventing new facts.
+
+All prompts are generated with *temperature = 0* to remove randomness so that differences in hallucination rate stem solely from the prompt format.
+
+### Metric
+
+The values in the table indicate the **hallucination rate (%)** of answers deemed factually incorrect or ungrounded given the provided context.
+
+Hallucination rates are automatically computed using **[Verify](https://platform.kluster.ai/verify)** by [kluster.ai](https://kluster.ai/), the [leading](https://www.kluster.ai/blog/introducing-verify-by-kluster-ai-the-missing-trust-layer-in-your-ai-stack) AI-powered hallucination detection API that cross-checks model claims against the source document.
+
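To make the three layouts concrete, here is a sketch of the message lists they correspond to for any OpenAI-compatible chat-completions client; `SYSTEM` stands for the instruction block quoted above, and the helper names are illustrative, not code from this repo:

```python
SYSTEM = "You are an assistant for question-answering tasks. ..."  # full block quoted above

def context_in_system_prompt(context: str, question: str) -> list[dict]:
    # 1. Document embedded in the system message; user sends only the question.
    return [
        {"role": "system", "content": f"{SYSTEM}\nDOCUMENT: {context}"},
        {"role": "user", "content": question},
    ]

def single_turn(context: str, question: str) -> list[dict]:
    # 2. Document and question concatenated in one user message.
    return [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": f"DOCUMENT: {context}\nQUESTION: {question}"},
    ]

def two_turns(context: str, question: str) -> list[dict]:
    # 3. Document and question as consecutive user messages; Gemma 3 27B's
    # default chat template rejects this shape, as noted above.
    return [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": f"DOCUMENT: {context}"},
        {"role": "user", "content": f"QUESTION: {question}"},
    ]
```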
rag_techniques_details.md
ADDED
@@ -0,0 +1,64 @@
+### RAG Techniques Evaluated
+
+**1. RAG with Context in System Prompt**
+The document is embedded inside the system prompt, and the user sends only the question:
+```text
+[System]: You are an assistant for question-answering tasks.
+Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
+You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
+The ANSWER also must not contradict information provided in the DOCUMENT.
+If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
+You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
+Use three sentences maximum and keep the answer concise.
+DOCUMENT: <context>
+
+[User]: <prompt>
+```
+
+**2. RAG with Context and Question in Single-Turn**
+Both the document and question are concatenated in a single user message:
+```text
+[System]: You are an assistant for question-answering tasks.
+Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
+You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
+The ANSWER also must not contradict information provided in the DOCUMENT.
+If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
+You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
+Use three sentences maximum and keep the answer concise.
+
+[User]:
+DOCUMENT: <context>
+QUESTION: <prompt>
+
+```
+
+**3. RAG with Context and Question in Two-Turns**
+The document and question are sent in separate user messages:
+```text
+[System]: You are an assistant for question-answering tasks.
+Given the QUESTION and DOCUMENT you must answer the QUESTION using the information in the DOCUMENT.
+You must not offer new information beyond the context provided in the DOCUMENT. Do not add any external knowledge.
+The ANSWER also must not contradict information provided in the DOCUMENT.
+If the DOCUMENT does not contain the facts to answer the QUESTION or you do not know the answer, you truthfully say that you do not know.
+You have access to information provided by the user as DOCUMENT to answer the QUESTION, and nothing else.
+Use three sentences maximum and keep the answer concise.
+
+[User]: DOCUMENT: <context>
+[User]: QUESTION: <prompt>
+```
+*Note: This method did **not** work on Gemma 3 27B with the default chat template due to its restriction on consecutive user messages without an intervening assistant response.*
+
+### Dataset
+We evaluate all three prompting strategies on the **HaluEval QA** benchmark, a large-scale collection of RAG question-answer examples.
+- **Source**: [HaluEval QA](https://huggingface.co/datasets/pminervini/HaluEval/viewer/qa?views%5B%5D=qa)
+- **Size**: 10,000 question-document pairs
+- **Content**: Each example contains a short passage (extracted primarily from Wikipedia-style articles) and an accompanying question that can be answered **only** from that passage.
+- **Use case**: Designed to measure whether an LLM can remain faithful to supplied context without inventing new facts.
+
+All prompts are generated with *temperature = 0* to remove randomness so that differences in hallucination rate stem solely from the prompt format.
+
+### Metric
+
+The values in the table indicate the **hallucination rate (%)** of answers deemed factually incorrect or ungrounded given the provided context.
+
+Hallucination rates are automatically computed using **[Verify](https://platform.kluster.ai/verify)** by [kluster.ai](https://kluster.ai/), the [leading](https://www.kluster.ai/blog/introducing-verify-by-kluster-ai-the-missing-trust-layer-in-your-ai-stack) AI-powered hallucination detection API that cross-checks model claims against the source document.
rag_techniques_intro.md
ADDED
@@ -0,0 +1,23 @@
+<!--
+keywords: RAG techniques, Retrieval-Augmented Generation prompt engineering, hallucination detection, LLM hallucination rate, kluster.ai, Verify API, prompt design comparison, large language model evaluation
+-->
+# Comparison of Retrieval-Augmented Generation (RAG) Prompting Techniques and Hallucinations
+
+LLMs can generate fluent answers but still hallucinate facts, especially in Retrieval-Augmented Generation (RAG) workflows. This leaderboard aims to understand how different prompt engineering strategies impact hallucination rates across models. In other words: Which prompt format is most reliable? Which models are more sensitive to prompt structure? The goal is to inform better design of RAG pipelines so you can reduce factual errors in downstream applications.
+
+We present hallucination rates for various LLMs under three RAG request strategies. Each method delivers the same document context and question, but differs in how the information is structured during the request.
+
+## Overview
+
+- **What we measure**: Hallucination rate (%) across three RAG request patterns.
+- **RAG patterns compared**:
+  1) **System Prompt**: context is placed in the system message; user sends only the question.
+  2) **Single-Turn**: one user message that includes both the context *and* the question.
+  3) **Two-Turn**: first user message provides the context, a second user message provides the question.
+- **Why it matters**: Request structure can change reliability significantly. Knowing the safest default helps you ship trustworthy RAG systems faster.
+- **Detect & reduce hallucinations**: The same [Verify](https://platform.kluster.ai/verify) API used for these evaluations can be plugged into your pipeline to flag and filter ungrounded answers in real time.
+- **How to read the charts**: Lower bars = fewer hallucinations. Error bars show ±1 SD across models.
+- **Experiment summary**: 10,000 HaluEval-QA examples, temperature 0, judged with [Verify](https://docs.kluster.ai/get-started/verify/overview/).
+
+> **Note:** Full experiment details, including prompt templates, dataset description, and evaluation methodology, are provided at the end of this page for reference.
+<br>
src/about.py
CHANGED
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from enum import Enum
+from pathlib import Path

 @dataclass
 class Task:
@@ -23,32 +24,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
 TITLE = """<h1 align="center" id="space-title">LLM Hallucination Detection Leaderboard</h1>"""

 # What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
-<!--
-keywords: LLM hallucination detection, hallucination leaderboard, RAG hallucination benchmark, UltraChat hallucination rate, Verify API, kluster.ai, factual accuracy of language models, large language model evaluation
--->
-
-The **LLM Hallucination Detection Leaderboard** is a public, continuously updated comparison of how well popular Large Language Models (LLMs) avoid *hallucinations*, responses that are factually incorrect, fabricated, or unsupported by evidence. By surfacing transparent metrics across tasks, we help practitioners choose models that they can trust in production.
-
-### Why does hallucination detection matter?
-
-* **User Trust & Safety** – Hallucinations undermine confidence and can damage reputation.
-* **Retrieval-Augmented Generation (RAG) Quality** – In enterprise workflows, LLMs must remain faithful to supplied context. Measuring hallucinations highlights which models respect that constraint.
-* **Regulatory & Compliance Pressure** – Upcoming AI regulations require demonstrable accuracy standards. Reliable hallucination metrics can help you meet these requirements.
-
-### How we measure hallucinations
-
-We evaluate each model on two complementary benchmarks and compute a *hallucination rate* (lower = better):
-
-1. **HaluEval-QA (RAG setting)** – Given a question *and* a supporting document, the model must answer *only* using the provided context.
-2. **UltraChat Filtered (Non-RAG setting)** – Open-domain questions with **no** extra context test the model's internal knowledge.
-
-Outputs are automatically verified by [Verify](https://platform.kluster.ai/verify) from [kluster.ai](https://kluster.ai/), which cross-checks claims against the source document or web results.
-
----
-
-Stay informed as we add new models and tasks, and follow us on [X](https://x.com/klusterai) or join Discord [here](https://discord.com/invite/klusterai) for the latest updates on trustworthy LLMs.
-"""
+INTRODUCTION_TEXT = (Path(__file__).resolve().parent.parent / "introduction.md").read_text()

 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
src/populate.py
CHANGED
@@ -15,11 +15,17 @@ def get_rag_leaderboard_df(csv_path):

     pretty = {
         "Models": "Models",
-        "rag1": "
-        "rag2": "
-        "rag3": "
+        "rag1": "Context in System Prompt (%)",
+        "rag2": "Context and Question Single-Turn (%)",
+        "rag3": "Context and Question Two-Turns (%)",
     }
     df = df.rename(columns=pretty)
+
+    # sort so the lowest Single-Turn hallucination rate appears first
+    df = (
+        df.sort_values("Context and Question Single-Turn (%)", ascending=True)
+        .reset_index(drop=True)
+    )

     return df

submit.md
CHANGED
@@ -1,3 +1,7 @@
+<!--
+keywords: LLM hallucination leaderboard submission, Verify leaderboard guidelines, kluster.ai, hallucination benchmark contributions, large language model evaluation submission
+-->
+
 # LLM Hallucination Detection Leaderboard Submission Guidelines

 Thank you for your interest in contributing to the **LLM Hallucination Detection Leaderboard**! We welcome submissions from researchers and practitioners who have built or finetuned language models that can be evaluated on our hallucination benchmarks.
verify.md
ADDED
@@ -0,0 +1,13 @@
+<!--
+keywords: Verify by kluster.ai, LLM verification API, hallucination detection tool, factual accuracy checker, AI output validation, large language model trust layer
+-->
+
+## Get started with Verify by kluster.ai
+
+Verify is an intelligent agent that validates LLM outputs in real-time.
+
+- **Blog post:** [Introducing Verify by kluster.ai](https://www.kluster.ai/blog/introducing-verify-by-kluster-ai-the-missing-trust-layer-in-your-ai-stack)
+- **Documentation:** [Verify overview & API reference](https://docs.kluster.ai/get-started/verify/overview/)
+- **Try it out in your browser:** [kluster.ai platform](https://platform.kluster.ai/verify)
+
+### Quick API examples
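In the app, this heading is followed by the shell snippets held in `code_example_chat`. As a hedged stand-in, here is a minimal Python sketch of a chat request to kluster.ai's OpenAI-compatible API; the endpoint URL, model name, and payload shape are illustrative assumptions, so see the documentation linked above for the exact Verify request format:

```python
import os
import requests

# Illustrative values; consult the Verify docs for the real request shape.
resp = requests.post(
    "https://api.kluster.ai/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['KLUSTER_API_KEY']}"},
    json={
        "model": "klusterai/Meta-Llama-3.3-70B-Instruct-Turbo",
        "messages": [{"role": "user", "content": "Who wrote Middlemarch?"}],
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```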