aloe-vera committed
Commit 73adc36 · verified · 1 Parent(s): 27b730c

leaderboard v1

Files changed (4):
  1. app.py +312 -203
  2. docs.md +48 -0
  3. leaderboard/data/leaderboard.csv +16 -0
  4. static/kluster-color.png +0 -0
app.py CHANGED
@@ -1,204 +1,313 @@
- import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
- import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download
-
- from src.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     TITLE,
- )
- from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
-
-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
-         interactive=False,
-     )
-
-
- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )
-
-     with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
-             citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )
-
- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
+ import gradio as gr
+ import pandas as pd
+ from pathlib import Path
+ import plotly.express as px
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     BENCHMARK_COLS,
+     COLS,
+     EVAL_COLS,
+     EVAL_TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+ import base64
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+
+ def make_rate_chart(df: pd.DataFrame):
+     """Return a Plotly bar chart of RAG vs. Non-RAG hallucination rates."""
+     # long-form dataframe for grouped bars
+     df_long = df.melt(
+         id_vars="Models",
+         value_vars=["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"],
+         var_name="Benchmark",
+         value_name="Rate",
+     )
+     fig = px.bar(
+         df_long,
+         x="Models",
+         y="Rate",
+         color="Benchmark",
+         barmode="group",
+         title="Hallucination Rates by Model",
+         height=400,
+     )
+     fig.update_layout(xaxis_title="", yaxis_title="%")
+     return fig
+
+
+ def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
+     """Return a horizontal bar chart of `col` with the lowest (best) value at the top."""
+     df_sorted = df.sort_values(col, ascending=False)  # highest first, so the best (lowest) bar renders at the top
+     fig = px.bar(
+         df_sorted,
+         x=col,
+         y="Models",
+         orientation="h",
+         title=title,
+         text_auto=".2f",
+         height=400,
+         color_discrete_sequence=[bar_color],
+     )
+     fig.update_traces(textposition="outside", cliponaxis=False)
+     fig.update_layout(
+         xaxis_title="Hallucination Rate (%)",
+         yaxis_title="",
+         yaxis=dict(dtick=1),  # ensure every model is shown
+         margin=dict(l=140, r=60, t=60, b=40),
+     )
+     return fig
+
+
+ def color_scale(s, cmap):
+     """
+     Return background-colour styles for a numeric Series (lower = greener,
+     higher = redder). Works with any palette length.
+     """
+     colours = px.colors.sequential.__dict__[cmap]
+     n = len(colours) - 1  # max valid index
+
+     rng = s.max() - s.min()
+     norm = (s - s.min()) / (rng if rng else 1)
+
+     return [f"background-color:{colours[int(v * n)]}" for v in 1 - norm]
+
+
+ ### Space initialisation
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception as e:
+     # restart_space()
+     print(f"[WARN] Skipping REQUESTS sync: {e}")
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception as e:
+     # restart_space()
+     print(f"[WARN] Skipping RESULTS sync: {e}")
+
+
+ # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+ LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
+
+ # (
+ #     finished_eval_queue_df,
+ #     running_eval_queue_df,
+ #     pending_eval_queue_df,
+ # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+
+ def init_leaderboard(df: pd.DataFrame):
+     if df is None or df.empty:
+         raise ValueError("Leaderboard DataFrame is empty or None.")
+
+     return Leaderboard(
+         value=df,
+         datatype=["markdown", "markdown", "number", "number", "number"],
+         select_columns=SelectColumns(
+             default_selection=[
+                 "Rank", "Models",
+                 "Average Hallucination Rate (%)",
+                 "RAG Hallucination Rate (%)",
+                 "Non-RAG Hallucination Rate (%)",
+             ],
+             cant_deselect=["Models", "Rank"],
+             label="Select Columns to Display:",
+         ),
+         search_columns=["Models"],
+         # column_widths=["3%"],
+         bool_checkboxgroup_label=None,
+         interactive=False,
+     )
+
+
+ image_path = "static/kluster-color.png"
+ with open(image_path, "rb") as img_file:
+     b64_string = base64.b64encode(img_file.read()).decode("utf-8")
+
+
+ # print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     gr.HTML(f"""
+     <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
+         <img src="data:image/png;base64,{b64_string}" alt="KlusterAI logo" style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />
+         <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em;">
+             LLM Hallucination Detection <span style="color: #0057ff;">Leaderboard</span>
+         </div>
+         <div style="font-size: 1.5em; color: #444; margin-top: 0.5em;">
+             Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with
+             <a href="https://platform.kluster.ai/verify" target="_blank" style="color: #0057ff; text-decoration: none;">
+                 Verify
+             </a> by
+             <a href="https://platform.kluster.ai/" target="_blank" style="color: #0057ff; text-decoration: none;">
+                 KlusterAI
+             </a>
+         </div>
+     </div>
+     """)
+
+     # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 Hallucination Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+             # ---------- Charts ----------
+             with gr.Row():
+                 gr.Plot(
+                     make_leaderboard_plot(
+                         LEADERBOARD_DF,
+                         "RAG Hallucination Rate (%)",
+                         "RAG Hallucination Rate (lower is better)",
+                         bar_color="#4CAF50",
+                     ),
+                     show_label=False,
+                 )
+                 gr.Plot(
+                     make_leaderboard_plot(
+                         LEADERBOARD_DF,
+                         "Non-RAG Hallucination Rate (%)",
+                         "Non-RAG Hallucination Rate (lower is better)",
+                         bar_color="#FF7043",
+                     ),
+                     show_label=False,
+                 )
+
+             # ---------- Leaderboard ----------
+             leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+         with gr.TabItem("📝 Document", elem_id="llm-benchmark-tab-table", id=2):
+             gr.Markdown((Path(__file__).parent / "docs.md").read_text())
+
+         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+             # expects submit.md next to app.py (the file is not part of this commit)
+             gr.Markdown((Path(__file__).parent / "submit.md").read_text())
+
+         # with gr.Column():
+         #     with gr.Row():
+         #         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+         #     with gr.Column():
+         #         with gr.Accordion(
+         #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+         #             open=False,
+         #         ):
+         #             with gr.Row():
+         #                 finished_eval_table = gr.components.Dataframe(
+         #                     value=finished_eval_queue_df,
+         #                     headers=EVAL_COLS,
+         #                     datatype=EVAL_TYPES,
+         #                     row_count=5,
+         #                 )
+         #         with gr.Accordion(
+         #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+         #             open=False,
+         #         ):
+         #             with gr.Row():
+         #                 running_eval_table = gr.components.Dataframe(
+         #                     value=running_eval_queue_df,
+         #                     headers=EVAL_COLS,
+         #                     datatype=EVAL_TYPES,
+         #                     row_count=5,
+         #                 )
+
+         #         with gr.Accordion(
+         #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+         #             open=False,
+         #         ):
+         #             with gr.Row():
+         #                 pending_eval_table = gr.components.Dataframe(
+         #                     value=pending_eval_queue_df,
+         #                     headers=EVAL_COLS,
+         #                     datatype=EVAL_TYPES,
+         #                     row_count=5,
+         #                 )
+         # with gr.Row():
+         #     gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+         # with gr.Row():
+         #     with gr.Column():
+         #         model_name_textbox = gr.Textbox(label="Model name")
+         #         revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+         #         model_type = gr.Dropdown(
+         #             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+         #             label="Model type",
+         #             multiselect=False,
+         #             value=None,
+         #             interactive=True,
+         #         )
+
+         #     with gr.Column():
+         #         precision = gr.Dropdown(
+         #             choices=[i.value.name for i in Precision if i != Precision.Unknown],
+         #             label="Precision",
+         #             multiselect=False,
+         #             value="float16",
+         #             interactive=True,
+         #         )
+         #         weight_type = gr.Dropdown(
+         #             choices=[i.value.name for i in WeightType],
+         #             label="Weights type",
+         #             multiselect=False,
+         #             value="Original",
+         #             interactive=True,
+         #         )
+         #         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+         #     submit_button = gr.Button("Submit Eval")
+         #     submission_result = gr.Markdown()
+         #     submit_button.click(
+         #         add_new_eval,
+         #         [
+         #             model_name_textbox,
+         #             base_model_name_textbox,
+         #             revision_name_textbox,
+         #             precision,
+         #             weight_type,
+         #             model_type,
+         #         ],
+         #         submission_result,
+         #     )
+
+     # with gr.Row():
+     #     with gr.Accordion("📙 Citation", open=False):
+     #         citation_button = gr.Textbox(
+     #             value=CITATION_BUTTON_TEXT,
+     #             label=CITATION_BUTTON_LABEL,
+     #             lines=20,
+     #             elem_id="citation-button",
+     #             show_copy_button=True,
+     #         )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
  demo.queue(default_concurrency_limit=40).launch()
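
Note: `color_scale` (and `make_rate_chart`) are defined in the new app.py but not wired to any component in this commit. Below is a minimal usage sketch for `color_scale` with a pandas Styler; the toy dataframe and the `Greens` palette are illustrative assumptions, not taken from the source.

```python
import pandas as pd
import plotly.express as px

def color_scale(s, cmap):
    """Same helper as in app.py: one CSS background-color per cell of a numeric Series."""
    colours = px.colors.sequential.__dict__[cmap]
    n = len(colours) - 1
    rng = s.max() - s.min()
    norm = (s - s.min()) / (rng if rng else 1)
    return [f"background-color:{colours[int(v * n)]}" for v in 1 - norm]

toy = pd.DataFrame({
    "Models": ["model-a", "model-b", "model-c"],
    "RAG Hallucination Rate (%)": [2.1, 4.7, 9.3],
})

# Styler.apply forwards extra keyword arguments to the styling function, column by column.
styled = toy.style.apply(color_scale, cmap="Greens", subset=["RAG Hallucination Rate (%)"])
html_table = styled.to_html()  # could then be rendered with gr.HTML(html_table)
```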
docs.md ADDED
@@ -0,0 +1,48 @@
+ # About
+
+ As large language models (LLMs) continue to improve, evaluating how well they avoid hallucinations (producing information that is unfaithful or factually incorrect) has become increasingly important. While many models claim to be reliable, their factual grounding can vary significantly across tasks and settings.
+
+ This leaderboard provides a standardised evaluation of how prone different LLMs are to hallucination. Our goal is to help researchers and developers understand which models are more trustworthy in both grounded (context-based) and open-ended (real-world knowledge) settings. We use [Verify](https://platform.kluster.ai/verify) by [KlusterAI](https://platform.kluster.ai/), an automated hallucination detection tool, to evaluate the factual consistency of model outputs.
+
+ ---
+
+ # Tasks
+
+ We evaluate each model using two benchmarks; a request sketch covering both settings follows the task descriptions.
+
+ ## Retrieval-Augmented Generation (RAG setting)
+
+ RAG evaluates how well a model stays faithful to a provided context when answering a question. The input consists of a synthetic or real context paired with a relevant question. Models are expected to generate answers using **only the information given**, without adding external knowledge or contradicting the context.
+
+ - **Source**: [HaluEval QA](https://huggingface.co/datasets/pminervini/HaluEval/viewer/qa?views%5B%5D=qa)
+ - **Dataset Size**: 10,000 question-context pairs
+ - **Prompt Format**: Prompt with relevant context document
+ - **Temperature**: 0 (to enforce deterministic, grounded outputs)
+ - **System Prompt**: Instructs the model to use only the document and avoid guessing.
+
+ ## Real-World Knowledge (Non-RAG setting)
+
+ This setting evaluates how factually accurate a model is when **no context is provided**. The model must rely solely on its internal knowledge to answer a broad range of user questions across many topics. The answers are then verified using web search to determine factual correctness.
+
+ - **Source**: Filtered from [UltraChat](https://huggingface.co/datasets/stingning/ultrachat) prompts
+ - **Dataset Size**: 11,746 single-turn user queries
+ - **Prompt Format**: Single user prompt without additional context
+ - **Temperature**: 1 (to reflect natural, fluent generation)
+ - **System Prompt**: Encourages helpfulness, accuracy, and honesty when unsure.
+
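
The two settings differ only in whether a context document is supplied, the system prompt, and the sampling temperature. Below is a minimal sketch of how such requests could be issued against an OpenAI-compatible endpoint; the base URL, model id, and prompt wording are illustrative assumptions, not the exact configuration used for the leaderboard.

```python
from openai import OpenAI

# Illustrative only: base_url, API key handling, and prompt wording are assumptions.
client = OpenAI(base_url="https://api.kluster.ai/v1", api_key="YOUR_API_KEY")

RAG_SYSTEM = "Answer using only the provided document. If the answer is not in the document, say you don't know."
NON_RAG_SYSTEM = "Be helpful and accurate. If you are unsure, say so honestly."

def ask(question: str, context: str | None = None) -> str:
    """RAG setting when a context document is given (temperature 0), Non-RAG otherwise (temperature 1)."""
    if context is not None:
        messages = [
            {"role": "system", "content": RAG_SYSTEM},
            {"role": "user", "content": f"Document:\n{context}\n\nQuestion: {question}"},
        ]
        temperature = 0
    else:
        messages = [
            {"role": "system", "content": NON_RAG_SYSTEM},
            {"role": "user", "content": question},
        ]
        temperature = 1
    resp = client.chat.completions.create(
        model="klusterai/Meta-Llama-3.3-70B-Instruct-Turbo",  # any model from the leaderboard CSV
        messages=messages,
        temperature=temperature,
    )
    return resp.choices[0].message.content
```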
+ ---
+
+ # Evaluation Method
+
+ We use **Verify**, a hallucination detection tool built by KlusterAI, to classify model outputs:
+
+ - In the **RAG setting**, Verify checks if the output contradicts, fabricates, or strays from the input document.
+ - In the **real-world knowledge setting**, Verify uses search queries to fact-check the answer based on current, public information.
+
+ Each model's hallucination rate is computed as:
+
+ ### Hallucination Rate = (Number of hallucinated outputs) / (Total number of prompts)
+
+ A **lower** hallucination rate indicates **better** performance.
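
The formula reduces to a few lines of code once Verify's per-prompt verdicts are available. A minimal sketch, assuming the verdicts have already been mapped to booleans (True = flagged as hallucinated; the verdict format itself is not shown in this commit) and expressing the result as a percentage to match the (%) columns in the leaderboard:

```python
def hallucination_rate(verdicts: list[bool]) -> float:
    """Percentage of prompts whose output was flagged as hallucinated."""
    if not verdicts:
        raise ValueError("No verdicts to score.")
    return 100.0 * sum(verdicts) / len(verdicts)

# e.g. 3 flagged outputs out of 40 prompts -> 7.5 (%)
print(hallucination_rate([True] * 3 + [False] * 37))
```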
leaderboard/data/leaderboard.csv ADDED
@@ -0,0 +1,16 @@
+ Models,ha_rag_rate,ha_non_rag_rate
+ klusterai/Meta-Llama-3.1-8B-Instruct-Turbo,8.1,12.5
+ Qwen/Qwen2.5-VL-7B-Instruct,9.35,4.55
+ mistralai/Mistral-Nemo-Instruct-2407,10.63,8.74
+ meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,3.34,0.69
+ meta-llama/Llama-4-Scout-17B-16E-Instruct,4.23,2.48
+ mistralai/Mistral-Small-24B-Instruct-2501,4.74,7.85
+ mistralai/Magistral-Small-2506,8.62,28.07
+ google/gemma-3-27b-it,3.71,0.48
+ klusterai/Meta-Llama-3.3-70B-Instruct-Turbo,2.12,1.09
+ deepseek-ai/DeepSeek-V3-0324,4.66,0.91
+ Qwen/Qwen3-235B-A22B-FP8,5.04,0.88
+ deepseek-ai/DeepSeek-R1-0528,2.26,0.78
+ openai/gpt-4o,6.05,
+ anthropic/claude-sonnet-4,2.21,
+ google/gemini-2.5-pro,,
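
app.py now calls `get_leaderboard_df("leaderboard/data/leaderboard.csv")`, but the updated `src/populate.py` is not part of this diff. The sketch below shows what that loader would have to produce for the column names used in `init_leaderboard`; the renaming, the derived `Average Hallucination Rate (%)` column, and the `Rank` ordering are assumptions, not the confirmed implementation.

```python
import pandas as pd

def get_leaderboard_df(csv_path: str) -> pd.DataFrame:
    """Hypothetical loader: map the raw CSV columns onto the display columns the UI expects."""
    df = pd.read_csv(csv_path)
    df = df.rename(columns={
        "ha_rag_rate": "RAG Hallucination Rate (%)",
        "ha_non_rag_rate": "Non-RAG Hallucination Rate (%)",
    })
    # Rows with missing rates (e.g. openai/gpt-4o, google/gemini-2.5-pro) keep NaN;
    # the average is only defined when both rates are present.
    df["Average Hallucination Rate (%)"] = df[
        ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]
    ].mean(axis=1, skipna=False).round(2)
    df = df.sort_values("Average Hallucination Rate (%)", na_position="last")
    df.insert(0, "Rank", range(1, len(df) + 1))
    # column order matching the default selection in app.py
    return df[[
        "Rank", "Models",
        "Average Hallucination Rate (%)",
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)",
    ]].reset_index(drop=True)
```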
static/kluster-color.png ADDED