Commit 8c3427d · Parent(s): 25328d1

new funix based app
Files changed:
- Dockerfile +24 -0
- README.md +8 -32
- app.py +0 -332
- app/app.py +67 -0
- app/app_utils.py +193 -0
- app/requirements.txt +8 -0
- app/results.json +860 -0
- app/vectara_theme.py +29 -0
Dockerfile
ADDED
@@ -0,0 +1,24 @@
```dockerfile
FROM python:3.10

WORKDIR /app

COPY ./app/vectara_theme.py /app/vectara_theme.py
COPY ./app/requirements.txt /app/requirements.txt
COPY ./app/app.py /app/app.py
COPY ./app/app_utils.py /app/app_utils.py
COPY ./app/results.json /app/results.json

# RUN mkdir -p /app/results

RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt

# RUN useradd -m -u 1000 user
# USER user
# ENV HOME=/home/user \
#     PATH=/home/user/.local/bin:$PATH

# WORKDIR $HOME/app

# COPY --chown=user . $HOME/app

CMD ["funix", "app.py", "--host", "0.0.0.0", "--port", "7860", "--no-browser"]
```
README.md
CHANGED
@@ -1,38 +1,14 @@
```diff
 ---
-title:
-
-
-
-
-sdk_version: 4.44.0
-app_file: app.py
-pinned: true
-license: apache-2.0
-tags:
-- leaderboard
+title: LLM Hallucination Leaderboard
+sdk: docker
+app_port: 7860
+python_version: 3.10
+pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
-
+LLM Hallucination Leaderboard
+---
 
-
-```
-{
-     "config": {
-          "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-          "model_name": "path of the model on the hub: org/model",
-          "model_sha": "revision on the hub",
-     },
-     "results": {
-          "task_name": {
-               "metric_name": score,
-          },
-          "task_name2": {
-               "metric_name": score,
-          }
-     }
-}
-```
+by Vectara, Inc.
 
-Request files are created automatically by this tool.
```
app.py
DELETED
@@ -1,332 +0,0 @@
```python
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

import src.display.about as about
from src.display.css_html_js import custom_css
import src.display.utils as utils
import src.envs as envs
import src.populate as populate
import src.submission.submit as submit


def restart_space():
    envs.API.restart_space(repo_id=envs.REPO_ID, token=envs.TOKEN)

try:
    print(envs.EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
    )
except Exception:
    restart_space()
try:
    print(envs.EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=envs.RESULTS_REPO, local_dir=envs.EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
    )
except Exception:
    restart_space()

raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
leaderboard_df = original_df.copy()

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)


# Searching and filtering
def update_table(
    hidden_df: pd.DataFrame,
    columns: list,
    type_query: list,
    precision_query: str,
    size_query: list,
    show_deleted: bool,
    query: str,
):
    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
    filtered_df = filter_queries(query, filtered_df)
    df = select_columns(filtered_df, columns)
    return df


def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    return df[(df[utils.AutoEvalColumn.dummy.name].str.contains(query, case=False))]


def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    always_here_cols = [
        utils.AutoEvalColumn.model_type_symbol.name,
        utils.AutoEvalColumn.model.name,
    ]
    # We use COLS to maintain sorting
    filtered_df = df[
        always_here_cols + [c for c in utils.COLS if c in df.columns and c in columns] + [utils.AutoEvalColumn.dummy.name]
    ]
    return filtered_df


def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
    final_df = []
    if query != "":
        queries = [q.strip() for q in query.split(";")]
        for _q in queries:
            _q = _q.strip()
            if _q != "":
                temp_filtered_df = search_table(filtered_df, _q)
                if len(temp_filtered_df) > 0:
                    final_df.append(temp_filtered_df)
        if len(final_df) > 0:
            filtered_df = pd.concat(final_df)
            filtered_df = filtered_df.drop_duplicates(
                subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name]
            )

    return filtered_df


def filter_models(
    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
) -> pd.DataFrame:
    # Show all models
    # if show_deleted:
    #     filtered_df = df
    # else: # Show only still on the hub models
    #     filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]]

    filtered_df = df

    type_emoji = [t[0] for t in type_query]
    filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
    filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]

    numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
    params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
    filtered_df = filtered_df.loc[mask]

    return filtered_df


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML("""<img referrerpolicy="no-referrer-when-downgrade"
            src="https://static.scarf.sh/a.png?x-pxid=5f53f560-5ba6-4e73-917b-c7049e9aea2c"
            style="width:1px;height:1px;"/>
            """)
    gr.HTML(about.TITLE)
    gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        search_bar = gr.Textbox(
                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
                            show_label=False,
                            elem_id="search-bar",
                        )
                    with gr.Row():
                        shown_columns = gr.CheckboxGroup(
                            choices=[
                                c.name
                                for c in utils.fields(utils.AutoEvalColumn)
                                if not c.hidden and not c.never_hidden and not c.dummy
                            ],
                            value=[
                                c.name
                                for c in utils.fields(utils.AutoEvalColumn)
                                if c.displayed_by_default and not c.hidden and not c.never_hidden
                            ],
                            label="Select columns to show",
                            elem_id="column-select",
                            interactive=True,
                        )
                    # with gr.Row():
                    #     deleted_models_visibility = gr.Checkbox(
                    #         value=False, label="Show gated/private/deleted models", interactive=True
                    #     )
                with gr.Column(min_width=320):
                    #with gr.Box(elem_id="box-filter"):
                    filter_columns_type = gr.CheckboxGroup(
                        label="Model types",
                        choices=[t.to_str() for t in utils.ModelType],
                        value=[t.to_str() for t in utils.ModelType],
                        interactive=True,
                        elem_id="filter-columns-type",
                    )
                    # filter_columns_precision = gr.CheckboxGroup(
                    #     label="Precision",
                    #     choices=[i.value.name for i in utils.Precision],
                    #     value=[i.value.name for i in utils.Precision],
                    #     interactive=True,
                    #     elem_id="filter-columns-precision",
                    # )
                    # filter_columns_size = gr.CheckboxGroup(
                    #     label="Model sizes (in billions of parameters)",
                    #     choices=list(utils.NUMERIC_INTERVALS.keys()),
                    #     value=list(utils.NUMERIC_INTERVALS.keys()),
                    #     interactive=True,
                    #     elem_id="filter-columns-size",
                    # )

            leaderboard_table = gr.components.Dataframe(
                value=leaderboard_df[
                    [c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden]
                    + shown_columns.value
                    + [utils.AutoEvalColumn.dummy.name]
                ],
                headers=[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden] + shown_columns.value,
                datatype=utils.TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
                column_widths=["2%", "33%"]
            )

            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
                value=original_df[utils.COLS],
                headers=utils.COLS,
                datatype=utils.TYPES,
                visible=False,
            )
            search_bar.submit(
                update_table,
                [
                    hidden_leaderboard_table_for_search,
                    shown_columns,
                    filter_columns_type,
                    # filter_columns_precision,
                    # filter_columns_size,
                    # deleted_models_visibility,
                    search_bar,
                ],
                leaderboard_table,
            )
            for selector in [shown_columns, filter_columns_type]: #, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
                selector.change(
                    update_table,
                    [
                        hidden_leaderboard_table_for_search,
                        shown_columns,
                        filter_columns_type,
                        # filter_columns_precision,
                        # filter_columns_size,
                        # deleted_models_visibility,
                        search_bar,
                    ],
                    leaderboard_table,
                    queue=True,
                )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(about.EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=utils.EVAL_COLS,
                                datatype=utils.EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=utils.EVAL_COLS,
                                datatype=utils.EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=utils.EVAL_COLS,
                                datatype=utils.EVAL_TYPES,
                                row_count=5,
                            )
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in utils.ModelType if t != utils.ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in utils.Precision if i != utils.Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in utils.WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                submit.add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=about.CITATION_BUTTON_TEXT,
                label=about.CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
```
app/app.py
ADDED
@@ -0,0 +1,67 @@
```python
from typing import Callable, Literal, List, Tuple
import json

import pandas as pd
import matplotlib.figure
from IPython.display import Markdown

import dotenv
dotenv.load_dotenv() # load HF_TOKEN

from funix import funix, import_theme
from vectara_theme import vectara_theme
import_theme(vectara_theme)

from app_utils import load_results, visualize_leaderboard

results_df = load_results()

@funix(
    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
    direction="column",
    autorun="always",
    theme="vectara",
    figure_to_image= True,
    # output_layout=[
    #     [{"return_index": 0, "width": 0.3}],
    #     [{"return_index": 1, "width": 0.7}],
    # ]
)
def leaderboard(
    filter_models_by_name: str = ""
# ) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
) -> Tuple[Markdown, pd.DataFrame]:
    """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard

    Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model). For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).

    ## Usage

    * All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
    * Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
    * To sort the table, hover over a column header and click the arrow. The arrow automatically points up and down depending on the sort order.
    * Click the "Refresh" button to refresh the leaderboard if the table is not shown or does not update when you change the filter.

    Args:
        filter_models_by_name: filter models by name using comma-separated strings
    """
    df = results_df

    filter_models_by_name = filter_models_by_name.replace(",", ";")
    filter_models_by_name = filter_models_by_name.replace(" ", "")
    if len(filter_models_by_name) > 0:
        filter_models_by_name = filter_models_by_name.split(";")
        filter_models_by_name = [name for name in filter_models_by_name if name != ""]
        df = df.copy()
        df = df[df["LLM"].str.contains("|".join(filter_models_by_name), na=False)]

    if len(df) == 0: # return an empty DF and an empty figure
        # return pd.DataFrame(), matplotlib.figure.Figure(), Markdown("No models found")
        return Markdown("No models found"), pd.DataFrame()

    return Markdown(""), df

    fig = visualize_leaderboard(df)

    # return df, fig
    # return Markdown(""), fig, df
```
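The name filter in `leaderboard()` above boils down to a single pandas `str.contains` call over a `|`-joined pattern. For reference, here is a minimal standalone sketch of that logic; the toy DataFrame and the `filter_by_name` helper are hypothetical, used only to illustrate the comma/semicolon handling, and are not part of the Space.

```python
import pandas as pd

# Hypothetical toy data, standing in for the real results_df loaded by load_results().
toy_df = pd.DataFrame({
    "LLM": ["openai/GPT-4o", "google/gemini-2.0-flash-001", "mistralai/Mistral-Large2"],
    "Hallucination %": [1.5, 0.7, 4.1],
})

def filter_by_name(df: pd.DataFrame, filter_models_by_name: str) -> pd.DataFrame:
    # Same steps as leaderboard(): commas become semicolons, spaces are dropped,
    # and the remaining name fragments are OR-ed together as a regex alternation.
    query = filter_models_by_name.replace(",", ";").replace(" ", "")
    if not query:
        return df
    names = [name for name in query.split(";") if name]
    return df[df["LLM"].str.contains("|".join(names), na=False)]

print(filter_by_name(toy_df, "gemini, Mistral"))  # keeps the Google and Mistral rows
```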
app/app_utils.py
ADDED
@@ -0,0 +1,193 @@
```python
# %%
import os
import json
from huggingface_hub import Repository
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure
from sklearn.preprocessing import MinMaxScaler

import dotenv
dotenv.load_dotenv()

min_max_scaler = MinMaxScaler()

# %%
def pull_results(results_dir: str):
    repo = Repository(local_dir = results_dir, clone_from="vectara/results", repo_type="dataset", token=os.getenv("HF_TOKEN"))
    repo.git_pull()

def extract_info_from_result_file(result_file):
    """
    {
        "config": {
            "model_dtype": "float16",
            "model_name": "databricks/dbrx-instruct",
            "model_sha": "main"
        },
        "results": {
            "hallucination_rate": {
                "hallucination_rate": 8.34990059642147
            },
            "factual_consistency_rate": {
                "factual_consistency_rate": 91.65009940357854
            },
            "answer_rate": {
                "answer_rate": 100.0
            },
            "average_summary_length": {
                "average_summary_length": 85.9
            }
        }
    """

    info = json.load(open(result_file, 'r'))
    result = {
        "LLM": info["config"]["model_name"],
        "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
        # "Factual Consistency Rate": info["results"]["factual_consistency_rate"]["factual_consistency_rate"],
        "Answer %": info["results"]["answer_rate"]["answer_rate"],
        "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
    }
    return result

def get_latest_result_file(dir: str):
    """
    Get the latest result file in the given directory based on the timestamp in the file name.
    """
    if not os.path.isdir(dir):
        return None
    files = os.listdir(dir)
    files = [f for f in files if f.endswith(".json")]
    if len(files) == 0:
        return None
    files.sort(key=lambda x: os.path.getmtime(os.path.join(dir, x)))
    # print ("Scanning: ", dir, "found latest file: ", files[0])
    return os.path.join(dir, files[0])

def scan_and_extract(dir: str):
    """Scan all folders recursively and exhaustively to load all JSON files and call `extract_info_from_result_file` on each one.
    """

    results = []
    for root, dirs, files in os.walk(dir):
        if len(dirs) == 0:
            continue
        for dir in dirs:
            result_file = get_latest_result_file(os.path.join(root, dir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results

def load_results(
    results_dir: str = "./results",
    results_json: str = "./results.json"
):

    try:
        pull_results(results_dir)
        print ("Successfully pulled results from {results_dir}")
    except Exception as e:
        print(f"Failed to pull and/or extract latest results: {e}")

    try:
        results = scan_and_extract(results_dir)
        if len(results) > 0:
            with open(results_json, "w") as f:
                json.dump(results, f, indent=2)
            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
        else:
            print(f"No results found in {results_dir}")
    except Exception as e:
        print(f"Failed to scan and extract results from {results_dir}: {e}")
        print(f"Using pre-dumped results from {results_json}")

    results = json.load(open(results_json, "r"))

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)
    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    return results_df

# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> int:
    # based on both hallucination percent and LLM name, determine font size
    # if hallucination percentage is low and LLM name is long, use smaller font size
    name_length = len(LLM)
    if hallucination_percent < 0.25:
        if name_length > 10:
            return 8.5
        else:
            return 9
    else:
        return 9

def determine_font_color(hallucination_percent: float) -> str:
    if hallucination_percent < 0.3:
        return 'white'
    elif hallucination_percent < 0.65:
        return 'black'
    else:
        return 'white'

def determine_llm_x_position(LLM: str, hallucination_percent: float) -> float:
    # determine the x position of the LLM name
    # For an LLM, it's bar length is 10* its hallucination %
    # if the LLM name cannot fit in the bar, move it to the left
    # if the LLM name can fit in the bar, let its x position be 0.01

    name_length = len(LLM)
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    hallu_rate_to_bar_length_ratio = 10
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
    if name_length > bar_length:
        return 0.01
    else:
        return hallucination_percent

def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    fig = plt.figure(figsize=(5, 4))
    # plot using LLM as x-axis and Hallucination % as y-axis
    # make bars horizontal
    plot_df = df.head(10)
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

    plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))

    # for i, row in plot_df.iterrows():
    #     plt.text(
    #         determine_llm_x_position(row["LLM"], row["Hallucination %"]),
    #         row["LLM"],
    #         f"{row['LLM']}",
    #         ha='left',
    #         va='center',
    #         fontsize=9,
    #         color=determine_font_color(row["normalized_hallucination_rate"])
    #     )
    # plt.yticks([])
    plt.tight_layout()

    plt.xticks(fontsize=9)
    # plt.xlabel("Hallucination %", fontsize=9)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=9)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_visible(False)
    plt.gca().invert_yaxis() # Invert the y-axis to display bars top-down

    return fig

# %%

if __name__ == "__main__":
    results = scan_and_extract("./results")
    with open("./results.json", "w") as f:
        json.dump(results, f, indent=2)

# %%
```
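For reference, `scan_and_extract` above walks the pulled `vectara/results` dataset and keeps only the most recently modified JSON file in each leaf folder. Below is a minimal sketch of the layout it expects, using a made-up model name and file name, and assuming it is run from the `app/` directory with the dependencies installed.

```python
import json
import os
import tempfile

from app_utils import scan_and_extract  # assumes app/ is the working directory

# One hypothetical result file in an <org>/<model>/ folder, shaped like the
# example in the extract_info_from_result_file docstring.
sample = {
    "config": {"model_dtype": "float16", "model_name": "example-org/example-model", "model_sha": "main"},
    "results": {
        "hallucination_rate": {"hallucination_rate": 5.0},
        "factual_consistency_rate": {"factual_consistency_rate": 95.0},
        "answer_rate": {"answer_rate": 100.0},
        "average_summary_length": {"average_summary_length": 80.0},
    },
}

with tempfile.TemporaryDirectory() as results_dir:
    model_dir = os.path.join(results_dir, "example-org", "example-model")
    os.makedirs(model_dir)
    with open(os.path.join(model_dir, "results_2024-01-01.json"), "w") as f:
        json.dump(sample, f)
    # Each leaf folder contributes one row: the newest *.json file it contains.
    print(scan_and_extract(results_dir))
```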
app/requirements.txt
ADDED
@@ -0,0 +1,8 @@
```text
funix==0.6.1
pandas
dotenv
huggingface_hub
matplotlib
scikit-learn
ipython
git-lfs
```
app/results.json
ADDED
@@ -0,0 +1,860 @@
```json
[
  {"LLM": "gemini-2.0-flash-exp", "Hallucination %": 1.3, "Answer %": 99.9, "Avg Summary Words": 60.0},
  {"LLM": "deepseek/deepseek-r1", "Hallucination %": 14.3, "Answer %": 100.0, "Avg Summary Words": 77.1},
  {"LLM": "deepseek/deepseek-v3", "Hallucination %": 3.9, "Answer %": 100.0, "Avg Summary Words": 88.2},
  {"LLM": "deepseek/deepseek-chat", "Hallucination %": 2.4, "Answer %": 100.0, "Avg Summary Words": 83.2},
  {"LLM": "deepseek/deepseek-v3-0324", "Hallucination %": 8.0, "Answer %": 100.0, "Avg Summary Words": 78.9},
  {"LLM": "openai/chatgpt-4o-latest", "Hallucination %": 3.5, "Answer %": 100.0, "Avg Summary Words": 63.5},
  {"LLM": "openai/GPT-4", "Hallucination %": 1.8050541516245486, "Answer %": 100.0, "Avg Summary Words": 81.1},
  {"LLM": "openai/o3-mini-high-reasoning", "Hallucination %": 0.7952286282306176, "Answer %": 100.0, "Avg Summary Words": 79.51888667992047},
  {"LLM": "openai/gpt-4.1-mini", "Hallucination %": 2.2, "Answer %": 100.0, "Avg Summary Words": 79.6},
  {"LLM": "openai/o1-pro", "Hallucination %": 2.4, "Answer %": 100.0, "Avg Summary Words": 81.0},
  {"LLM": "openai/gpt-4.1-nano", "Hallucination %": 2.0, "Answer %": 100.0, "Avg Summary Words": 70.2},
  {"LLM": "openai/o1-mini", "Hallucination %": 1.4, "Answer %": 100.0, "Avg Summary Words": 78.3},
  {"LLM": "openai/GPT-4-Turbo", "Hallucination %": 1.6898608349900597, "Answer %": 100.0, "Avg Summary Words": 86.2},
  {"LLM": "openai/o3", "Hallucination %": 6.8, "Answer %": 100.0, "Avg Summary Words": 77.7},
  {"LLM": "openai/GPT-3.5-Turbo", "Hallucination %": 1.9, "Answer %": 99.6, "Avg Summary Words": 84.1},
  {"LLM": "openai/o1", "Hallucination %": 2.4, "Answer %": 99.9, "Avg Summary Words": 73.0},
  {"LLM": "openai/GPT-4o", "Hallucination %": 1.4910536779324055, "Answer %": 100.0, "Avg Summary Words": 77.8},
  {"LLM": "openai/GPT-4o-mini", "Hallucination %": 1.7, "Answer %": 100.0, "Avg Summary Words": 76.3},
  {"LLM": "openai/o1-preview", "Hallucination %": 3.3, "Answer %": 100.0, "Avg Summary Words": 119.3},
  {"LLM": "openai/o4-mini", "Hallucination %": 4.6, "Answer %": 100.0, "Avg Summary Words": 82.0},
  {"LLM": "openai/gpt-4.5-preview", "Hallucination %": 1.2, "Answer %": 100.0, "Avg Summary Words": 77.0},
  {"LLM": "openai/gpt-4.1", "Hallucination %": 2.0, "Answer %": 100.0, "Avg Summary Words": 71.9},
  {"LLM": "Qwen/Qwen2-VL-2B-Instruct", "Hallucination %": 8.3, "Answer %": 100.0, "Avg Summary Words": 81.8},
  {"LLM": "Qwen/Qwen2.5-14B-Instruct", "Hallucination %": 4.2, "Answer %": 100.0, "Avg Summary Words": 74.8},
  {"LLM": "Qwen/Qwen3-32B", "Hallucination %": 2.8, "Answer %": 100.0, "Avg Summary Words": 82.4},
  {"LLM": "Qwen/Qwen2.5-32B-Instruct", "Hallucination %": 3.0, "Answer %": 100.0, "Avg Summary Words": 67.9},
  {"LLM": "Qwen/QwQ-32B-Preview", "Hallucination %": 12.9, "Answer %": 100.0, "Avg Summary Words": 140.2},
  {"LLM": "Qwen/Qwen3-0.6B", "Hallucination %": 3.7, "Answer %": 100.0, "Avg Summary Words": 65.3},
  {"LLM": "Qwen/Qwen3-14B", "Hallucination %": 2.2, "Answer %": 100.0, "Avg Summary Words": 82.4},
  {"LLM": "Qwen/Qwen2.5-3B-Instruct", "Hallucination %": 7.0, "Answer %": 100.0, "Avg Summary Words": 70.4},
  {"LLM": "Qwen/Qwen2.5-1.5B-Instruct", "Hallucination %": 15.8, "Answer %": 100.0, "Avg Summary Words": 70.7},
  {"LLM": "Qwen/Qwen2-VL-7B-Instruct", "Hallucination %": 4.2, "Answer %": 100.0, "Avg Summary Words": 73.9},
  {"LLM": "Qwen/Qwen2.5-0.5B-Instruct", "Hallucination %": 25.2, "Answer %": 100.0, "Avg Summary Words": 72.6},
  {"LLM": "Qwen/Qwen3-4B", "Hallucination %": 2.7, "Answer %": 100.0, "Avg Summary Words": 87.7},
  {"LLM": "Qwen/Qwen2.5-72B-Instruct", "Hallucination %": 4.3, "Answer %": 100.0, "Avg Summary Words": 80.8},
  {"LLM": "Qwen/Qwen3-8B", "Hallucination %": 3.0, "Answer %": 100.0, "Avg Summary Words": 78.2},
  {"LLM": "Qwen/Qwen3-1.7B", "Hallucination %": 4.4, "Answer %": 100.0, "Avg Summary Words": 69.0},
  {"LLM": "Qwen/Qwen2-72B-Instruct", "Hallucination %": 4.7, "Answer %": 100.0, "Avg Summary Words": 100.1},
  {"LLM": "Qwen/Qwen2.5-7B-Instruct", "Hallucination %": 2.8, "Answer %": 100.0, "Avg Summary Words": 71.0},
  {"LLM": "allenai/OLMo-2-1124-7B-Instruct", "Hallucination %": 11.1, "Answer %": 100.0, "Avg Summary Words": 112.6},
  {"LLM": "allenai/OLMo-2-1124-13B-Instruct", "Hallucination %": 10.8, "Answer %": 100.0, "Avg Summary Words": 82.0},
  {"LLM": "allenai/olmo-2-0325-32b-instruct", "Hallucination %": 4.9, "Answer %": 99.9, "Avg Summary Words": 100.0},
  {"LLM": "amazon/Titan-Express", "Hallucination %": 13.5, "Answer %": 99.5, "Avg Summary Words": 98.4},
  {"LLM": "amazon/nova-lite-v1", "Hallucination %": 1.8, "Answer %": 99.9, "Avg Summary Words": 80.7},
  {"LLM": "amazon/nova-pro-v1", "Hallucination %": 1.8, "Answer %": 100.0, "Avg Summary Words": 85.5},
  {"LLM": "amazon/nova-micro-v1", "Hallucination %": 1.6, "Answer %": 100.0, "Avg Summary Words": 90.0},
  {"LLM": "google/gemini-2.5-pro-exp-03-25", "Hallucination %": 1.1, "Answer %": 95.1, "Avg Summary Words": 72.9},
  {"LLM": "google/PaLM-2", "Hallucination %": 14.1, "Answer %": 99.8, "Avg Summary Words": 86.6},
  {"LLM": "google/gemma-1.1-2b-it", "Hallucination %": 27.8, "Answer %": 100.0, "Avg Summary Words": 66.8},
  {"LLM": "google/gemini-2.0-flash-thinking-exp", "Hallucination %": 1.8, "Answer %": 99.3, "Avg Summary Words": 73.2},
  {"LLM": "google/gemma-3-1b-it", "Hallucination %": 5.3, "Answer %": 99.9, "Avg Summary Words": 57.9},
  {"LLM": "google/gemma-2-2b-it", "Hallucination %": 7.0, "Answer %": 100.0, "Avg Summary Words": 62.2},
  {"LLM": "google/flan-t5-large", "Hallucination %": 18.3, "Answer %": 99.3, "Avg Summary Words": 20.9},
  {"LLM": "google/gemini-2.5-flash-preview-04-17", "Hallucination %": 1.3, "Answer %": 91.2, "Avg Summary Words": 71.1},
  {"LLM": "google/Gemini-Pro", "Hallucination %": 7.6767676767676765, "Answer %": 98.4, "Avg Summary Words": 89.5},
  {"LLM": "google/gemini-1.5-pro-001", "Hallucination %": 9.1, "Answer %": 99.8, "Avg Summary Words": 61.6},
  {"LLM": "google/gemma-2-9b-it", "Hallucination %": 10.139165009940358, "Answer %": 100.0, "Avg Summary Words": 70.2},
  {"LLM": "google/gemma-1.1-7b-it", "Hallucination %": 17.0, "Answer %": 100.0, "Avg Summary Words": 64.3},
  {"LLM": "google/gemma-3-4b-it", "Hallucination %": 3.7, "Answer %": 100.0, "Avg Summary Words": 63.7},
  {"LLM": "google/gemini-2.0-pro-exp-02-05", "Hallucination %": 0.8, "Answer %": 99.7, "Avg Summary Words": 61.5},
  {"LLM": "google/gemini-1.5-pro-002", "Hallucination %": 6.6, "Answer %": 99.9, "Avg Summary Words": 62.0},
  {"LLM": "google/gemma-3-12b-it", "Hallucination %": 2.8, "Answer %": 100.0, "Avg Summary Words": 69.6},
  {"LLM": "google/gemini-2.0-flash-001", "Hallucination %": 0.7, "Answer %": 100.0, "Avg Summary Words": 65.2},
  {"LLM": "google/gemini-1.5-flash-002", "Hallucination %": 3.4, "Answer %": 99.9, "Avg Summary Words": 59.4},
  {"LLM": "google/gemma-7b-it", "Hallucination %": 14.81113320079523, "Answer %": 100.0, "Avg Summary Words": 113.0},
  {"LLM": "google/gemini-2.0-flash-lite-preview-02-05", "Hallucination %": 1.2, "Answer %": 99.5, "Avg Summary Words": 60.9},
  {"LLM": "google/gemini-1.5-flash-001", "Hallucination %": 6.6, "Answer %": 99.9, "Avg Summary Words": 63.3},
  {"LLM": "google/gemma-3-27b-it", "Hallucination %": 5.9, "Answer %": 98.5, "Avg Summary Words": 64.3},
  {"LLM": "snowflake/snowflake-arctic-instruct", "Hallucination %": 3.0, "Answer %": 100.0, "Avg Summary Words": 68.7},
  {"LLM": "01-ai/Yi-1.5-9B-Chat", "Hallucination %": 4.9, "Answer %": 100.0, "Avg Summary Words": 85.7},
  {"LLM": "01-ai/Yi-1.5-6B-Chat", "Hallucination %": 7.9, "Answer %": 100.0, "Avg Summary Words": 98.9},
  {"LLM": "01-ai/Yi-1.5-34B-Chat", "Hallucination %": 3.7, "Answer %": 100.0, "Avg Summary Words": 83.7},
  {"LLM": "ai21labs/AI21-Jamba-1.5-Mini", "Hallucination %": 2.9, "Answer %": 95.6, "Avg Summary Words": 74.5},
  {"LLM": "cohere/c4ai-aya-expanse-32b", "Hallucination %": 8.5, "Answer %": 99.9, "Avg Summary Words": 81.9},
  {"LLM": "cohere/command-r-plus-08-2024", "Hallucination %": 5.4, "Answer %": 100.0, "Avg Summary Words": 68.4},
  {"LLM": "cohere/c4ai-aya-expanse-8b", "Hallucination %": 12.2, "Answer %": 99.9, "Avg Summary Words": 83.9},
  {"LLM": "cohere/command-a-03-2025", "Hallucination %": 4.5, "Answer %": 100.0, "Avg Summary Words": 77.3},
  {"LLM": "cohere/command-r-08-2024", "Hallucination %": 4.9, "Answer %": 100.0, "Avg Summary Words": 68.7},
  {"LLM": "Intel/neural-chat-7b-v3-3", "Hallucination %": 2.6, "Answer %": 100.0, "Avg Summary Words": 60.7},
  {"LLM": "mistralai/pixtral-large-latest", "Hallucination %": 6.6, "Answer %": 100.0, "Avg Summary Words": 76.4},
  {"LLM": "mistralai/Mixtral-8x22B-Instruct-v0.1", "Hallucination %": 4.7, "Answer %": 99.9, "Avg Summary Words": 92.0},
  {"LLM": "mistralai/mistral-small-latest", "Hallucination %": 8.6, "Answer %": 100.0, "Avg Summary Words": 74.2},
  {"LLM": "mistralai/mistral-large-latest", "Hallucination %": 5.864811133200803, "Answer %": 100.0, "Avg Summary Words": 79.55367793240556},
  {"LLM": "mistralai/Mixtral-8x7B-Instruct-v0.1", "Hallucination %": 20.09950248756219, "Answer %": 99.9, "Avg Summary Words": 90.7},
  {"LLM": "mistralai/Mistral-Nemo-Instruct-2407", "Hallucination %": 11.2, "Answer %": 100.0, "Avg Summary Words": 69.9},
  {"LLM": "mistralai/Mistral-Large2", "Hallucination %": 4.1, "Answer %": 100.0, "Avg Summary Words": 77.4},
  {"LLM": "mistralai/Mistral-7B-Instruct-v0.3", "Hallucination %": 9.5, "Answer %": 100.0, "Avg Summary Words": 98.4},
  {"LLM": "mistralai/ministral-3b-latest", "Hallucination %": 8.3, "Answer %": 100.0, "Avg Summary Words": 73.2},
  {"LLM": "mistralai/ministral-8b-latest", "Hallucination %": 7.5, "Answer %": 100.0, "Avg Summary Words": 62.7},
  {"LLM": "mistralai/Mistral-Small-24B-Instruct-2501", "Hallucination %": 3.1, "Answer %": 100.0, "Avg Summary Words": 74.9},
  {"LLM": "mistralai/mistral-small-3.1-24b-instruct", "Hallucination %": 5.6, "Answer %": 100.0, "Avg Summary Words": 73.1},
  {"LLM": "anthropic/Claude-3-5-Sonnet", "Hallucination %": 8.6, "Answer %": 100.0, "Avg Summary Words": 103.0},
  {"LLM": "anthropic/claude-3-7-sonnet-latest", "Hallucination %": 4.4, "Answer %": 100.0, "Avg Summary Words": 97.8},
  {"LLM": "anthropic/Claude-3-opus", "Hallucination %": 10.092687950566425, "Answer %": 95.5, "Avg Summary Words": 92.1},
  {"LLM": "anthropic/Claude-2", "Hallucination %": 17.448856799037305, "Answer %": 99.3, "Avg Summary Words": 87.5},
  {"LLM": "anthropic/claude-3-5-haiku-20241022", "Hallucination %": 4.9, "Answer %": 100.0, "Avg Summary Words": 92.2},
  {"LLM": "anthropic/Claude-3-sonnet", "Hallucination %": 16.302186878727635, "Answer %": 100.0, "Avg Summary Words": 108.5},
  {"LLM": "anthropic/claude-3-7-sonnet-latest-think", "Hallucination %": 4.5, "Answer %": 99.8, "Avg Summary Words": 99.9},
  {"LLM": "ai21/jamba-1.6-mini", "Hallucination %": 4.6, "Answer %": 100.0, "Avg Summary Words": 82.3},
  {"LLM": "ai21/jamba-1.6-large", "Hallucination %": 2.3, "Answer %": 99.9, "Avg Summary Words": 85.6},
  {"LLM": "qwen/qwen3-235b-a22b", "Hallucination %": 13.0, "Answer %": 99.2, "Avg Summary Words": 86.6},
  {"LLM": "qwen/qwen-max", "Hallucination %": 2.9, "Answer %": 88.4, "Avg Summary Words": 90.4},
  {"LLM": "qwen/qwen3-30b-a3b", "Hallucination %": 7.6, "Answer %": 99.9, "Avg Summary Words": 69.9},
  {"LLM": "x-ai/grok-2-1212", "Hallucination %": 1.9, "Answer %": 100.0, "Avg Summary Words": 86.5},
  {"LLM": "x-ai/grok-2-vision-1212", "Hallucination %": 2.9, "Answer %": 100.0, "Avg Summary Words": 79.8},
  {"LLM": "databricks/dbrx-instruct", "Hallucination %": 8.3, "Answer %": 100.0, "Avg Summary Words": 85.9},
  {"LLM": "xai/grok-3-mini-latest", "Hallucination %": 3.3, "Answer %": 100.0, "Avg Summary Words": 90.2},
  {"LLM": "xai/grok-beta", "Hallucination %": 4.6, "Answer %": 100.0, "Avg Summary Words": 91.0},
  {"LLM": "xai/grok-3-latest", "Hallucination %": 2.1, "Answer %": 100.0, "Avg Summary Words": 97.7},
  {"LLM": "apple/OpenELM-3B-Instruct", "Hallucination %": 24.776119402985074, "Answer %": 99.3, "Avg Summary Words": 47.2},
  {"LLM": "meta-llama/Llama-3.2-3B-Instruct-Turbo", "Hallucination %": 7.9, "Answer %": 100.0, "Avg Summary Words": 72.2},
  {"LLM": "meta-llama/Llama-2-70b-chat-hf", "Hallucination %": 5.896510228640193, "Answer %": 99.9, "Avg Summary Words": 84.9},
  {"LLM": "meta-llama/Meta-Llama-3.1-405B-Instruct", "Hallucination %": 3.9, "Answer %": 99.6, "Avg Summary Words": 85.7},
  {"LLM": "meta-llama/Llama-3.3-70B-Instruct", "Hallucination %": 4.0, "Answer %": 100.0, "Avg Summary Words": 85.3},
  {"LLM": "meta-llama/Meta-Llama-3.1-8B-Instruct", "Hallucination %": 5.4, "Answer %": 100.0, "Avg Summary Words": 71.0},
  {"LLM": "meta-llama/Meta-Llama-3.1-70B-Instruct", "Hallucination %": 5.0, "Answer %": 100.0, "Avg Summary Words": 79.6},
  {"LLM": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", "Hallucination %": 8.9, "Answer %": 100.0, "Avg Summary Words": 73.1},
  {"LLM": "meta-llama/Llama-3.2-1B-Instruct", "Hallucination %": 20.7, "Answer %": 100.0, "Avg Summary Words": 71.5},
  {"LLM": "meta-llama/Llama-3-70B-chat-hf", "Hallucination %": 4.1, "Answer %": 99.2, "Avg Summary Words": 68.5},
  {"LLM": "meta-llama/Llama-3-8B-chat-hf", "Hallucination %": 7.370517928286853, "Answer %": 99.8, "Avg Summary Words": 79.7},
  {"LLM": "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", "Hallucination %": 4.3, "Answer %": 100.0, "Avg Summary Words": 79.8},
  {"LLM": "meta-llama/llama-4-scout", "Hallucination %": 4.7, "Answer %": 100.0, "Avg Summary Words": 80.7},
  {"LLM": "meta-llama/Llama-2-7b-chat-hf", "Hallucination %": 11.3, "Answer %": 99.6, "Avg Summary Words": 119.9},
  {"LLM": "meta-llama/Llama-2-13b-chat-hf", "Hallucination %": 10.5, "Answer %": 99.8, "Avg Summary Words": 82.1},
  {"LLM": "meta-llama/llama-4-maverick", "Hallucination %": 4.6, "Answer %": 100.0, "Avg Summary Words": 84.8},
  {"LLM": "microsoft/Orca-2-13b", "Hallucination %": 2.5, "Answer %": 100.0, "Avg Summary Words": 66.2},
  {"LLM": "microsoft/Phi-3.5-MoE-instruct", "Hallucination %": 2.5, "Answer %": 96.3, "Avg Summary Words": 69.7},
  {"LLM": "microsoft/Phi-3-mini-4k-instruct", "Hallucination %": 3.9761431411530817, "Answer %": 100.0, "Avg Summary Words": 86.8},
  {"LLM": "microsoft/phi-4", "Hallucination %": 4.7, "Answer %": 100.0, "Avg Summary Words": 100.3},
  {"LLM": "microsoft/Phi-3.5-mini-instruct", "Hallucination %": 4.1, "Answer %": 100.0, "Avg Summary Words": 75.0},
  {"LLM": "microsoft/Phi-3-mini-128k-instruct", "Hallucination %": 3.1, "Answer %": 100.0, "Avg Summary Words": 60.1},
  {"LLM": "microsoft/Phi-4-mini-instruct", "Hallucination %": 3.4, "Answer %": 100.0, "Avg Summary Words": 69.7},
  {"LLM": "microsoft/WizardLM-2-8x22B", "Hallucination %": 11.741293532338307, "Answer %": 99.9, "Avg Summary Words": 140.8},
  {"LLM": "microsoft/phi-2", "Hallucination %": 6.666666666666667, "Answer %": 91.5, "Avg Summary Words": 80.8},
  {"LLM": "THUDM/glm-4-9b-chat", "Hallucination %": 1.3, "Answer %": 100.0, "Avg Summary Words": 58.1},
  {"LLM": "internlm/internlm3-8b-instruct", "Hallucination %": 4.0, "Answer %": 100.0, "Avg Summary Words": 97.5},
  {"LLM": "ibm-granite/granite-3.1-8b-instruct", "Hallucination %": 8.6, "Answer %": 100.0, "Avg Summary Words": 107.4},
  {"LLM": "ibm-granite/granite-3.2-2b-instruct", "Hallucination %": 16.5, "Answer %": 100.0, "Avg Summary Words": 117.3},
  {"LLM": "ibm-granite/granite-3.1-2b-instruct", "Hallucination %": 15.7, "Answer %": 100.0, "Avg Summary Words": 107.7},
  {"LLM": "ibm-granite/granite-3.0-2b-instruct", "Hallucination %": 8.8, "Answer %": 100.0, "Avg Summary Words": 81.6},
  {"LLM": "ibm-granite/granite-3.0-8b-instruct", "Hallucination %": 6.5, "Answer %": 100.0, "Avg Summary Words": 74.2},
  {"LLM": "ibm-granite/granite-3.2-8b-instruct", "Hallucination %": 8.7, "Answer %": 100.0, "Avg Summary Words": 120.1},
  {"LLM": "tiiuae/falcon-7b-instruct", "Hallucination %": 29.92047713717694, "Answer %": 90.0, "Avg Summary Words": 75.5}
]
```
app/vectara_theme.py
ADDED
@@ -0,0 +1,29 @@
```python
vectara_theme = {
    "name": "vectara",
    "funix": {
        "run_button": "Refresh",
        "grid_height": 960,
        "grid_checkbox": False
    },
    "overrides": {
        "MuiAppBar": {
            "styleOverrides": {
                "root": {
                    "backgroundColor": "#ffffff",
                    "color": "rgba(0, 0, 0, 0.87)",
                    "& .MuiToolbar-root:before": {
                        "content": '""',
                        "background": "url('https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png')",
                        "display": "block",
                        "background-size": "contain",
                        "background-repeat": "no-repeat",
                        "background-position": "left",
                        "width": "125px",
                        "height": "40px",
                        "margin-right": "10px",
                    },
                },
            }
        },
    },
}
```