forrestbao committed
Commit 8c3427d · 1 Parent(s): 25328d1

new funix based app

Files changed (8):
  1. Dockerfile +24 -0
  2. README.md +8 -32
  3. app.py +0 -332
  4. app/app.py +67 -0
  5. app/app_utils.py +193 -0
  6. app/requirements.txt +8 -0
  7. app/results.json +860 -0
  8. app/vectara_theme.py +29 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.10
+
+ WORKDIR /app
+
+ COPY ./app/vectara_theme.py /app/vectara_theme.py
+ COPY ./app/requirements.txt /app/requirements.txt
+ COPY ./app/app.py /app/app.py
+ COPY ./app/app_utils.py /app/app_utils.py
+ COPY ./app/results.json /app/results.json
+
+ # RUN mkdir -p /app/results
+
+ RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+
+ # RUN useradd -m -u 1000 user
+ # USER user
+ # ENV HOME=/home/user \
+ #     PATH=/home/user/.local/bin:$PATH
+
+ # WORKDIR $HOME/app
+
+ # COPY --chown=user . $HOME/app
+
+ CMD ["funix", "app.py", "--host", "0.0.0.0", "--port", "7860", "--no-browser"]
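The `CMD` launches the Funix server against `app.py` on port 7860, matching the `app_port` declared in the new README. For local testing outside Docker, a minimal sketch (assuming the dependencies in app/requirements.txt are installed and the working directory is `app/`; the helper file name is hypothetical):

```python
# local_run.py -- hypothetical helper, not part of this commit.
# Replicates the container's CMD so the Space can be smoke-tested locally.
import subprocess

subprocess.run(
    ["funix", "app.py", "--host", "0.0.0.0", "--port", "7860", "--no-browser"],
    check=True,  # surface a non-zero exit code as an exception
)
```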
README.md CHANGED
@@ -1,38 +1,14 @@
  ---
- title: HHEM Leaderboard
- emoji: 🥇
- colorFrom: green
- colorTo: indigo
- sdk: gradio
- sdk_version: 4.44.0
- app_file: app.py
- pinned: true
- license: apache-2.0
- tags:
-   - leaderboard
+ title: LLM Hallucination Leaderboard
+ sdk: docker
+ app_port: 7860
+ python_version: 3.10
+ pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-
- Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/display/about.
-
- Results files should have the following format:
- ```
- {
-   "config": {
-     "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-     "model_name": "path of the model on the hub: org/model",
-     "model_sha": "revision on the hub",
-   },
-   "results": {
-     "task_name": {
-       "metric_name": score,
-     },
-     "task_name2": {
-       "metric_name": score,
-     }
-   }
- }
- ```
-
- Request files are created automatically by this tool.
+
+ LLM Hallucination Leaderboard
+ ---
+
+ by Vectara, Inc.
+
app.py DELETED
@@ -1,332 +0,0 @@
- import gradio as gr
- import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download
-
- import src.display.about as about
- from src.display.css_html_js import custom_css
- import src.display.utils as utils
- import src.envs as envs
- import src.populate as populate
- import src.submission.submit as submit
-
-
- def restart_space():
-     envs.API.restart_space(repo_id=envs.REPO_ID, token=envs.TOKEN)
-
- try:
-     print(envs.EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-     )
- except Exception:
-     restart_space()
- try:
-     print(envs.EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=envs.RESULTS_REPO, local_dir=envs.EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-     )
- except Exception:
-     restart_space()
-
- raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
- leaderboard_df = original_df.copy()
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
-
-
- # Searching and filtering
- def update_table(
-     hidden_df: pd.DataFrame,
-     columns: list,
-     type_query: list,
-     precision_query: str,
-     size_query: list,
-     show_deleted: bool,
-     query: str,
- ):
-     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
-     filtered_df = filter_queries(query, filtered_df)
-     df = select_columns(filtered_df, columns)
-     return df
-
-
- def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-     return df[(df[utils.AutoEvalColumn.dummy.name].str.contains(query, case=False))]
-
-
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-     always_here_cols = [
-         utils.AutoEvalColumn.model_type_symbol.name,
-         utils.AutoEvalColumn.model.name,
-     ]
-     # We use COLS to maintain sorting
-     filtered_df = df[
-         always_here_cols + [c for c in utils.COLS if c in df.columns and c in columns] + [utils.AutoEvalColumn.dummy.name]
-     ]
-     return filtered_df
-
-
- def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
-     final_df = []
-     if query != "":
-         queries = [q.strip() for q in query.split(";")]
-         for _q in queries:
-             _q = _q.strip()
-             if _q != "":
-                 temp_filtered_df = search_table(filtered_df, _q)
-                 if len(temp_filtered_df) > 0:
-                     final_df.append(temp_filtered_df)
-         if len(final_df) > 0:
-             filtered_df = pd.concat(final_df)
-             filtered_df = filtered_df.drop_duplicates(
-                 subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name]
-             )
-
-     return filtered_df
-
-
- def filter_models(
-     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
- ) -> pd.DataFrame:
-     # Show all models
-     # if show_deleted:
-     #     filtered_df = df
-     # else:  # Show only models still on the hub
-     #     filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]]
-
-     filtered_df = df
-
-     type_emoji = [t[0] for t in type_query]
-     filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-     filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-     numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
-     params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
-     mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-     filtered_df = filtered_df.loc[mask]
-
-     return filtered_df
-
-
- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML("""<img referrerpolicy="no-referrer-when-downgrade"
-         src="https://static.scarf.sh/a.png?x-pxid=5f53f560-5ba6-4e73-917b-c7049e9aea2c"
-         style="width:1px;height:1px;"/>
-     """)
-     gr.HTML(about.TITLE)
-     gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             with gr.Row():
-                 with gr.Column():
-                     with gr.Row():
-                         search_bar = gr.Textbox(
-                             placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                             show_label=False,
-                             elem_id="search-bar",
-                         )
-                     with gr.Row():
-                         shown_columns = gr.CheckboxGroup(
-                             choices=[
-                                 c.name
-                                 for c in utils.fields(utils.AutoEvalColumn)
-                                 if not c.hidden and not c.never_hidden and not c.dummy
-                             ],
-                             value=[
-                                 c.name
-                                 for c in utils.fields(utils.AutoEvalColumn)
-                                 if c.displayed_by_default and not c.hidden and not c.never_hidden
-                             ],
-                             label="Select columns to show",
-                             elem_id="column-select",
-                             interactive=True,
-                         )
-                     # with gr.Row():
-                     #     deleted_models_visibility = gr.Checkbox(
-                     #         value=False, label="Show gated/private/deleted models", interactive=True
-                     #     )
-                 with gr.Column(min_width=320):
-                     # with gr.Box(elem_id="box-filter"):
-                     filter_columns_type = gr.CheckboxGroup(
-                         label="Model types",
-                         choices=[t.to_str() for t in utils.ModelType],
-                         value=[t.to_str() for t in utils.ModelType],
-                         interactive=True,
-                         elem_id="filter-columns-type",
-                     )
-                     # filter_columns_precision = gr.CheckboxGroup(
-                     #     label="Precision",
-                     #     choices=[i.value.name for i in utils.Precision],
-                     #     value=[i.value.name for i in utils.Precision],
-                     #     interactive=True,
-                     #     elem_id="filter-columns-precision",
-                     # )
-                     # filter_columns_size = gr.CheckboxGroup(
-                     #     label="Model sizes (in billions of parameters)",
-                     #     choices=list(utils.NUMERIC_INTERVALS.keys()),
-                     #     value=list(utils.NUMERIC_INTERVALS.keys()),
-                     #     interactive=True,
-                     #     elem_id="filter-columns-size",
-                     # )
-
-             leaderboard_table = gr.components.Dataframe(
-                 value=leaderboard_df[
-                     [c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden]
-                     + shown_columns.value
-                     + [utils.AutoEvalColumn.dummy.name]
-                 ],
-                 headers=[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                 datatype=utils.TYPES,
-                 elem_id="leaderboard-table",
-                 interactive=False,
-                 visible=True,
-                 column_widths=["2%", "33%"]
-             )
-
-             # Dummy leaderboard for handling the case when the user uses the backspace key
-             hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                 value=original_df[utils.COLS],
-                 headers=utils.COLS,
-                 datatype=utils.TYPES,
-                 visible=False,
-             )
-             search_bar.submit(
-                 update_table,
-                 [
-                     hidden_leaderboard_table_for_search,
-                     shown_columns,
-                     filter_columns_type,
-                     # filter_columns_precision,
-                     # filter_columns_size,
-                     # deleted_models_visibility,
-                     search_bar,
-                 ],
-                 leaderboard_table,
-             )
-             for selector in [shown_columns, filter_columns_type]:  # , filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-                 selector.change(
-                     update_table,
-                     [
-                         hidden_leaderboard_table_for_search,
-                         shown_columns,
-                         filter_columns_type,
-                         # filter_columns_precision,
-                         # filter_columns_size,
-                         # deleted_models_visibility,
-                         search_bar,
-                     ],
-                     leaderboard_table,
-                     queue=True,
-                 )
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(about.EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=utils.EVAL_COLS,
-                                 datatype=utils.EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=utils.EVAL_COLS,
-                                 datatype=utils.EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=utils.EVAL_COLS,
-                                 datatype=utils.EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in utils.ModelType if t != utils.ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in utils.Precision if i != utils.Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in utils.WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 submit.add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )
-
-     with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
-             citation_button = gr.Textbox(
-                 value=about.CITATION_BUTTON_TEXT,
-                 label=about.CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )
-
- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
app/app.py ADDED
@@ -0,0 +1,67 @@
+ from typing import Callable, Literal, List, Tuple
+ import json
+
+ import pandas as pd
+ import matplotlib.figure
+ from IPython.display import Markdown
+
+ import dotenv
+ dotenv.load_dotenv()  # load HF_TOKEN
+
+ from funix import funix, import_theme
+ from vectara_theme import vectara_theme
+ import_theme(vectara_theme)
+
+ from app_utils import load_results, visualize_leaderboard
+
+ results_df = load_results()
+
+ @funix(
+     title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
+     direction="column",
+     autorun="always",
+     theme="vectara",
+     figure_to_image=True,
+     # output_layout=[
+     #     [{"return_index": 0, "width": 0.3}],
+     #     [{"return_index": 1, "width": 0.7}],
+     # ]
+ )
+ def leaderboard(
+     filter_models_by_name: str = ""
+ # ) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
+ ) -> Tuple[Markdown, pd.DataFrame]:
+     """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
+
+     Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates, i.e., introduces information not stated in the source document, when summarizing that document. For an LLM, the hallucination rate is the ratio of hallucinated summaries to the total number of summaries it generates. HHEM's open-source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model). For more details or to contribute, see [this GitHub repo](https://github.com/vectara/hallucination-leaderboard).
+
+     ## Usage
+
+     * All LLMs are displayed by default. To filter, enter the names of the models you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
+     * Results are paginated. To page through, use the `<` or `>` buttons at the bottom right corner of the table.
+     * To sort the table, hover over a column header and click the arrow. The arrow points up or down depending on the sort order.
+     * Click the "Refresh" button to refresh the leaderboard if the table is not shown or does not update when you change the filter.
+
+     Args:
+         filter_models_by_name: filter models by name using comma-separated strings
+     """
+     df = results_df
+
+     filter_models_by_name = filter_models_by_name.replace(",", ";")
+     filter_models_by_name = filter_models_by_name.replace(" ", "")
+     if len(filter_models_by_name) > 0:
+         filter_models_by_name = filter_models_by_name.split(";")
+         filter_models_by_name = [name for name in filter_models_by_name if name != ""]
+         df = df.copy()
+         df = df[df["LLM"].str.contains("|".join(filter_models_by_name), na=False)]
+
+     if len(df) == 0:  # return an empty DF and an empty figure
+         # return pd.DataFrame(), matplotlib.figure.Figure(), Markdown("No models found")
+         return Markdown("No models found"), pd.DataFrame()
+
+     return Markdown(""), df
+
+     fig = visualize_leaderboard(df)  # unreachable; kept for the figure-returning variants below
+
+     # return df, fig
+     # return Markdown(""), fig, df
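The filter normalization in `leaderboard` treats commas and semicolons interchangeably, strips spaces, and joins the surviving names into a regex alternation for `pandas.Series.str.contains`. A self-contained sketch of that behavior (the three model names are taken from results.json; the query string is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"LLM": ["openai/GPT-4o", "google/Gemini-Pro", "anthropic/Claude-2"]})

query = "GPT, Claude"                          # user input with mixed separators
query = query.replace(",", ";").replace(" ", "")
names = [n for n in query.split(";") if n]     # -> ["GPT", "Claude"]

# "|".join(names) builds the alternation pattern "GPT|Claude"
print(df[df["LLM"].str.contains("|".join(names), na=False)])
# keeps openai/GPT-4o and anthropic/Claude-2
```

Note that `str.contains` is case-sensitive by default, so a lowercase query like "gpt" would not match "GPT-4o"; the committed `leaderboard` inherits that behavior.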
app/app_utils.py ADDED
@@ -0,0 +1,193 @@
+ # %%
+ import os
+ import json
+ from huggingface_hub import Repository
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib.figure
+ from sklearn.preprocessing import MinMaxScaler
+
+ import dotenv
+ dotenv.load_dotenv()
+
+ min_max_scaler = MinMaxScaler()
+
+ # %%
+ def pull_results(results_dir: str):
+     repo = Repository(local_dir=results_dir, clone_from="vectara/results", repo_type="dataset", token=os.getenv("HF_TOKEN"))
+     repo.git_pull()
+
+ def extract_info_from_result_file(result_file):
+     """
+     {
+         "config": {
+             "model_dtype": "float16",
+             "model_name": "databricks/dbrx-instruct",
+             "model_sha": "main"
+         },
+         "results": {
+             "hallucination_rate": {
+                 "hallucination_rate": 8.34990059642147
+             },
+             "factual_consistency_rate": {
+                 "factual_consistency_rate": 91.65009940357854
+             },
+             "answer_rate": {
+                 "answer_rate": 100.0
+             },
+             "average_summary_length": {
+                 "average_summary_length": 85.9
+             }
+         }
+     """
+
+     info = json.load(open(result_file, 'r'))
+     result = {
+         "LLM": info["config"]["model_name"],
+         "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
+         # "Factual Consistency Rate": info["results"]["factual_consistency_rate"]["factual_consistency_rate"],
+         "Answer %": info["results"]["answer_rate"]["answer_rate"],
+         "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
+     }
+     return result
+
+ def get_latest_result_file(dir: str):
+     """
+     Get the latest result file in the given directory based on the file's modification time.
+     """
+     if not os.path.isdir(dir):
+         return None
+     files = os.listdir(dir)
+     files = [f for f in files if f.endswith(".json")]
+     if len(files) == 0:
+         return None
+     files.sort(key=lambda x: os.path.getmtime(os.path.join(dir, x)), reverse=True)  # newest first
+     # print ("Scanning: ", dir, "found latest file: ", files[0])
+     return os.path.join(dir, files[0])
+
+ def scan_and_extract(dir: str):
+     """Scan all folders recursively and exhaustively to load all JSON files and call `extract_info_from_result_file` on each one.
+     """
+
+     results = []
+     for root, dirs, files in os.walk(dir):
+         if len(dirs) == 0:
+             continue
+         for subdir in dirs:
+             result_file = get_latest_result_file(os.path.join(root, subdir))
+             if result_file is not None:
+                 results.append(extract_info_from_result_file(result_file))
+     return results
+
+ def load_results(
+     results_dir: str = "./results",
+     results_json: str = "./results.json"
+ ):
+
+     try:
+         pull_results(results_dir)
+         print(f"Successfully pulled results from {results_dir}")
+     except Exception as e:
+         print(f"Failed to pull and/or extract latest results: {e}")
+
+     try:
+         results = scan_and_extract(results_dir)
+         if len(results) > 0:
+             with open(results_json, "w") as f:
+                 json.dump(results, f, indent=2)
+             print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
+         else:
+             print(f"No results found in {results_dir}")
+     except Exception as e:
+         print(f"Failed to scan and extract results from {results_dir}: {e}")
+         print(f"Using pre-dumped results from {results_json}")
+
+     results = json.load(open(results_json, "r"))
+
+     results_df = pd.DataFrame(results)
+     results_df = results_df.sort_values(by="Hallucination %", ascending=True)
+     for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
+         results_df[column] = results_df[column].apply(lambda x: round(x, 3))
+
+     return results_df
+
+ # %%
+ def determine_font_size(LLM: str, hallucination_percent: float) -> float:
+     # based on both the hallucination percent and the LLM name, determine the font size:
+     # if the hallucination percentage is low and the LLM name is long, use a smaller font size
+     name_length = len(LLM)
+     if hallucination_percent < 0.25:
+         if name_length > 10:
+             return 8.5
+         else:
+             return 9
+     else:
+         return 9
+
+ def determine_font_color(hallucination_percent: float) -> str:
+     if hallucination_percent < 0.3:
+         return 'white'
+     elif hallucination_percent < 0.65:
+         return 'black'
+     else:
+         return 'white'
+
+ def determine_llm_x_position(LLM: str, hallucination_percent: float) -> float:
+     # determine the x position of the LLM name:
+     # an LLM's bar length is 10x its hallucination %;
+     # if the LLM name cannot fit in the bar, anchor it near the left edge (x = 0.01);
+     # otherwise, place it at the end of the bar
+
+     name_length = len(LLM)
+     print("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)
+
+     hallu_rate_to_bar_length_ratio = 10
+     bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
+     if name_length > bar_length:
+         return 0.01
+     else:
+         return hallucination_percent
+
+ def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
+     fig = plt.figure(figsize=(5, 4))
+     # plot horizontal bars: LLM on the y-axis,
+     # Hallucination % as the bar length on the x-axis
+     plot_df = df.head(10).copy()
+     plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
+
+     plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
+
+     # for i, row in plot_df.iterrows():
+     #     plt.text(
+     #         determine_llm_x_position(row["LLM"], row["Hallucination %"]),
+     #         row["LLM"],
+     #         f"{row['LLM']}",
+     #         ha='left',
+     #         va='center',
+     #         fontsize=9,
+     #         color=determine_font_color(row["normalized_hallucination_rate"])
+     #     )
+     # plt.yticks([])
+     plt.tight_layout()
+
+     plt.xticks(fontsize=9)
+     # plt.xlabel("Hallucination %", fontsize=9)
+     plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=9)
+     plt.gca().spines['top'].set_visible(False)
+     plt.gca().spines['right'].set_visible(False)
+     plt.gca().spines['left'].set_visible(False)
+     plt.gca().invert_yaxis()  # Invert the y-axis to display bars top-down
+
+     return fig
+
+ # %%
+
+ if __name__ == "__main__":
+     results = scan_and_extract("./results")
+     with open("./results.json", "w") as f:
+         json.dump(results, f, indent=2)
+
+ # %%
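`scan_and_extract` expects the pulled dataset to contain one leaf directory per model, each holding timestamped result JSONs, of which only the newest by modification time is read. A self-contained sketch of that selection rule on a throwaway directory (file names and rates are illustrative):

```python
import json, os, tempfile, time

root = tempfile.mkdtemp()
model_dir = os.path.join(root, "org", "model")
os.makedirs(model_dir)

# Write two result files; the later one should win.
for name, rate in [("old.json", 9.9), ("new.json", 8.3)]:
    with open(os.path.join(model_dir, name), "w") as f:
        json.dump({"config": {"model_name": "org/model"},
                   "results": {"hallucination_rate": {"hallucination_rate": rate}}}, f)
    time.sleep(0.05)  # ensure distinct modification times

# Same rule as get_latest_result_file: newest modification time first.
files = sorted(os.listdir(model_dir),
               key=lambda x: os.path.getmtime(os.path.join(model_dir, x)),
               reverse=True)
print(files[0])  # new.json
```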
app/requirements.txt ADDED
@@ -0,0 +1,8 @@
+ funix==0.6.1
+ pandas
+ python-dotenv
+ huggingface_hub
+ matplotlib
+ scikit-learn
+ ipython
+ git-lfs
app/results.json ADDED
@@ -0,0 +1,860 @@
+ [
+   {
+     "LLM": "gemini-2.0-flash-exp",
+     "Hallucination %": 1.3,
+     "Answer %": 99.9,
+     "Avg Summary Words": 60.0
+   },
+   {
+     "LLM": "deepseek/deepseek-r1",
+     "Hallucination %": 14.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 77.1
+   },
+   {
+     "LLM": "deepseek/deepseek-v3",
+     "Hallucination %": 3.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 88.2
+   },
+   {
+     "LLM": "deepseek/deepseek-chat",
+     "Hallucination %": 2.4,
+     "Answer %": 100.0,
+     "Avg Summary Words": 83.2
+   },
+   {
+     "LLM": "deepseek/deepseek-v3-0324",
+     "Hallucination %": 8.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 78.9
+   },
+   {
+     "LLM": "openai/chatgpt-4o-latest",
+     "Hallucination %": 3.5,
+     "Answer %": 100.0,
+     "Avg Summary Words": 63.5
+   },
+   {
+     "LLM": "openai/GPT-4",
+     "Hallucination %": 1.8050541516245486,
+     "Answer %": 100.0,
+     "Avg Summary Words": 81.1
+   },
+   {
+     "LLM": "openai/o3-mini-high-reasoning",
+     "Hallucination %": 0.7952286282306176,
+     "Answer %": 100.0,
+     "Avg Summary Words": 79.51888667992047
+   },
+   {
+     "LLM": "openai/gpt-4.1-mini",
+     "Hallucination %": 2.2,
+     "Answer %": 100.0,
+     "Avg Summary Words": 79.6
+   },
+   {
+     "LLM": "openai/o1-pro",
+     "Hallucination %": 2.4,
+     "Answer %": 100.0,
+     "Avg Summary Words": 81.0
+   },
+   {
+     "LLM": "openai/gpt-4.1-nano",
+     "Hallucination %": 2.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 70.2
+   },
+   {
+     "LLM": "openai/o1-mini",
+     "Hallucination %": 1.4,
+     "Answer %": 100.0,
+     "Avg Summary Words": 78.3
+   },
+   {
+     "LLM": "openai/GPT-4-Turbo",
+     "Hallucination %": 1.6898608349900597,
+     "Answer %": 100.0,
+     "Avg Summary Words": 86.2
+   },
+   {
+     "LLM": "openai/o3",
+     "Hallucination %": 6.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 77.7
+   },
+   {
+     "LLM": "openai/GPT-3.5-Turbo",
+     "Hallucination %": 1.9,
+     "Answer %": 99.6,
+     "Avg Summary Words": 84.1
+   },
+   {
+     "LLM": "openai/o1",
+     "Hallucination %": 2.4,
+     "Answer %": 99.9,
+     "Avg Summary Words": 73.0
+   },
+   {
+     "LLM": "openai/GPT-4o",
+     "Hallucination %": 1.4910536779324055,
+     "Answer %": 100.0,
+     "Avg Summary Words": 77.8
+   },
+   {
+     "LLM": "openai/GPT-4o-mini",
+     "Hallucination %": 1.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 76.3
+   },
+   {
+     "LLM": "openai/o1-preview",
+     "Hallucination %": 3.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 119.3
+   },
+   {
+     "LLM": "openai/o4-mini",
+     "Hallucination %": 4.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 82.0
+   },
+   {
+     "LLM": "openai/gpt-4.5-preview",
+     "Hallucination %": 1.2,
+     "Answer %": 100.0,
+     "Avg Summary Words": 77.0
+   },
+   {
+     "LLM": "openai/gpt-4.1",
+     "Hallucination %": 2.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 71.9
+   },
+   {
+     "LLM": "Qwen/Qwen2-VL-2B-Instruct",
+     "Hallucination %": 8.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 81.8
+   },
+   {
+     "LLM": "Qwen/Qwen2.5-14B-Instruct",
+     "Hallucination %": 4.2,
+     "Answer %": 100.0,
+     "Avg Summary Words": 74.8
+   },
+   {
+     "LLM": "Qwen/Qwen3-32B",
+     "Hallucination %": 2.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 82.4
+   },
+   {
+     "LLM": "Qwen/Qwen2.5-32B-Instruct",
+     "Hallucination %": 3.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 67.9
+   },
+   {
+     "LLM": "Qwen/QwQ-32B-Preview",
+     "Hallucination %": 12.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 140.2
+   },
+   {
+     "LLM": "Qwen/Qwen3-0.6B",
+     "Hallucination %": 3.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 65.3
+   },
+   {
+     "LLM": "Qwen/Qwen3-14B",
+     "Hallucination %": 2.2,
+     "Answer %": 100.0,
+     "Avg Summary Words": 82.4
+   },
+   {
+     "LLM": "Qwen/Qwen2.5-3B-Instruct",
+     "Hallucination %": 7.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 70.4
+   },
+   {
+     "LLM": "Qwen/Qwen2.5-1.5B-Instruct",
+     "Hallucination %": 15.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 70.7
+   },
+   {
+     "LLM": "Qwen/Qwen2-VL-7B-Instruct",
+     "Hallucination %": 4.2,
+     "Answer %": 100.0,
+     "Avg Summary Words": 73.9
+   },
+   {
+     "LLM": "Qwen/Qwen2.5-0.5B-Instruct",
+     "Hallucination %": 25.2,
+     "Answer %": 100.0,
+     "Avg Summary Words": 72.6
+   },
+   {
+     "LLM": "Qwen/Qwen3-4B",
+     "Hallucination %": 2.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 87.7
+   },
+   {
+     "LLM": "Qwen/Qwen2.5-72B-Instruct",
+     "Hallucination %": 4.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 80.8
+   },
+   {
+     "LLM": "Qwen/Qwen3-8B",
+     "Hallucination %": 3.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 78.2
+   },
+   {
+     "LLM": "Qwen/Qwen3-1.7B",
+     "Hallucination %": 4.4,
+     "Answer %": 100.0,
+     "Avg Summary Words": 69.0
+   },
+   {
+     "LLM": "Qwen/Qwen2-72B-Instruct",
+     "Hallucination %": 4.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 100.1
+   },
+   {
+     "LLM": "Qwen/Qwen2.5-7B-Instruct",
+     "Hallucination %": 2.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 71.0
+   },
+   {
+     "LLM": "allenai/OLMo-2-1124-7B-Instruct",
+     "Hallucination %": 11.1,
+     "Answer %": 100.0,
+     "Avg Summary Words": 112.6
+   },
+   {
+     "LLM": "allenai/OLMo-2-1124-13B-Instruct",
+     "Hallucination %": 10.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 82.0
+   },
+   {
+     "LLM": "allenai/olmo-2-0325-32b-instruct",
+     "Hallucination %": 4.9,
+     "Answer %": 99.9,
+     "Avg Summary Words": 100.0
+   },
+   {
+     "LLM": "amazon/Titan-Express",
+     "Hallucination %": 13.5,
+     "Answer %": 99.5,
+     "Avg Summary Words": 98.4
+   },
+   {
+     "LLM": "amazon/nova-lite-v1",
+     "Hallucination %": 1.8,
+     "Answer %": 99.9,
+     "Avg Summary Words": 80.7
+   },
+   {
+     "LLM": "amazon/nova-pro-v1",
+     "Hallucination %": 1.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 85.5
+   },
+   {
+     "LLM": "amazon/nova-micro-v1",
+     "Hallucination %": 1.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 90.0
+   },
+   {
+     "LLM": "google/gemini-2.5-pro-exp-03-25",
+     "Hallucination %": 1.1,
+     "Answer %": 95.1,
+     "Avg Summary Words": 72.9
+   },
+   {
+     "LLM": "google/PaLM-2",
+     "Hallucination %": 14.1,
+     "Answer %": 99.8,
+     "Avg Summary Words": 86.6
+   },
+   {
+     "LLM": "google/gemma-1.1-2b-it",
+     "Hallucination %": 27.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 66.8
+   },
+   {
+     "LLM": "google/gemini-2.0-flash-thinking-exp",
+     "Hallucination %": 1.8,
+     "Answer %": 99.3,
+     "Avg Summary Words": 73.2
+   },
+   {
+     "LLM": "google/gemma-3-1b-it",
+     "Hallucination %": 5.3,
+     "Answer %": 99.9,
+     "Avg Summary Words": 57.9
+   },
+   {
+     "LLM": "google/gemma-2-2b-it",
+     "Hallucination %": 7.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 62.2
+   },
+   {
+     "LLM": "google/flan-t5-large",
+     "Hallucination %": 18.3,
+     "Answer %": 99.3,
+     "Avg Summary Words": 20.9
+   },
+   {
+     "LLM": "google/gemini-2.5-flash-preview-04-17",
+     "Hallucination %": 1.3,
+     "Answer %": 91.2,
+     "Avg Summary Words": 71.1
+   },
+   {
+     "LLM": "google/Gemini-Pro",
+     "Hallucination %": 7.6767676767676765,
+     "Answer %": 98.4,
+     "Avg Summary Words": 89.5
+   },
+   {
+     "LLM": "google/gemini-1.5-pro-001",
+     "Hallucination %": 9.1,
+     "Answer %": 99.8,
+     "Avg Summary Words": 61.6
+   },
+   {
+     "LLM": "google/gemma-2-9b-it",
+     "Hallucination %": 10.139165009940358,
+     "Answer %": 100.0,
+     "Avg Summary Words": 70.2
+   },
+   {
+     "LLM": "google/gemma-1.1-7b-it",
+     "Hallucination %": 17.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 64.3
+   },
+   {
+     "LLM": "google/gemma-3-4b-it",
+     "Hallucination %": 3.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 63.7
+   },
+   {
+     "LLM": "google/gemini-2.0-pro-exp-02-05",
+     "Hallucination %": 0.8,
+     "Answer %": 99.7,
+     "Avg Summary Words": 61.5
+   },
+   {
+     "LLM": "google/gemini-1.5-pro-002",
+     "Hallucination %": 6.6,
+     "Answer %": 99.9,
+     "Avg Summary Words": 62.0
+   },
+   {
+     "LLM": "google/gemma-3-12b-it",
+     "Hallucination %": 2.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 69.6
+   },
+   {
+     "LLM": "google/gemini-2.0-flash-001",
+     "Hallucination %": 0.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 65.2
+   },
+   {
+     "LLM": "google/gemini-1.5-flash-002",
+     "Hallucination %": 3.4,
+     "Answer %": 99.9,
+     "Avg Summary Words": 59.4
+   },
+   {
+     "LLM": "google/gemma-7b-it",
+     "Hallucination %": 14.81113320079523,
+     "Answer %": 100.0,
+     "Avg Summary Words": 113.0
+   },
+   {
+     "LLM": "google/gemini-2.0-flash-lite-preview-02-05",
+     "Hallucination %": 1.2,
+     "Answer %": 99.5,
+     "Avg Summary Words": 60.9
+   },
+   {
+     "LLM": "google/gemini-1.5-flash-001",
+     "Hallucination %": 6.6,
+     "Answer %": 99.9,
+     "Avg Summary Words": 63.3
+   },
+   {
+     "LLM": "google/gemma-3-27b-it",
+     "Hallucination %": 5.9,
+     "Answer %": 98.5,
+     "Avg Summary Words": 64.3
+   },
+   {
+     "LLM": "snowflake/snowflake-arctic-instruct",
+     "Hallucination %": 3.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 68.7
+   },
+   {
+     "LLM": "01-ai/Yi-1.5-9B-Chat",
+     "Hallucination %": 4.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 85.7
+   },
+   {
+     "LLM": "01-ai/Yi-1.5-6B-Chat",
+     "Hallucination %": 7.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 98.9
+   },
+   {
+     "LLM": "01-ai/Yi-1.5-34B-Chat",
+     "Hallucination %": 3.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 83.7
+   },
+   {
+     "LLM": "ai21labs/AI21-Jamba-1.5-Mini",
+     "Hallucination %": 2.9,
+     "Answer %": 95.6,
+     "Avg Summary Words": 74.5
+   },
+   {
+     "LLM": "cohere/c4ai-aya-expanse-32b",
+     "Hallucination %": 8.5,
+     "Answer %": 99.9,
+     "Avg Summary Words": 81.9
+   },
+   {
+     "LLM": "cohere/command-r-plus-08-2024",
+     "Hallucination %": 5.4,
+     "Answer %": 100.0,
+     "Avg Summary Words": 68.4
+   },
+   {
+     "LLM": "cohere/c4ai-aya-expanse-8b",
+     "Hallucination %": 12.2,
+     "Answer %": 99.9,
+     "Avg Summary Words": 83.9
+   },
+   {
+     "LLM": "cohere/command-a-03-2025",
+     "Hallucination %": 4.5,
+     "Answer %": 100.0,
+     "Avg Summary Words": 77.3
+   },
+   {
+     "LLM": "cohere/command-r-08-2024",
+     "Hallucination %": 4.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 68.7
+   },
+   {
+     "LLM": "Intel/neural-chat-7b-v3-3",
+     "Hallucination %": 2.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 60.7
+   },
+   {
+     "LLM": "mistralai/pixtral-large-latest",
+     "Hallucination %": 6.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 76.4
+   },
+   {
+     "LLM": "mistralai/Mixtral-8x22B-Instruct-v0.1",
+     "Hallucination %": 4.7,
+     "Answer %": 99.9,
+     "Avg Summary Words": 92.0
+   },
+   {
+     "LLM": "mistralai/mistral-small-latest",
+     "Hallucination %": 8.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 74.2
+   },
+   {
+     "LLM": "mistralai/mistral-large-latest",
+     "Hallucination %": 5.864811133200803,
+     "Answer %": 100.0,
+     "Avg Summary Words": 79.55367793240556
+   },
+   {
+     "LLM": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+     "Hallucination %": 20.09950248756219,
+     "Answer %": 99.9,
+     "Avg Summary Words": 90.7
+   },
+   {
+     "LLM": "mistralai/Mistral-Nemo-Instruct-2407",
+     "Hallucination %": 11.2,
+     "Answer %": 100.0,
+     "Avg Summary Words": 69.9
+   },
+   {
+     "LLM": "mistralai/Mistral-Large2",
+     "Hallucination %": 4.1,
+     "Answer %": 100.0,
+     "Avg Summary Words": 77.4
+   },
+   {
+     "LLM": "mistralai/Mistral-7B-Instruct-v0.3",
+     "Hallucination %": 9.5,
+     "Answer %": 100.0,
+     "Avg Summary Words": 98.4
+   },
+   {
+     "LLM": "mistralai/ministral-3b-latest",
+     "Hallucination %": 8.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 73.2
+   },
+   {
+     "LLM": "mistralai/ministral-8b-latest",
+     "Hallucination %": 7.5,
+     "Answer %": 100.0,
+     "Avg Summary Words": 62.7
+   },
+   {
+     "LLM": "mistralai/Mistral-Small-24B-Instruct-2501",
+     "Hallucination %": 3.1,
+     "Answer %": 100.0,
+     "Avg Summary Words": 74.9
+   },
+   {
+     "LLM": "mistralai/mistral-small-3.1-24b-instruct",
+     "Hallucination %": 5.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 73.1
+   },
+   {
+     "LLM": "anthropic/Claude-3-5-Sonnet",
+     "Hallucination %": 8.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 103.0
+   },
+   {
+     "LLM": "anthropic/claude-3-7-sonnet-latest",
+     "Hallucination %": 4.4,
+     "Answer %": 100.0,
+     "Avg Summary Words": 97.8
+   },
+   {
+     "LLM": "anthropic/Claude-3-opus",
+     "Hallucination %": 10.092687950566425,
+     "Answer %": 95.5,
+     "Avg Summary Words": 92.1
+   },
+   {
+     "LLM": "anthropic/Claude-2",
+     "Hallucination %": 17.448856799037305,
+     "Answer %": 99.3,
+     "Avg Summary Words": 87.5
+   },
+   {
+     "LLM": "anthropic/claude-3-5-haiku-20241022",
+     "Hallucination %": 4.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 92.2
+   },
+   {
+     "LLM": "anthropic/Claude-3-sonnet",
+     "Hallucination %": 16.302186878727635,
+     "Answer %": 100.0,
+     "Avg Summary Words": 108.5
+   },
+   {
+     "LLM": "anthropic/claude-3-7-sonnet-latest-think",
+     "Hallucination %": 4.5,
+     "Answer %": 99.8,
+     "Avg Summary Words": 99.9
+   },
+   {
+     "LLM": "ai21/jamba-1.6-mini",
+     "Hallucination %": 4.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 82.3
+   },
+   {
+     "LLM": "ai21/jamba-1.6-large",
+     "Hallucination %": 2.3,
+     "Answer %": 99.9,
+     "Avg Summary Words": 85.6
+   },
+   {
+     "LLM": "qwen/qwen3-235b-a22b",
+     "Hallucination %": 13.0,
+     "Answer %": 99.2,
+     "Avg Summary Words": 86.6
+   },
+   {
+     "LLM": "qwen/qwen-max",
+     "Hallucination %": 2.9,
+     "Answer %": 88.4,
+     "Avg Summary Words": 90.4
+   },
+   {
+     "LLM": "qwen/qwen3-30b-a3b",
+     "Hallucination %": 7.6,
+     "Answer %": 99.9,
+     "Avg Summary Words": 69.9
+   },
+   {
+     "LLM": "x-ai/grok-2-1212",
+     "Hallucination %": 1.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 86.5
+   },
+   {
+     "LLM": "x-ai/grok-2-vision-1212",
+     "Hallucination %": 2.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 79.8
+   },
+   {
+     "LLM": "databricks/dbrx-instruct",
+     "Hallucination %": 8.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 85.9
+   },
+   {
+     "LLM": "xai/grok-3-mini-latest",
+     "Hallucination %": 3.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 90.2
+   },
+   {
+     "LLM": "xai/grok-beta",
+     "Hallucination %": 4.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 91.0
+   },
+   {
+     "LLM": "xai/grok-3-latest",
+     "Hallucination %": 2.1,
+     "Answer %": 100.0,
+     "Avg Summary Words": 97.7
+   },
+   {
+     "LLM": "apple/OpenELM-3B-Instruct",
+     "Hallucination %": 24.776119402985074,
+     "Answer %": 99.3,
+     "Avg Summary Words": 47.2
+   },
+   {
+     "LLM": "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+     "Hallucination %": 7.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 72.2
+   },
+   {
+     "LLM": "meta-llama/Llama-2-70b-chat-hf",
+     "Hallucination %": 5.896510228640193,
+     "Answer %": 99.9,
+     "Avg Summary Words": 84.9
+   },
+   {
+     "LLM": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+     "Hallucination %": 3.9,
+     "Answer %": 99.6,
+     "Avg Summary Words": 85.7
+   },
+   {
+     "LLM": "meta-llama/Llama-3.3-70B-Instruct",
+     "Hallucination %": 4.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 85.3
+   },
+   {
+     "LLM": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+     "Hallucination %": 5.4,
+     "Answer %": 100.0,
+     "Avg Summary Words": 71.0
+   },
+   {
+     "LLM": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+     "Hallucination %": 5.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 79.6
+   },
+   {
+     "LLM": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+     "Hallucination %": 8.9,
+     "Answer %": 100.0,
+     "Avg Summary Words": 73.1
+   },
+   {
+     "LLM": "meta-llama/Llama-3.2-1B-Instruct",
+     "Hallucination %": 20.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 71.5
+   },
+   {
+     "LLM": "meta-llama/Llama-3-70B-chat-hf",
+     "Hallucination %": 4.1,
+     "Answer %": 99.2,
+     "Avg Summary Words": 68.5
+   },
+   {
+     "LLM": "meta-llama/Llama-3-8B-chat-hf",
+     "Hallucination %": 7.370517928286853,
+     "Answer %": 99.8,
+     "Avg Summary Words": 79.7
+   },
+   {
+     "LLM": "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+     "Hallucination %": 4.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 79.8
+   },
+   {
+     "LLM": "meta-llama/llama-4-scout",
+     "Hallucination %": 4.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 80.7
+   },
+   {
+     "LLM": "meta-llama/Llama-2-7b-chat-hf",
+     "Hallucination %": 11.3,
+     "Answer %": 99.6,
+     "Avg Summary Words": 119.9
+   },
+   {
+     "LLM": "meta-llama/Llama-2-13b-chat-hf",
+     "Hallucination %": 10.5,
+     "Answer %": 99.8,
+     "Avg Summary Words": 82.1
+   },
+   {
+     "LLM": "meta-llama/llama-4-maverick",
+     "Hallucination %": 4.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 84.8
+   },
+   {
+     "LLM": "microsoft/Orca-2-13b",
+     "Hallucination %": 2.5,
+     "Answer %": 100.0,
+     "Avg Summary Words": 66.2
+   },
+   {
+     "LLM": "microsoft/Phi-3.5-MoE-instruct",
+     "Hallucination %": 2.5,
+     "Answer %": 96.3,
+     "Avg Summary Words": 69.7
+   },
+   {
+     "LLM": "microsoft/Phi-3-mini-4k-instruct",
+     "Hallucination %": 3.9761431411530817,
+     "Answer %": 100.0,
+     "Avg Summary Words": 86.8
+   },
+   {
+     "LLM": "microsoft/phi-4",
+     "Hallucination %": 4.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 100.3
+   },
+   {
+     "LLM": "microsoft/Phi-3.5-mini-instruct",
+     "Hallucination %": 4.1,
+     "Answer %": 100.0,
+     "Avg Summary Words": 75.0
+   },
+   {
+     "LLM": "microsoft/Phi-3-mini-128k-instruct",
+     "Hallucination %": 3.1,
+     "Answer %": 100.0,
+     "Avg Summary Words": 60.1
+   },
+   {
+     "LLM": "microsoft/Phi-4-mini-instruct",
+     "Hallucination %": 3.4,
+     "Answer %": 100.0,
+     "Avg Summary Words": 69.7
+   },
+   {
+     "LLM": "microsoft/WizardLM-2-8x22B",
+     "Hallucination %": 11.741293532338307,
+     "Answer %": 99.9,
+     "Avg Summary Words": 140.8
+   },
+   {
+     "LLM": "microsoft/phi-2",
+     "Hallucination %": 6.666666666666667,
+     "Answer %": 91.5,
+     "Avg Summary Words": 80.8
+   },
+   {
+     "LLM": "THUDM/glm-4-9b-chat",
+     "Hallucination %": 1.3,
+     "Answer %": 100.0,
+     "Avg Summary Words": 58.1
+   },
+   {
+     "LLM": "internlm/internlm3-8b-instruct",
+     "Hallucination %": 4.0,
+     "Answer %": 100.0,
+     "Avg Summary Words": 97.5
+   },
+   {
+     "LLM": "ibm-granite/granite-3.1-8b-instruct",
+     "Hallucination %": 8.6,
+     "Answer %": 100.0,
+     "Avg Summary Words": 107.4
+   },
+   {
+     "LLM": "ibm-granite/granite-3.2-2b-instruct",
+     "Hallucination %": 16.5,
+     "Answer %": 100.0,
+     "Avg Summary Words": 117.3
+   },
+   {
+     "LLM": "ibm-granite/granite-3.1-2b-instruct",
+     "Hallucination %": 15.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 107.7
+   },
+   {
+     "LLM": "ibm-granite/granite-3.0-2b-instruct",
+     "Hallucination %": 8.8,
+     "Answer %": 100.0,
+     "Avg Summary Words": 81.6
+   },
+   {
+     "LLM": "ibm-granite/granite-3.0-8b-instruct",
+     "Hallucination %": 6.5,
+     "Answer %": 100.0,
+     "Avg Summary Words": 74.2
+   },
+   {
+     "LLM": "ibm-granite/granite-3.2-8b-instruct",
+     "Hallucination %": 8.7,
+     "Answer %": 100.0,
+     "Avg Summary Words": 120.1
+   },
+   {
+     "LLM": "tiiuae/falcon-7b-instruct",
+     "Hallucination %": 29.92047713717694,
+     "Answer %": 90.0,
+     "Avg Summary Words": 75.5
+   }
+ ]
app/vectara_theme.py ADDED
@@ -0,0 +1,29 @@
+ vectara_theme = {
+     "name": "vectara",
+     "funix": {
+         "run_button": "Refresh",
+         "grid_height": 960,
+         "grid_checkbox": False
+     },
+     "overrides": {
+         "MuiAppBar": {
+             "styleOverrides": {
+                 "root": {
+                     "backgroundColor": "#ffffff",
+                     "color": "rgba(0, 0, 0, 0.87)",
+                     "& .MuiToolbar-root:before": {
+                         "content": '""',
+                         "background": "url('https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png')",
+                         "display": "block",
+                         "background-size": "contain",
+                         "background-repeat": "no-repeat",
+                         "background-position": "left",
+                         "width": "125px",
+                         "height": "40px",
+                         "margin-right": "10px",
+                     },
+                 },
+             }
+         },
+     },
+ }
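As app/app.py in this commit shows, the dict is registered once with `import_theme(vectara_theme)` and then referenced by its `"name"` key in the decorator. A minimal sketch mirroring that wiring (the page function is hypothetical):

```python
from funix import funix, import_theme
from vectara_theme import vectara_theme

import_theme(vectara_theme)   # registers the theme under its "name" key, "vectara"

@funix(theme="vectara")       # resolved by name, as in app/app.py
def hello() -> str:           # hypothetical page, for illustration only
    return "Hello from a Vectara-styled page"
```

The `"funix"` block customizes Funix-level chrome (run-button label, grid height), while `"overrides"` appears to be passed through to Material UI's `styleOverrides`, here drawing the Vectara logo before the toolbar content.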