sync from github
Changed files:
- README.md (+1 -1)
- app.py (+18 -17)
- requirements.txt (+1 -1)
- src/leaderboard/read_evals.py (+16 -9)
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🔥
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.36.1
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py CHANGED

@@ -75,7 +75,7 @@ def restart_space():


 def init_space():
-    dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
+    # dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")

     if socket.gethostname() not in {"neuromancer"}:
         # sync model_type with open-llm-leaderboard

@@ -90,7 +90,8 @@ def init_space():
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
         EVAL_REQUESTS_PATH, EVAL_COLS
     )
-    return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    # return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df


 def add_benchmark_columns(shown_columns):

@@ -353,21 +354,21 @@ with demo:
             queue=True,
         )

-        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-            dataset_table = gr.components.Dataframe(
-                value=dataset_df,
-                headers=list(dataset_df.columns),
-                datatype=["str", "markdown", "str", "str", "str"],
-                elem_id="dataset-table",
-                interactive=False,
-                visible=True,
-                column_widths=["15%", "20%"],
-            )
-
-            gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
-            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
+        # with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        #     dataset_table = gr.components.Dataframe(
+        #         value=dataset_df,
+        #         headers=list(dataset_df.columns),
+        #         datatype=["str", "markdown", "str", "str", "str"],
+        #         elem_id="dataset-table",
+        #         interactive=False,
+        #         visible=True,
+        #         column_widths=["15%", "20%"],
+        #     )
+
+        #     gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
+        #     gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")

         with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
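Taken together, the app.py changes disable the dataset summary: init_space() keeps its five-element return shape but now returns None where the Hallucination-Leaderboard summary dataframe used to be, and the About tab that rendered it is commented out to match. Below is a minimal, self-contained sketch of that pattern; the helper name and the CSV-loading stand-in are illustrative, not code from this Space.

from typing import Optional

import pandas as pd


def load_summary_table(csv_path: Optional[str]) -> Optional[pd.DataFrame]:
    """Illustrative stand-in for get_dataset_summary_table(): returns None when disabled."""
    if csv_path is None:
        return None
    return pd.read_csv(csv_path)


# Mirrors the commit: the summary CSV is no longer loaded, so the first element
# of the init_space() tuple is None and downstream UI code must tolerate that.
dataset_df = load_summary_table(None)

if dataset_df is None:
    print("Dataset summary disabled; skip building the About-tab dataframe.")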
requirements.txt CHANGED

@@ -4,7 +4,7 @@ APScheduler
 black
 click
 datasets
-gradio
+gradio==4.36.1
 gradio_client
 huggingface-hub
 matplotlib
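The requirements.txt pin mirrors the sdk_version: 4.36.1 set in README.md, so local installs and the Spaces runtime resolve the same Gradio release. A quick sanity check one could run after installing (nothing in the repo does this automatically; it is only an illustration):

import gradio as gr

# Expect "4.36.1" once requirements.txt and the Space's sdk_version agree.
print(gr.__version__)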
src/leaderboard/read_evals.py CHANGED

@@ -277,15 +277,22 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool

     eval_results = {}
     for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
-        eval_result.update_with_request_file(requests_path)
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+        try:
+            # Creation of result
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
+            eval_result.update_with_request_file(requests_path)
+
+            # Store results of same eval together
+            eval_name = eval_result.eval_name
+            if eval_name in eval_results.keys():
+                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            else:
+                eval_results[eval_name] = eval_result
+
+        except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
+            # Log the error and continue with the next file
+            print(f"Error processing file {model_result_filepath}: {e}")
+            continue

     results = []
     for v in eval_results.values():
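The try/except added to get_raw_eval_results means one missing or malformed result file is logged and skipped instead of aborting the whole leaderboard build; the guard references json.JSONDecodeError and therefore assumes read_evals.py imports json. A standalone sketch of the same skip-and-log pattern follows; the directory layout and the "config"/"results" keys are illustrative, not the leaderboard's actual schema.

import json
from pathlib import Path


def read_result_files(results_dir: str) -> dict[str, dict]:
    """Collect per-model results, skipping files that are missing, malformed, or incomplete."""
    results: dict[str, dict] = {}
    for path in Path(results_dir).glob("**/*.json"):
        try:
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
            # KeyError plays the same role here as in the real loader: a file can be
            # valid JSON and still lack the fields the leaderboard expects.
            model_name = data["config"]["model_name"]
            results[model_name] = data["results"]
        except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
            print(f"Error processing file {path}: {e}")
            continue
    return results

Since json.JSONDecodeError is a subclass of ValueError, the exception tuple is slightly redundant, but listing both keeps the intent explicit and matches the diff.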