aloe-vera committed
Commit 27b730c · verified · 1 parent: d65adcf

leaderboard v1

Files changed (2):
  1. src/envs.py +26 -25
  2. src/populate.py +90 -58
src/envs.py CHANGED
@@ -1,25 +1,26 @@
- import os
-
- from huggingface_hub import HfApi
-
- # Info to change for your repository
- # ----------------------------------
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
- # ----------------------------------
-
- REPO_ID = f"{OWNER}/leaderboard"
- QUEUE_REPO = f"{OWNER}/requests"
- RESULTS_REPO = f"{OWNER}/results"
-
- # If you setup a cache later, just change HF_HOME
- CACHE_PATH=os.getenv("HF_HOME", ".")
-
- # Local caches
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-
- API = HfApi(token=TOKEN)
+ import os
+
+ from huggingface_hub import HfApi
+
+ # Info to change for your repository
+ # ----------------------------------
+ TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+
+ # OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+ OWNER = "kluster-ai"
+ # ----------------------------------
+
+ REPO_ID = f"{OWNER}/LLM-Hallucination-Detection-Leaderboard"
+ QUEUE_REPO = f"{OWNER}/requests"
+ RESULTS_REPO = f"{OWNER}/results"
+
+ # If you set up a cache later, just change HF_HOME
+ CACHE_PATH = os.getenv("HF_HOME", ".")
+
+ # Local caches
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+
+ API = HfApi(token=TOKEN)
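
The new configuration points the Space at the kluster-ai org, and the template's own comment still applies: the requests and results datasets must exist under that owner before the app starts. A minimal sanity-check sketch, assuming (as in the upstream demo-leaderboard template) that REPO_ID is the Space itself and that the queue and results repos are datasets; it expects HF_TOKEN to be exported, as src/envs.py does, and uses the standard huggingface_hub repo_exists call:

import os

from huggingface_hub import HfApi

# Assumption: HF_TOKEN is set in the environment, as src/envs.py expects.
api = HfApi(token=os.environ.get("HF_TOKEN"))

OWNER = "kluster-ai"
# Assumption: the leaderboard repo is a Space; the queue and results are datasets.
assert api.repo_exists(f"{OWNER}/LLM-Hallucination-Detection-Leaderboard", repo_type="space")
for repo in (f"{OWNER}/requests", f"{OWNER}/results"):
    assert api.repo_exists(repo, repo_type="dataset"), f"missing dataset: {repo}"

If either assert fires, create the missing repo with the format the template expects before deploying the Space.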
src/populate.py CHANGED
@@ -1,58 +1,90 @@
- import json
- import os
-
- import pandas as pd
-
- from src.display.formatting import has_no_nan_values, make_clickable_model
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
- from src.leaderboard.read_evals import get_raw_eval_results
-
-
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-     """Creates a dataframe from all the individual experiment results"""
-     raw_data = get_raw_eval_results(results_path, requests_path)
-     all_data_json = [v.to_dict() for v in raw_data]
-
-     df = pd.DataFrame.from_records(all_data_json)
-     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-     df = df[cols].round(decimals=2)
-
-     # filter out if any of the benchmarks have not been produced
-     df = df[has_no_nan_values(df, benchmark_cols)]
-     return df
-
-
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-     """Creates the different dataframes for the evaluation queues requestes"""
-     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-     all_evals = []
-
-     for entry in entries:
-         if ".json" in entry:
-             file_path = os.path.join(save_path, entry)
-             with open(file_path) as fp:
-                 data = json.load(fp)
-
-             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-             all_evals.append(data)
-         elif ".md" not in entry:
-             # this is a folder
-             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-             for sub_entry in sub_entries:
-                 file_path = os.path.join(save_path, entry, sub_entry)
-                 with open(file_path) as fp:
-                     data = json.load(fp)
-
-                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                 all_evals.append(data)
-
-     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-     df_running = pd.DataFrame.from_records(running_list, columns=cols)
-     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-     return df_finished[cols], df_running[cols], df_pending[cols]
+ import json
+ import os
+
+ import pandas as pd
+
+ from src.display.formatting import has_no_nan_values, make_clickable_model
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
+ from src.leaderboard.read_evals import get_raw_eval_results
+
+
+ def get_leaderboard_df(results_path: str) -> pd.DataFrame:
+     """Creates the leaderboard dataframe from a flat results CSV"""
+     df = pd.read_csv(results_path)
+     # numeric formatting
+     df["ha_rag_rate"] = df["ha_rag_rate"].round(2)
+     df["ha_non_rag_rate"] = df["ha_non_rag_rate"].round(2)
+
+     # --- map to pretty headers just before returning ---
+     pretty = {
+         "Models": "Models",
+         "ha_rag_rate": "RAG Hallucination Rate (%)",
+         "ha_non_rag_rate": "Non-RAG Hallucination Rate (%)",
+     }
+     df = df.rename(columns=pretty)  # this is what the UI will use
+
+     # --- Average column & ranking ---
+     df["Average Hallucination Rate (%)"] = df[
+         ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]
+     ].mean(axis=1).round(2)
+
+     # sort so a *lower* average is better (true leaderboard style)
+     df = df.sort_values("Average Hallucination Rate (%)", ascending=True).reset_index(drop=True)
+
+     # Rank & medal
+     medal_map = {1: "🥇", 2: "🥈", 3: "🥉"}
+
+     def medal_html(rank):
+         m = medal_map.get(rank)
+         return f'<span style="font-size:2.0rem;">{m}</span>' if m else rank
+
+     df["Rank"] = df.index + 1
+     df["Rank"] = df["Rank"].apply(medal_html)
+
+     # --- column ordering ---
+     df = df[[
+         "Rank",  # pretty column the user sees
+         "Models",
+         "Average Hallucination Rate (%)",
+         "RAG Hallucination Rate (%)",
+         "Non-RAG Hallucination Rate (%)",
+     ]]
+
+     return df
+
+
+ def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+     """Creates the different dataframes for the evaluation queue requests"""
+     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+     all_evals = []
+
+     for entry in entries:
+         if entry.endswith(".json"):
+             file_path = os.path.join(save_path, entry)
+             with open(file_path) as fp:
+                 data = json.load(fp)
+
+             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+             all_evals.append(data)
+         elif ".md" not in entry:
+             # this is a folder; test each child against its full path, not the bare name
+             sub_entries = [
+                 e
+                 for e in os.listdir(os.path.join(save_path, entry))
+                 if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")
+             ]
+             for sub_entry in sub_entries:
+                 file_path = os.path.join(save_path, entry, sub_entry)
+                 with open(file_path) as fp:
+                     data = json.load(fp)
+
+                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                 all_evals.append(data)
+
+     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+     df_running = pd.DataFrame.from_records(running_list, columns=cols)
+     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+     return df_finished[cols], df_running[cols], df_pending[cols]
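
The rewritten get_leaderboard_df no longer aggregates per-model JSON results through get_raw_eval_results; it reads a flat CSV with Models, ha_rag_rate, and ha_non_rag_rate columns. A quick usage sketch with invented values (the column names come from the diff above; the model names and rates are made up, and pd.read_csv accepts an in-memory buffer in place of a real path):

import io

from src.populate import get_leaderboard_df

# Two fake rows matching the CSV schema the new function expects.
sample_csv = io.StringIO(
    "Models,ha_rag_rate,ha_non_rag_rate\n"
    "model-a,3.217,7.984\n"
    "model-b,1.052,4.311\n"
)

df = get_leaderboard_df(sample_csv)
print(df.to_string(index=False))
# model-b sorts first: its rounded average rate is (1.05 + 4.31) / 2 = 2.68,
# lower than model-a's 5.60, so its Rank cell renders the 🥇 span.

Note that the Rank column holds HTML strings for the top three rows and plain integers below that, so it is display-only; any further sorting or joins should key on the rate columns instead.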