Lisa Dunlap committed · Commit 13ecd9b · 1 parent: 0ba05dc
moved delta to new column, updated ranking

app.py CHANGED
@@ -26,6 +26,8 @@ def make_default_md(arena_df, elo_results):
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
 We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system. Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
+
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
 """
     return leaderboard_md
 
@@ -213,32 +215,57 @@ def get_full_table(arena_df, model_table_df):
 
 def create_ranking_str(ranking, ranking_difference):
     if ranking_difference > 0:
-        return f"{int(ranking)} (\u2191{int(ranking_difference)})"
+        # return f"{int(ranking)} (\u2191{int(ranking_difference)})"
+        return f"{int(ranking)} \u2191"
     elif ranking_difference < 0:
-        return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
+        # return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
+        return f"{int(ranking)} \u2193"
     else:
         return f"{int(ranking)}"
 
+def recompute_final_ranking(arena_df):
+    # compute ranking based on CI
+    ranking = {}
+    for i, model_a in enumerate(arena_df.index):
+        ranking[model_a] = 1
+        for j, model_b in enumerate(arena_df.index):
+            if i == j:
+                continue
+            if arena_df.loc[model_b]["rating_q025"] > arena_df.loc[model_a]["rating_q975"]:
+                ranking[model_a] += 1
+    return list(ranking.values())
+
 def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
-
-    arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
+    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
     arena_df = arena_df[arena_df["num_battles"] > 2000]
+    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
+    arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
 
     # arena_df["final_ranking"] = range(1, len(arena_df) + 1)
     # sort by rating
     if arena_subset_df is not None:
         # filter out models not in the arena_df
         arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
-
-        arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
+        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
+        # arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
+        # arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
+        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
+        # keep only the models in the subset in arena_df and recompute final_ranking
+        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
+        # recompute final ranking
+        arena_df["final_ranking"] = recompute_final_ranking(arena_df)
 
         # assign ranking by the order
-
+        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
+        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
         # join arena_df and arena_subset_df on index
-        arena_df = arena_subset_df.join(arena_df["rating"], rsuffix="_global", how="inner")
-        arena_df['ranking_difference'] = arena_df['rating_global'] - arena_df['rating']
+        arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
+        # arena_df = arena_subset_df.join(arena_df["rating"], rsuffix="_global", how="inner")
+        arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
+        # arena_df['ranking_difference'] = arena_df['rating_global'] - arena_df['rating']
+        arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
         arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
-
+
     values = []
     for i in range(len(arena_df)):
         row = []
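The new `recompute_final_ranking` ranks models by confidence-interval overlap rather than by raw rating order: a model's rank is 1 plus the number of models whose lower bound (`rating_q025`) exceeds its upper bound (`rating_q975`), so models with overlapping intervals share a rank. A runnable sketch of that behavior on invented numbers (model names and ratings below are illustrative, not leaderboard data):

```python
import pandas as pd

# Hypothetical CI bounds for three models (illustrative values only).
toy = pd.DataFrame(
    {
        "rating_q025": [1190, 1180, 1120],
        "rating_q975": [1210, 1205, 1140],
    },
    index=["model-a", "model-b", "model-c"],
)

def recompute_final_ranking(arena_df):
    # rank = 1 + number of models whose lower CI bound clears this model's upper CI bound
    ranking = {}
    for i, model_a in enumerate(arena_df.index):
        ranking[model_a] = 1
        for j, model_b in enumerate(arena_df.index):
            if i == j:
                continue
            if arena_df.loc[model_b]["rating_q025"] > arena_df.loc[model_a]["rating_q975"]:
                ranking[model_a] += 1
    return list(ranking.values())

print(recompute_final_ranking(toy))  # [1, 1, 3]: a and b tie; both CIs clear c
```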
@@ -247,10 +274,11 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
         model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
             0
         ]
-
         # rank
         ranking = arena_df.iloc[i].get("final_ranking") or i+1
         row.append(ranking)
+        if arena_subset_df is not None:
+            row.append(arena_df.iloc[i].get("ranking_difference") or 0)
         # model display name
         row.append(model_name)
         # elo rating
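The `or` fallbacks in this row assembly lean on Python truthiness: `final_ranking` is a non-empty string (always truthy), while a Δ of 0 is falsy and falls through to the default 0, which is the same value anyway. A standalone illustration of the idiom (a plain dict stands in for the pandas row):

```python
row = {"final_ranking": "4 \u2191", "ranking_difference": 0}

ranking = row.get("final_ranking") or 99    # "4 ↑" is truthy -> kept
delta = row.get("ranking_difference") or 0  # 0 is falsy -> falls back, still 0
missing = row.get("not_a_column") or 0      # absent key -> None -> 0

print(ranking, delta, missing)  # 4 ↑ 0 0
```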
@@ -272,7 +300,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
         row.append(
             model_table_df[model_table_df["key"] == model_key]["License"].values[0]
         )
-
         cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
         if cutoff_date == "-":
             row.append("Unknown")
@@ -421,13 +448,85 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
         else:
             pass
 
+        def update_leaderboard_df(arena_table_vals):
+            elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "Δ", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"])
+
+            # goal: color the rows based on the rank with styler
+            def highlight_max(s):
+                # all items in S which contain up arrow should be green, down arrow should be red, otherwise black
+                return ["color: green; font-weight: bold" if "\u2191" in v else "color: red; font-weight: bold" if "\u2193" in v else "" for v in s]
+
+            def highlight_rank_max(s):
+                return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
+
+            return elo_datarame.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["Δ"])
+
         def update_leaderboard_and_plots(category):
             arena_subset_df = arena_dfs[category]
+            arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
             elo_subset_results = category_elo_results[category]
             arena_df = arena_dfs["Total"]
-            arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df)
+            arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df = arena_subset_df if category != "Total" else None)
             if category != "Total":
                 arena_values = update_leaderboard_df(arena_values)
+                arena_values = gr.Dataframe(
+                    headers=[
+                        "Rank",
+                        "Δ",
+                        "🤖 Model",
+                        "⭐ Arena Elo",
+                        "📊 95% CI",
+                        "🗳️ Votes",
+                        "Organization",
+                        "License",
+                        "Knowledge Cutoff",
+                    ],
+                    datatype=[
+                        "str",
+                        "number",
+                        "markdown",
+                        "number",
+                        "str",
+                        "number",
+                        "str",
+                        "str",
+                        "str",
+                    ],
+                    value=arena_values,
+                    elem_id="arena_leaderboard_dataframe",
+                    height=700,
+                    column_widths=[50, 50, 190, 110, 100, 90, 160, 150, 140],
+                    wrap=True,
+                )
+            else:
+                arena_values = gr.Dataframe(
+                    headers=[
+                        "Rank",
+                        "🤖 Model",
+                        "⭐ Arena Elo",
+                        "📊 95% CI",
+                        "🗳️ Votes",
+                        "Organization",
+                        "License",
+                        "Knowledge Cutoff",
+                    ],
+                    datatype=[
+                        "str",
+                        "markdown",
+                        "number",
+                        "str",
+                        "number",
+                        "str",
+                        "str",
+                        "str",
+                    ],
+                    value=arena_values,
+                    elem_id="arena_leaderboard_dataframe",
+                    height=700,
+                    column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
+                    wrap=True,
+                )
+
             p1 = elo_subset_results["win_fraction_heatmap"]
             p2 = elo_subset_results["battle_count_heatmap"]
             p3 = elo_subset_results["bootstrap_elo_rating"]
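The re-added `update_leaderboard_df` uses pandas Styler machinery: `DataFrame.style.apply` with the default `axis=0` passes each column of the `subset` to the function, which must return one CSS string per cell; the resulting Styler is what gets handed to `gr.Dataframe` as its `value`. A self-contained sketch of the same coloring scheme (column names and values are made up; `Delta` stands in for the Δ column):

```python
import pandas as pd

df = pd.DataFrame({"Rank": ["1", "2 \u2191", "3 \u2193"], "Delta": [0, 1, -1]})

def color_arrows(s):
    # green for up arrows, red for down arrows, default color otherwise
    return [
        "color: green; font-weight: bold" if "\u2191" in v
        else "color: red; font-weight: bold" if "\u2193" in v
        else ""
        for v in s
    ]

def color_sign(s):
    # same idea for the numeric delta column: sign decides the color
    return [
        "color: green; font-weight: bold" if v > 0
        else "color: red; font-weight: bold" if v < 0
        else ""
        for v in s
    ]

styled = df.style.apply(color_arrows, subset=["Rank"]).apply(color_sign, subset=["Delta"])
print(styled.to_html()[:200])  # Styler renders to HTML with the per-cell CSS attached
```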
@@ -436,18 +535,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
             """
             leaderboard_md = make_category_arena_leaderboard_md(arena_df, arena_subset_df, name=category)
             return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
-
-        def update_leaderboard_df(arena_table_vals):
-            elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "Model", "Arena Elo", "95% CI", "Votes", "Organization", "License", "Knowledge Cutoff"])
-
-            # goal: color the rows based on the rank with styler
-            def highlight_max(s):
-                # all items in S which contain up arrow should be green, down arrow should be red, otherwise black
-                return ["color: green" if "\u2191" in v else "color: red" if "\u2193" in v else "" for v in s]
-
-            styled_df = elo_datarame.style.apply(highlight_max, subset=["Rank"])
-
-            return styled_df
 
         category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, category_deets])
 
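The `category_dropdown.change(...)` call at the end is standard Gradio event wiring: the callback's return tuple is mapped positionally onto the `outputs` components. A minimal sketch of the same pattern (component names and the two-output callback are placeholders, not the app's):

```python
import gradio as gr
import pandas as pd

def on_category(category):
    # stand-in for update_leaderboard_and_plots: return one value per output component
    df = pd.DataFrame({"Rank": [1, 2], "Model": ["model-a", "model-b"]})
    note = f"Showing category: {category}"
    return df, note

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(["Total", "Coding"], value="Total", label="Category")
    table = gr.Dataframe()
    md = gr.Markdown()
    # first return value -> table, second -> md
    dropdown.change(on_category, inputs=[dropdown], outputs=[table, md])

# demo.launch()  # uncomment to serve the demo locally
```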