# agent-leaderboard/tabs/leaderboard.py
# Last commit f226f06 (pratikbhavsar): added sonnet and improved data explorer
import gradio as gr
from data_loader import CATEGORIES, DESCRIPTION_HTML, CARDS
from visualization import (
get_performance_chart,
get_performance_cost_chart,
)
from utils import (
get_rank_badge,
get_score_bar,
get_type_badge,
)
# Static opening markup for the leaderboard: theme-aware CSS, the pricing note,
# and the table header. It contains no per-request data, so it is built once.
_LEADERBOARD_TABLE_HEAD = """
    <style>
        @media (prefers-color-scheme: dark) {
            :root {
                --bg-color: #1a1b1e;
                --text-color: #ffffff;
                --border-color: #2d2e32;
                --hover-bg: #2d2e32;
                --note-bg: #2d2e32;
                --note-text: #a1a1aa;
                --accent-blue: #60A5FA;
                --accent-purple: #A78BFA;
                --accent-pink: #F472B6;
                --score-bg: rgba(255, 255, 255, 0.1);
            }
        }
        @media (prefers-color-scheme: light) {
            :root {
                --bg-color: #ffffff;
                --text-color: #000000;
                --border-color: #e5e7eb;
                --hover-bg: #f3f4f6;
                --note-bg: #f3f4f6;
                --note-text: #4b5563;
                --accent-blue: #3B82F6;
                --accent-purple: #8B5CF6;
                --accent-pink: #EC4899;
                --score-bg: rgba(0, 0, 0, 0.1);
            }
        }
        .dark-table-container {
            background: var(--bg-color);
            border-radius: 12px;
            padding: 1px;
            margin: 20px 0;
        }
        .dark-styled-table {
            width: 100%;
            border-collapse: collapse;
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
            background: var(--bg-color);
            color: var(--text-color);
        }
        .dark-styled-table thead {
            position: sticky;
            top: 0;
            background: var(--bg-color);
            z-index: 1;
        }
        .dark-styled-table th {
            padding: 16px;
            text-align: left;
            font-weight: 500;
            color: var(--text-color);
            border-bottom: 1px solid var(--border-color);
        }
        .dark-styled-table td {
            padding: 16px;
            border-bottom: 1px solid var(--border-color);
            color: var(--text-color);
        }
        .dark-styled-table tbody tr:hover {
            background: var(--hover-bg);
        }
        .model-cell {
            font-weight: 500;
        }
        .score-cell {
            font-weight: 500;
        }
        .note-box {
            margin-top: 20px;
            padding: 16px;
            background: var(--note-bg);
            border-radius: 8px;
            color: var(--note-text);
        }
    </style>
    <div class="note-box">
        <p style="margin: 0; font-size: 1em;">
            Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. DeepSeek V3 and R1 were excluded from rankings due to limited function support. Pricing for Gemini models shown reflects <a href="https://cloud.google.com/vertex-ai/generative-ai/pricing">Vertex AI</a>. Google AI Studio offers <a href="https://ai.google.dev/gemini-api/docs/pricing">Gemini API Access</a> at a lower cost with an API Key.
        </p>
    </div>
    <div class="dark-table-container">
        <table class="dark-styled-table">
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Type</th>
                    <th>Vendor</th>
                    <th>Cost (I/O)</th>
                    <th>Avg Category Score (TSQ)</th>
                </tr>
            </thead>
            <tbody>
"""

# Closing markup matching the elements opened in _LEADERBOARD_TABLE_HEAD.
_LEADERBOARD_TABLE_TAIL = """
            </tbody>
        </table>
    </div>
"""


def _render_leaderboard_row(row):
    """Return the ``<tr>`` markup for one ranked leaderboard row."""
    return f"""
                <tr>
                    <td>{get_rank_badge(row['Rank'])}</td>
                    <td class="model-cell">{row['Model']}</td>
                    <td>{get_type_badge(row['Model Type'])}</td>
                    <td class="vendor-cell">{row['Vendor']}</td>
                    <td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
                    <td class="score-cell">{get_score_bar(row['Category Score'])}</td>
                </tr>
    """


def filter_leaderboard(df, model_type, category, sort_by):
    """Filter, score, rank and render the leaderboard for the chosen filters.

    Args:
        df: DataFrame of model results. This function reads the columns
            "Model Type", "Model", "Vendor", "IO Cost",
            "Input cost per million token", "Output cost per million token"
            and the dataset columns listed for ``category`` in ``CATEGORIES``.
            The caller's frame is never modified.
        model_type: ``"All"`` to keep every row, otherwise the model type to
            keep. Surrounding whitespace is ignored on both sides of the
            comparison.
        category: Key into ``CATEGORIES``; unknown keys fall back to the
            single column "Model Avg".
        sort_by: ``"Performance"`` sorts by the category score, best first.
            Any other value sorts by "IO Cost", cheapest first.

    Returns:
        A ``(table_html, perf_chart, cost_chart)`` tuple: the styled HTML
        table as a string plus the two chart objects produced by
        ``get_performance_chart`` and ``get_performance_cost_chart``.
    """
    filtered_df = df.copy()

    if model_type != "All":
        # The column is compared stripped, so strip the requested value too;
        # otherwise a choice such as "Proprietary " could never match.
        wanted_type = model_type.strip() if isinstance(model_type, str) else model_type
        type_mask = filtered_df["Model Type"].str.strip() == wanted_type
        # Explicit copy so the column assignments below operate on an
        # independent frame rather than on the result of boolean indexing.
        filtered_df = filtered_df[type_mask].copy()

    dataset_columns = CATEGORIES.get(category, ["Model Avg"])
    filtered_df["Category Score"] = filtered_df[dataset_columns].mean(axis=1)

    if sort_by == "Performance":
        filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
    else:
        filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)

    # Rank follows the sort order chosen above (1 = first row shown).
    filtered_df["Rank"] = range(1, len(filtered_df) + 1)

    perf_chart = get_performance_chart(filtered_df, category)
    cost_chart = get_performance_cost_chart(filtered_df, category)

    # Build the document in one join instead of repeated string concatenation,
    # and close the <tbody>/<table>/<div> opened by the header markup.
    row_markup = [_render_leaderboard_row(row) for _, row in filtered_df.iterrows()]
    table_html = "".join([_LEADERBOARD_TABLE_HEAD, *row_markup, _LEADERBOARD_TABLE_TAIL])

    return table_html, perf_chart, cost_chart
def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
    """Build the "Leaderboard" tab and wire its filters to the table and charts.

    Must be called inside an active Gradio Blocks context. The ``CATEGORIES``
    and ``CARDS`` parameters shadow the module-level imports of the same
    names; the values passed in are the ones used here.

    Args:
        df: DataFrame of model results; its "Model Type" column supplies the
            model-type choices and it is passed to ``filter_leaderboard`` on
            every filter change.
        CATEGORIES: Mapping whose keys populate the "Category" dropdown.
        METHODOLOGY: HTML string rendered at the bottom of the tab.
        HEADER_CONTENT: HTML string rendered at the top of the tab.
        CARDS: HTML string appended directly after ``HEADER_CONTENT``.

    Returns:
        A ``(output, plot1, plot2)`` tuple: the HTML component holding the
        table and the two Plot components. They are created empty and are
        only filled by this function when a filter value changes.
    """
    # filter_leaderboard compares against the stripped "Model Type" column, so
    # offer stripped values: whitespace variants collapse into one choice and
    # every choice is guaranteed to match. Missing values are not offered.
    model_type_choices = ["All"] + (
        df["Model Type"].dropna().str.strip().unique().tolist()
    )

    category_names = list(CATEGORIES)
    # Avoid an IndexError when no categories are configured.
    default_category = category_names[0] if category_names else None

    def refresh(selected_type, selected_category, selected_sort):
        """Re-render the table and charts for the current filter values."""
        return filter_leaderboard(df, selected_type, selected_category, selected_sort)

    with gr.Tab("Leaderboard"):
        gr.HTML(HEADER_CONTENT + CARDS)
        gr.HTML(DESCRIPTION_HTML)

        # Filters row
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                model_type = gr.Dropdown(
                    choices=model_type_choices,
                    value="All",
                    label="Model Type",
                )
            with gr.Column(scale=1):
                category = gr.Dropdown(
                    choices=category_names,
                    value=default_category,
                    label="Category",
                )
            with gr.Column(scale=1):
                sort_by = gr.Radio(
                    choices=["Performance", "Cost"],
                    value="Performance",
                    label="Sort by",
                )

        # Content
        output = gr.HTML()
        plot1 = gr.Plot()
        plot2 = gr.Plot()

        gr.HTML(
            """<div class="note-box">
<p style="margin: 0; font-size: 1em;">
Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
</p>
</div>"""
        )
        gr.HTML(METHODOLOGY)

        # Any filter change re-renders all three outputs from the full set of
        # current filter values.
        filter_inputs = [model_type, category, sort_by]
        for input_comp in filter_inputs:
            input_comp.change(
                fn=refresh,
                inputs=filter_inputs,
                outputs=[output, plot1, plot2],
            )

    return output, plot1, plot2