Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

App Files Files Community

agent-leaderboard / tabs /leaderboard.py

pratikbhavsar

added sonnet and improved data explorer

f226f06 4 months ago

raw

history blame

6.83 kB

	import gradio as gr

	from data_loader import CATEGORIES, DESCRIPTION_HTML, CARDS
	from visualization import (
	get_performance_chart,
	get_performance_cost_chart,
	)
	from utils import (
	get_rank_badge,
	get_score_bar,
	get_type_badge,
	)

	# Static stylesheet, pricing note and table header emitted before the rows.
	# This is a plain (non-f) string because it has no placeholders, so the CSS
	# braces need no escaping and the text is built once at import time.
	_TABLE_PREFIX_HTML = """
	<style>
	@media (prefers-color-scheme: dark) {
	:root {
	--bg-color: #1a1b1e;
	--text-color: #ffffff;
	--border-color: #2d2e32;
	--hover-bg: #2d2e32;
	--note-bg: #2d2e32;
	--note-text: #a1a1aa;
	--accent-blue: #60A5FA;
	--accent-purple: #A78BFA;
	--accent-pink: #F472B6;
	--score-bg: rgba(255, 255, 255, 0.1);
	}
	}

	@media (prefers-color-scheme: light) {
	:root {
	--bg-color: #ffffff;
	--text-color: #000000;
	--border-color: #e5e7eb;
	--hover-bg: #f3f4f6;
	--note-bg: #f3f4f6;
	--note-text: #4b5563;
	--accent-blue: #3B82F6;
	--accent-purple: #8B5CF6;
	--accent-pink: #EC4899;
	--score-bg: rgba(0, 0, 0, 0.1);
	}
	}

	.dark-table-container {
	background: var(--bg-color);
	border-radius: 12px;
	padding: 1px;
	margin: 20px 0;
	}

	.dark-styled-table {
	width: 100%;
	border-collapse: collapse;
	font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
	background: var(--bg-color);
	color: var(--text-color);
	}

	.dark-styled-table thead {
	position: sticky;
	top: 0;
	background: var(--bg-color);
	z-index: 1;
	}

	.dark-styled-table th {
	padding: 16px;
	text-align: left;
	font-weight: 500;
	color: var(--text-color);
	border-bottom: 1px solid var(--border-color);
	}

	.dark-styled-table td {
	padding: 16px;
	border-bottom: 1px solid var(--border-color);
	color: var(--text-color);
	}

	.dark-styled-table tbody tr:hover {
	background: var(--hover-bg);
	}

	.model-cell {
	font-weight: 500;
	}

	.score-cell {
	font-weight: 500;
	}

	.note-box {
	margin-top: 20px;
	padding: 16px;
	background: var(--note-bg);
	border-radius: 8px;
	color: var(--note-text);
	}
	</style>

	<div class="note-box">
	<p style="margin: 0; font-size: 1em;">
	Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. DeepSeek V3 and R1 were excluded from rankings due to limited function support. Pricing for Gemini models shown reflects <a href="https://cloud.google.com/vertex-ai/generative-ai/pricing">Vertex AI</a>. Google AI Studio offers <a href="https://ai.google.dev/gemini-api/docs/pricing">Gemini API Access</a> at a lower cost with an API Key.

	</p>
	</div>

	<div class="dark-table-container">
	<table class="dark-styled-table">
	<thead>
	<tr>
	<th>Rank</th>
	<th>Model</th>
	<th>Type</th>
	<th>Vendor</th>
	<th>Cost (I/O)</th>
	<th>Avg Category Score (TSQ)</th>
	</tr>
	</thead>
	<tbody>
	"""

	# Closes the <tbody>, <table> and container <div> opened by the prefix.
	_TABLE_SUFFIX_HTML = """
	</tbody>
	</table>
	</div>
	"""


	def _render_leaderboard_row(row):
	    """Return the ``<tr>`` markup for one ranked leaderboard row."""
	    # NOTE(review): "Model" and "Vendor" are interpolated without HTML
	    # escaping; that is fine only if the frame holds trusted data -- confirm.
	    return f"""
	<tr>
	<td>{get_rank_badge(row['Rank'])}</td>
	<td class="model-cell">{row['Model']}</td>
	<td>{get_type_badge(row['Model Type'])}</td>
	<td class="vendor-cell">{row['Vendor']}</td>
	<td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
	<td class="score-cell">{get_score_bar(row['Category Score'])}</td>
	</tr>
	"""


	def filter_leaderboard(df, model_type, category, sort_by):
	    """Filter, score, rank and render the leaderboard.

	    Args:
	        df: Source frame. The code reads the columns "Model Type", "Model",
	            "Vendor", "IO Cost", "Input cost per million token",
	            "Output cost per million token" and the score columns listed for
	            ``category``. The frame itself is never mutated.
	        model_type: "All" to keep every row, otherwise the model type to
	            keep. Surrounding whitespace is ignored on both sides of the
	            comparison.
	        category: Key into the module-level ``CATEGORIES`` mapping; unknown
	            keys fall back to the single column "Model Avg".
	        sort_by: "Performance" sorts by the category score, descending; any
	            other value sorts by "IO Cost", ascending.

	    Returns:
	        A ``(table_html, perf_chart, cost_chart)`` tuple: the styled HTML
	        table plus the results of ``get_performance_chart`` and
	        ``get_performance_cost_chart`` for the ranked frame.
	    """
	    if model_type == "All":
	        filtered_df = df.copy()
	    else:
	        # Normalise the requested type the same way as the column, so a
	        # selection with stray whitespace still matches its rows.
	        wanted_type = model_type.strip() if isinstance(model_type, str) else model_type
	        type_mask = df["Model Type"].str.strip() == wanted_type
	        # Copy after filtering so the column assignments below never write
	        # into a view of the caller's frame.
	        filtered_df = df[type_mask].copy()

	    dataset_columns = CATEGORIES.get(category, ["Model Avg"])
	    filtered_df["Category Score"] = filtered_df[dataset_columns].mean(axis=1)

	    if sort_by == "Performance":
	        filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
	    else:
	        filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)

	    # Rank follows the order produced by the sort above.
	    filtered_df["Rank"] = range(1, len(filtered_df) + 1)
	    perf_chart = get_performance_chart(filtered_df, category)
	    cost_chart = get_performance_cost_chart(filtered_df, category)

	    rows_html = "".join(
	        _render_leaderboard_row(row) for _, row in filtered_df.iterrows()
	    )
	    table_html = _TABLE_PREFIX_HTML + rows_html + _TABLE_SUFFIX_HTML

	    return table_html, perf_chart, cost_chart


	def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
	    """Build the "Leaderboard" tab and wire its filters to the table and plots.

	    Must be called inside an active Gradio Blocks context. The output
	    components start empty; they are filled only when one of the filters
	    changes, so the caller is expected to trigger the first render if one is
	    wanted.

	    Args:
	        df: Leaderboard frame; its "Model Type" column supplies the
	            model-type choices and it is passed to ``filter_leaderboard``
	            on every filter change.
	        CATEGORIES: Mapping whose keys populate the category dropdown.
	            Note that ``filter_leaderboard`` reads the module-level
	            ``CATEGORIES`` rather than this argument.
	        METHODOLOGY: HTML rendered at the bottom of the tab.
	        HEADER_CONTENT: HTML rendered at the top of the tab.
	        CARDS: HTML appended directly after ``HEADER_CONTENT``.

	    Returns:
	        A ``(output, plot1, plot2)`` tuple: the HTML table component and the
	        two plot components.
	    """
	    # filter_leaderboard compares against the whitespace-stripped column, so
	    # offer stripped (and NaN-free) values; a raw value with padding would
	    # otherwise filter to an empty table.
	    model_type_choices = ["All"] + (
	        df["Model Type"].dropna().str.strip().unique().tolist()
	    )
	    category_choices = list(CATEGORIES.keys())
	    # Avoid an IndexError when no categories are configured.
	    default_category = category_choices[0] if category_choices else None

	    def refresh(selected_type, selected_category, selected_sort):
	        """Re-render the table and both charts for the current filters."""
	        return filter_leaderboard(df, selected_type, selected_category, selected_sort)

	    with gr.Tab("Leaderboard"):
	        gr.HTML(HEADER_CONTENT + CARDS)
	        gr.HTML(DESCRIPTION_HTML)

	        # Filters row
	        with gr.Row(equal_height=True):
	            with gr.Column(scale=1):
	                model_type = gr.Dropdown(
	                    choices=model_type_choices,
	                    value="All",
	                    label="Model Type",
	                )
	            with gr.Column(scale=1):
	                category = gr.Dropdown(
	                    choices=category_choices,
	                    value=default_category,
	                    label="Category",
	                )
	            with gr.Column(scale=1):
	                sort_by = gr.Radio(
	                    choices=["Performance", "Cost"],
	                    value="Performance",
	                    label="Sort by",
	                )

	        # Content
	        output = gr.HTML()
	        plot1 = gr.Plot()
	        plot2 = gr.Plot()

	        gr.HTML(
	            """<div class="note-box">
	<p style="margin: 0; font-size: 1em;">
	Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
	</p>
	</div>"""
	        )

	        gr.HTML(METHODOLOGY)

	        # Any filter change re-renders everything from the full filter state.
	        filter_inputs = [model_type, category, sort_by]
	        for input_comp in filter_inputs:
	            input_comp.change(
	                fn=refresh,
	                inputs=filter_inputs,
	                outputs=[output, plot1, plot2],
	            )

	    return output, plot1, plot2