from pathlib import Path

import altair as alt
import gradio as gr
import polars as pl

DATASETS: list[str] = []
BENCHMARKS = {
    # Name: (device, AMP, compile, single thread)
    "Parameters": (None, None, None, None),
    "GPU Memory": (None, None, None, None),
    "CPU rate": ("cpu", False, False, False),
    "CPU rate single core": ("cpu", False, False, True),
    "CPU rate with compile": ("cpu", False, True, False),
    "CPU rate AMP with compile": ("cpu", True, True, False),
    "CUDA rate": ("cuda", False, False, False),
    "CUDA rate with compile": ("cuda", False, True, False),
    "CUDA rate AMP with compile": ("cuda", True, True, False),
}
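# Note: for the rate benchmarks, each tuple is matched against the "device", "amp",
# "compile" and "single_thread" columns of the per-dataset results CSV inside
# update_data(). The all-None entries ("Parameters", "GPU Memory") are handled by
# dedicated branches and never reach that filter.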
def plot_acc_param(param_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
    """Scatter plot of accuracy vs. parameter count, with model name labels and a Pareto frontier step line"""

    df = param_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "Parameters (M)",
        "Pareto frontier (p)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="Parameters (M)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Parameters (M)", y="Pareto frontier (p)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )
    chart = base + text + frontier

    return chart.properties(title="Accuracy vs Parameter Count", width=width, height=height).configure_scale(zero=False)
def plot_acc_memory(memory_compare_results_df: pl.DataFrame, width: int = 900, height: int = 640) -> alt.LayerChart:
    """Scatter plot of accuracy vs. peak GPU memory, with model name labels and a Pareto frontier step line"""

    if len(memory_compare_results_df) > 0:
        batch_size = memory_compare_results_df["max_batch_size"][0]
        amp = memory_compare_results_df["amp"][0]
    else:
        batch_size = ""
        amp = ""

    df = memory_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "Peak GPU memory (MB)",
        "Parameters (M)",
        "Pareto frontier (mem)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="Peak GPU memory (MB)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "Peak GPU memory (MB)",
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Peak GPU memory (MB)", y="Pareto frontier (mem)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )
    chart = base + text + frontier

    return chart.properties(
        title=f"Accuracy vs GPU Memory (batch size={batch_size}, amp={amp})", width=width, height=height
    ).configure_scale(zero=False)
def plot_acc_rate(rate_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
    """Scatter plot of accuracy vs. inference time per sample, with model name labels and a Pareto frontier step line"""

    if len(rate_compare_results_df) > 0:
        device = rate_compare_results_df["device"][0]
        compiled = rate_compare_results_df["compile"][0]
        batch_size = rate_compare_results_df["max_batch_size"][0]
        amp = rate_compare_results_df["amp"][0]
        single_thread = rate_compare_results_df["single_thread"][0]
    else:
        device = ""
        compiled = ""
        batch_size = ""
        amp = ""
        single_thread = False

    df = rate_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "ms / sample",
        "Parameters (M)",
        "Pareto frontier (ms)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="ms / sample",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "ms / sample",
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="ms / sample", y="Pareto frontier (ms)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )
    chart = base + text + frontier

    if single_thread is True:
        single_thread_title = " Single Core"
    else:
        single_thread_title = ""

    return chart.properties(
        title=(
            f"Accuracy vs {device.upper()}{single_thread_title} Rate (compile={compiled}, "
            f"batch size={batch_size}, amp={amp})"
        ),
        width=width,
        height=height,
    ).configure_scale(zero=False)
def update_data(
    dataset: str, benchmark: str, intermediate: bool, mim: bool, dist: bool, log_x: bool, search_bar: str
) -> tuple[alt.LayerChart, pl.DataFrame]:
    """Load the selected dataset results and build the chart and table for the chosen benchmark and filters"""

    compare_results_df = pl.read_csv(f"results_{dataset}.csv")
    if intermediate is False:
        compare_results_df = compare_results_df.filter(pl.col("Intermediate") == intermediate)
    if mim is False:
        compare_results_df = compare_results_df.filter(pl.col("MIM") == mim)
    if dist is False:
        compare_results_df = compare_results_df.filter(pl.col("Distilled") == dist)

    x_scale_type = "log" if log_x is True else "linear"

    # Filter models by name (the search box is treated as a regular expression)
    compare_results_df = compare_results_df.filter(pl.col("Model name").str.contains(search_bar))

    # Parameter count
    if benchmark == "Parameters":
        param_compare_results_df = compare_results_df.unique(subset=["Model name", "Resolution"]).sort(
            "Parameters (M)", descending=False
        )
        # Rows are sorted by parameter count, so the running maximum of accuracy traces the Pareto frontier
        param_compare_results_df = param_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (p)")
        )
        param_compare_results_df = param_compare_results_df.drop(
            "Samples / sec", "device", "ms / sample", "Peak GPU memory (MB)"
        )
        chart = plot_acc_param(param_compare_results_df)
        x_max = param_compare_results_df["Parameters (M)"].quantile(0.9)
        x_min = param_compare_results_df["Parameters (M)"].quantile(0.1)
        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = param_compare_results_df

    # Peak memory
    elif benchmark == "GPU Memory":
        memory_compare_results_df = compare_results_df.drop_nulls(subset=["Peak GPU memory (MB)"])
        memory_compare_results_df = memory_compare_results_df.unique(subset=["Model name", "Resolution"]).sort(
            "Peak GPU memory (MB)", descending=False
        )
        memory_compare_results_df = memory_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (mem)")
        )
        memory_compare_results_df = memory_compare_results_df.drop("Samples / sec", "device", "ms / sample")
        chart = plot_acc_memory(memory_compare_results_df)
        x_max = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.9)
        x_min = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.1)
        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = memory_compare_results_df

    # Rate
    else:
        (device, amp_enabled, compiled, single_thread) = BENCHMARKS[benchmark]
        df = compare_results_df.drop_nulls(subset=["ms / sample"])
        df = df.filter(device=device, amp=amp_enabled, compile=compiled, single_thread=single_thread)
        device_compare_results_df = df.unique(subset=["Model name", "Resolution"]).sort("ms / sample", descending=False)
        device_compare_results_df = device_compare_results_df.drop("Peak GPU memory (MB)")
        device_compare_results_df = device_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (ms)")
        )
        chart = plot_acc_rate(device_compare_results_df)
        x_max = device_compare_results_df["ms / sample"].quantile(0.95)
        x_min = device_compare_results_df["ms / sample"].min()
        if x_max is not None and x_min is not None:
            x_max = x_max * 1.04
            x_min = x_min * 0.96

        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = device_compare_results_df

    # Round float columns for display
    output_df = output_df.select(
        [
            pl.col(col).round(4) if output_df.schema[col] in [pl.Float32, pl.Float64] else col
            for col in output_df.columns
        ]
    )

    return (chart, output_df.drop("Mistakes", "Samples", "torch_version"))
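# Illustrative only (not part of the original source): with a results_il-common.csv present,
# the view the UI requests on load corresponds roughly to
#   chart, table_df = update_data("il-common", "Parameters", True, True, True, False, "")
# where the empty search string matches every model name.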
def app() -> None:
    """Define and launch the Gradio leaderboard UI"""

    with gr.Blocks(title="Birder Leaderboard", analytics_enabled=False) as leaderboard:
        gr.HTML("<center><h1>The Birder Leaderboard</h1></center>")
        with gr.Row():
            with gr.Column():
                pass
            with gr.Column():
                gr.Markdown(
                    """
                    Leaderboard of all the pre-trained Birder models across multiple datasets.

                    ### Benchmark Setup

                    * GPU: A5000 ADA Generation
                    * CPU: AMD Ryzen Threadripper PRO 7975WX
                    * PyTorch version: 2.7.1+cu128

                    ### Dataset Information

                    | Name              | Training samples | Validation samples | Classes |
                    |-------------------|------------------|--------------------|---------|
                    | arabian-peninsula | 583,868          | 21,634             | 735     |
                    | eu-common         | 569,784          | 19,869             | 707     |
                    | il-all            | 462,430          | 18,621             | 550     |
                    | il-common         | 330,880          | 15,828             | 371     |
                    """
                )
            with gr.Column():
                pass

        with gr.Row():
            with gr.Column():
                pass
            with gr.Column():
                dataset_dropdown = gr.Dropdown(
                    choices=DATASETS,
                    label="Select Dataset",
                    value=DATASETS[0] if DATASETS else None,
                )
                benchmark_dropdown = gr.Dropdown(
                    choices=list(BENCHMARKS.keys()),
                    label="Select Benchmark",
                    value=next(iter(BENCHMARKS.keys())) if BENCHMARKS else None,
                    filterable=False,
                )
            with gr.Column():
                intermediate = gr.Checkbox(
                    label="Intermediate",
                    value=True,
                    info="Show models that underwent intermediate training (extra data)",
                )
                mim = gr.Checkbox(label="MIM", value=True, info="Show models with Masked Image Modeling pre-training")
                dist = gr.Checkbox(label="Distilled", value=True, info="Show distilled models")
                log_x = gr.Checkbox(label="Log scale X-axis", value=False)
            with gr.Column():
                pass

        with gr.Row():
            with gr.Column():
                pass
            with gr.Column(scale=2):
                search_bar = gr.Textbox(label="Model Filter", placeholder="e.g. convnext, efficient|mobile")
            with gr.Column():
                pass

        plot = gr.Plot(container=False)
        table = gr.Dataframe(show_search="search")

        inputs = [dataset_dropdown, benchmark_dropdown, intermediate, mim, dist, log_x, search_bar]
        outputs = [plot, table]

        # Refresh the chart and table on load and whenever any control changes
        leaderboard.load(update_data, inputs=inputs, outputs=outputs)
        dataset_dropdown.change(update_data, inputs=inputs, outputs=outputs)
        benchmark_dropdown.change(update_data, inputs=inputs, outputs=outputs)
        intermediate.change(update_data, inputs=inputs, outputs=outputs)
        mim.change(update_data, inputs=inputs, outputs=outputs)
        dist.change(update_data, inputs=inputs, outputs=outputs)
        log_x.change(update_data, inputs=inputs, outputs=outputs)
        search_bar.change(update_data, inputs=inputs, outputs=outputs)

    leaderboard.launch()
# Launch the app
if __name__ == "__main__":
    file_info = []
    for p in Path(".").glob("results_*.csv"):
        file_info.append((p.stat().st_size, p.stem.removeprefix("results_")))

    DATASETS = [dataset_name for _, dataset_name in sorted(file_info, key=lambda x: x[1], reverse=True)]
    app()
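# Running locally (assumption, not part of the original source): the startup block above scans
# the working directory for results_<dataset>.csv files, so place at least one such file next
# to this script and, assuming it is saved as app.py, start it with:
#   python app.py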