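"""Gradio app for the TuRTLe model leaderboard.

Renders the leaderboard table, a params-vs-score bubble chart, and the
informational tabs, and wires the filter widgets to their update callbacks.
"""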
import gradio as gr
import pandas as pd
import plotly.express as px
from gradio.themes.utils import colors

from results.parse import parse_agg, read_data
from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
from style.css_html_js import custom_css
from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases


def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
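    """Return the leaderboard table for the current filter state.

    Filters the global results frame by task, benchmark, model type,
    search string, and parameter budget, then delegates to the
    benchmark-specific table builders.
    """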
    subset = df.copy()
    # Restrict to the selected task's benchmarks when "All" is chosen,
    # or to the single selected benchmark otherwise.
    task_benchmarks = {
        "Spec-to-RTL": s2r_benchs,
        "Code Completion": cc_benchs,
        "Line Completion": lc_benchs,
    }
    if benchmark == "All":
        if task in task_benchmarks:
            subset = subset[subset["Benchmark"].isin(task_benchmarks[task])]
    else:
        subset = subset[subset["Benchmark"] == benchmark]
    if model_type != "All":
        # The radio labels carry emojis; compare on the bare type name.
        subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
    if search_query:
        subset = subset[
            subset["Model"].str.contains(search_query, case=False, na=False)
        ]
    subset = subset[subset["Params"] <= float(max_params)]

    if benchmark == "All":
        if task == "Spec-to-RTL":
            return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
        elif task == "Code Completion":
            return filter_bench_all(subset, df_agg, agg_column="Agg MC")
        elif task == "Line Completion":
            return filter_RTLRepo(subset)
    elif benchmark == "RTL-Repo":
        return filter_RTLRepo(subset)
    else:
        agg_columns = {
            "VerilogEval S2R": "Agg VerilogEval S2R",
            "VerilogEval MC": "Agg VerilogEval MC",
            "RTLLM": "Agg RTLLM",
            "VeriGen": "Agg VeriGen",
        }
        return filter_bench(subset, df_agg, agg_columns.get(benchmark))


def update_benchmarks_by_task(task):
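    """Reset the benchmark radio to match the selected task and return
    the new benchmark choices together with the refiltered table."""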
    if task == "Spec-to-RTL":
        new_benchmarks = ["All"] + s2r_benchs
    elif task == "Code Completion":
        new_benchmarks = ["All"] + cc_benchs
    elif task == "Line Completion":
        new_benchmarks = lc_benchs
    else:
        new_benchmarks = ["All"] + benchmarks
    benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
    # NOTE: `.value` is each component's initial value, not its live state,
    # so the remaining filters are read at their defaults here.
    filtered = filter_leaderboard(
        task,
        benchmark_value,
        model_type_dropdown.value,
        search_box.value,
        params_slider.value,
    )
    return gr.update(value=benchmark_value, choices=new_benchmarks), filtered


def generate_scatter_plot(benchmark, metric):
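    """Build the params-vs-score bubble chart for one benchmark/metric pair."""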
    benchmark, metric = handle_special_cases(benchmark, metric)
    subset = df[df["Benchmark"] == benchmark]
    if benchmark == "RTL-Repo":
        subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
        detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
        detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
    else:
        detailed_scores = subset.pivot_table(
            index="Model", columns="Metric", values="Score"
        ).reset_index()
    details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
    scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
        subset=["Params", metric]
    )
    scatter_data["x"] = scatter_data["Params"]
    scatter_data["y"] = scatter_data[metric]
    # Sub-linear bubble sizing so the largest models do not dwarf the chart.
    scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40
    # Fallback colour column (currently unused: px.scatter colours by "Model Type").
    type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
    scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")
    y_axis_limits = {
        "Functionality (FNC)": [5, 90],
        "Syntax (STX)": [20, 100],
        "Synthesis (SYN)": [5, 90],
        "Power": [0, 50],
        "Performance": [0, 50],
        "Area": [0, 50],
        "Exact Matching (EM)": [0, 50],
    }
    y_range = y_axis_limits.get(metric, [0, 80])
    fig = px.scatter(
        scatter_data,
        x="x",
        y="y",
        log_x=True,
        size="size",
        color="Model Type",
        text="Model",
        hover_data={metric: ":.2f"},
        title=f"Params vs. {metric} for {benchmark}",
        labels={"x": "# Params (Log Scale)", "y": metric},
        template="plotly_white",
        height=600,
        width=1200,
    )
    fig.update_traces(
        textposition="top center",
        textfont_size=10,
        marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
    )
    fig.update_layout(
        xaxis=dict(
            showgrid=True,
            type="log",
            tickmode="array",
            tickvals=[8, 14, 32, 72, 200, 700],
            ticktext=["8", "14", "32", "72", "200", "700"],
        ),
        showlegend=False,
        yaxis=dict(range=y_range),
        margin=dict(l=50, r=50, t=50, b=50),
        plot_bgcolor="white",
    )
    return fig


# Force the light theme by rewriting the ?__theme query parameter on load.
js_func = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""

with gr.Blocks(
    css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)
) as app:
    df, benchmarks, metrics, default_metric = read_data()
    df_agg = parse_agg("./results/aggregated_scores.csv")
    tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
    s2r_benchs = ["VerilogEval S2R", "RTLLM"]
    cc_benchs = ["VerilogEval MC", "VeriGen"]
    lc_benchs = ["RTL-Repo"]
    non_rtl_metrics = [
        "Syntax (STX)",
        "Functionality (FNC)",
        "Synthesis (SYN)",
        "Power",
        "Performance",
        "Area",
    ]
    rtl_metrics = ["Exact Matching (EM)"]
    model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
    gr.HTML(
        """
        <div align="center">
            <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/>
        </div>
        """
    )
    gr.HTML(
        """
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
        <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
        <div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
            <a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
                <button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                    GitHub Repo
                </button>
            </a>
            <a href="http://arxiv.org/abs/2504.01986" target="_blank" style="text-decoration: none; margin-right: 10px;">
                <button style="background: #b31b1b; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                    arXiv Preprint
                </button>
            </a>
            <a href="mailto:[email protected]?subject=TuRTLe%20leaderboard%20new%20entry&body=Link%20to%20HuggingFace%20Model:" style="text-decoration: none;">
                <button style="background: #00674F; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                    How to submit
                </button>
            </a>
            <p style="margin-top: 15px;">If you have any inquiries or wish to collaborate:
                <a href="mailto:[email protected]">[email protected]</a>
            </p>
        </div>
        """
    )
    gr.HTML(
        """
        <div style="margin-top: -10px !important;">
            <p style="margin-bottom: 15px; text-align: start !important;">Welcome to the TuRTLe Model Leaderboard! TuRTLe is a <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b> for hardware design.
            Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b> (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight the strengths and weaknesses of available LLMs.
            Use the filters below to explore different RTL benchmarks and models.</p>
            <p style="margin-top: 15px; text-align: start !important;"><span style="font-variant: small-caps; font-weight: bold;">NEW UPDATE (JUNE 2025)</span>: Our framework is now open-source on GitHub, and we have added 7 new recent models, for a total of 40 base and instruct models evaluated across 5 RTL benchmarks.</p>
        </div>
        """
    )
    with gr.Tabs():
        with gr.Tab("Leaderboard"):
            with gr.Row(equal_height=True):
                with gr.Column():
                    task_radio = gr.Radio(
                        choices=tasks, label="Select Task", value="Spec-to-RTL"
                    )
                with gr.Column():
                    benchmark_radio = gr.Radio(
                        choices=["All"] + s2r_benchs,
                        label="Select Benchmark",
                        value="All",
                    )
            with gr.Row(equal_height=True):
                search_box = gr.Textbox(
                    label="Search Model",
                    placeholder="Type model name...",
                    scale=2,
                )
                model_type_dropdown = gr.Radio(
                    choices=model_types,
                    label="Select Model Type",
                    value="All",
                    scale=3,
                )
                params_slider = gr.Slider(
                    minimum=df["Params"].min(),
                    maximum=700,
                    value=700,
                    label="Max Params",
                    step=1,
                    scale=2,
                )
            leaderboard = gr.DataFrame(
                value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
                headers="first row",
                show_row_numbers=True,
                wrap=True,
                datatype=["markdown", "html"],
                interactive=False,
                column_widths=["7%", "24%", "17%", "10%", "13%", "10%", "14%"],
                elem_classes="dataframe-leaderboard",
            )
with gr.Tab("Plot View"): | |
with gr.Row(equal_height=True): | |
default_benchmark = s2r_benchs[0] | |
bubble_benchmark = gr.Dropdown( | |
choices=benchmarks, | |
label="Select Benchmark", | |
value=default_benchmark, | |
elem_classes="gr-dropdown", | |
) | |
default_metric = non_rtl_metrics[0] | |
bubble_metric = gr.Dropdown( | |
choices=non_rtl_metrics, | |
label="Select Metric", | |
value=default_metric, | |
) | |
with gr.Row(equal_height=True): | |
scatter_plot = gr.Plot( | |
value=generate_scatter_plot(default_benchmark, default_metric), | |
label="Bubble Chart", | |
elem_id="full-width-plot", | |
) | |
with gr.Tab("Metrics Information"): | |
with open("./static/metrics.md", "r") as file: | |
gr.Markdown( | |
file.read(), | |
latex_delimiters=[ | |
{"left": "$$", "right": "$$", "display": True}, | |
{"left": "$", "right": "$", "display": False}, | |
], | |
elem_classes="metrics-page", | |
) | |
with gr.Tab("About Us"): | |
gr.HTML( | |
""" | |
<div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;"> | |
<div style="display: flex; justify-content: center; align-items: center; gap: 5%; margin-bottom: 20px;"> | |
<img src='/gradio_api/file=hpai_logo_grad.png' alt='HPAI Group Logo' style="width: 45%;"/> | |
<img src='/gradio_api/file=bsc-logo.png' alt='BSC Logo' style="width: 25%;"/> | |
</div> | |
<p style="font-size: 16px; text-align: start;"> | |
The <b>High-Performance Artificial Intelligence (HPAI)</b> group is part of the | |
<a href="https://bsc.es/" target="_blank">Barcelona Supercomputing Center (BSC)</a>. | |
This leaderboard is maintained by HPAI as part of our commitment to <b>open science</b>. | |
</p> | |
<ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;"> | |
<li><a href="https://hpai.bsc.es/" target="_blank">HPAI Website</a></li> | |
<li><a href="https://github.com/HPAI-BSC/" target="_blank">HPAI GitHub Organization Page</a></li> | |
<li><a href="https://huggingface.co/HPAI-BSC/" target="_blank">HPAI Hugging Face Organization Page</a></li> | |
</ul> | |
<p style="font-size: 16px; margin-top: 15px;"> | |
Feel free to contact us: | |
</p> | |
<p style="font-size: 16px;">Email: <a href="mailto:[email protected]"><b>[email protected]</b></a></p> | |
</div> | |
""" | |
) | |
with gr.Tab("References"): | |
gr.HTML( | |
""" | |
<div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;"> | |
<ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;"> | |
<li><a href="https://github.com/bigcode-project/bigcode-evaluation-harness" target="_blank">Code Generation LM Evaluation Harness</a></li> | |
<li>RTL-Repo: Allam and M. Shalan, βRtl-repo: A benchmark for evaluating llms on large-scale rtl design projects,β in 2024 IEEE LLM Aided Design Workshop (LAD). IEEE, 2024, pp. 1β5.</li> | |
<li>VeriGen: S. Thakur, B. Ahmad, H. Pearce, B. Tan, B. Dolan-Gavitt, R. Karri, and S. Garg, βVerigen: A large language model for verilog code generation,β ACM Transactions on Design Automation of Electronic Systems, vol. 29, no. 3, pp. 1β31, 2024. </li> | |
<li>VerilogEval (I): M. Liu, N. Pinckney, B. Khailany, and H. Ren, βVerilogeval: Evaluating large language models for verilog code generation,β in 2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 2023, pp. 1β8.</li> | |
<li>VerilogEval (II): N. Pinckney, C. Batten, M. Liu, H. Ren, and B. Khailany, βRevisiting VerilogEval: A Year of Improvements in Large-Language Models for Hardware Code Generation,β ACM Trans. Des. Autom. Electron. Syst., feb 2025. https://doi.org/10.1145/3718088</li> | |
<li>RTLLM: Y. Lu, S. Liu, Q. Zhang, and Z. Xie, βRtllm: An open-source benchmark for design rtl generation with large language model,β in 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC). IEEE, 2024, pp. 722β727.</li> | |
</ul> | |
<p style="font-size: 16px; margin-top: 15px;"> | |
Feel free to contact us: | |
</p> | |
</div> | |
""" | |
) | |
    with gr.Row():
        with gr.Accordion("📄 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )
    # Event wiring: each filter control re-runs filter_leaderboard on change.
    task_radio.change(
        fn=update_benchmarks_by_task,
        inputs=[task_radio],
        outputs=[benchmark_radio, leaderboard],
    )
    filter_inputs = [
        task_radio,
        benchmark_radio,
        model_type_dropdown,
        search_box,
        params_slider,
    ]
    for control in (benchmark_radio, model_type_dropdown, search_box, params_slider):
        control.change(fn=filter_leaderboard, inputs=filter_inputs, outputs=leaderboard)

    def on_benchmark_change(benchmark, _):
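        """Swap the metric choices when the benchmark changes and redraw
        the plot (RTL-Repo only reports Exact Matching)."""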
if benchmark == "RTL-Repo": | |
metric = "Exact Matching (EM)" | |
return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot( | |
benchmark, metric | |
) | |
else: | |
metric = non_rtl_metrics[0] | |
return gr.update( | |
choices=non_rtl_metrics[:-1], value=metric | |
), generate_scatter_plot(benchmark, metric) | |

    def on_metric_change(benchmark, metric):
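        """Redraw the plot for the new metric, normalising invalid
        benchmark/metric combinations via handle_special_cases."""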
        benchmark, metric = handle_special_cases(benchmark, metric)
        fig = generate_scatter_plot(benchmark, metric)
        return gr.update(value=benchmark), fig
    # Shared JS shim: preserve the scroll position while the plot re-renders.
    keep_scroll_js = """
    (benchmark, metric) => {
        let scrollY = window.scrollY;
        const observer = new MutationObserver(() => {
            window.scrollTo(0, scrollY);
            observer.disconnect();
        });
        observer.observe(document.getElementById('full-width-plot'), { childList: true });
        return [benchmark, metric];
    }
    """
    bubble_benchmark.change(
        fn=on_benchmark_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_metric, scatter_plot],
        js=keep_scroll_js,
    )
    bubble_metric.change(
        fn=on_metric_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_benchmark, scatter_plot],
        js=keep_scroll_js,
    )

app.launch(
    allowed_paths=[
        "logo.png",
        "hpai_logo_grad.png",
        "bsc-logo.png",
    ]
)