Commit 711a69b
Parent(s): 2c02057
Re-organize code
Files changed:
- app.py +263 -127
- aggregated_scores.csv → results/aggregated_scores.csv +0 -0
- parse.py → results/parse.py +115 -34
- results.csv → results/results.csv +0 -0
- results.json → results/results.json +0 -0
- about.py → static/about.py +0 -0
- metrics.md → static/metrics.md +0 -0
- css_html_js.py → style/css_html_js.py +0 -0
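The re-organization groups the parsing code and score files under results/, the static page content under static/, and the styling under style/. A minimal sketch of how app.py consumes the new tree (module and file paths are taken from the diff below; the assumption is that the Space root stays the working directory, so the sub-folders resolve as plain import packages):

from results.parse import parse_agg, read_data                          # was ./parse.py
from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT    # was ./about.py
from style.css_html_js import custom_css                                # was ./css_html_js.py

df, benchmarks, metrics, default_metric = read_data()        # loads results/results.json
df_agg = parse_agg("./results/aggregated_scores.csv")        # loads results/aggregated_scores.csv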
app.py
CHANGED
@@ -1,71 +1,69 @@
import sys

import gradio as gr
import pandas as pd
import plotly.express as px
from gradio.themes.utils import colors

from results.parse import parse_agg, read_data
from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
from style.css_html_js import custom_css
from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases


def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
    subset = df.copy()

    # Filter by task specific benchmarks when 'All' benchmarks is selected
    if task == "Spec-to-RTL":
        valid_benchmarks = s2r_benchs
        if benchmark == "All":
            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
    elif task == "Code Completion":
        valid_benchmarks = cc_benchs
        if benchmark == "All":
            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
    elif task == "Line Completion":
        valid_benchmarks = lc_benchs
        if benchmark == "All":
            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]

    if benchmark != "All":
        subset = df[df["Benchmark"] == benchmark]

    if model_type != "All":
        # without emojis
        subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
    if search_query:
        subset = subset[
            subset["Model"].str.contains(search_query, case=False, na=False)
        ]
    max_params = float(max_params)
    subset = subset[subset["Params"] <= max_params]

    if benchmark == "All":
        if task == "Spec-to-RTL":
            return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
        elif task == "Code Completion":
            return filter_bench_all(subset, df_agg, agg_column="Agg MC")
        elif task == "Line Completion":
            return filter_RTLRepo(subset)
    elif benchmark == "RTL-Repo":
        return filter_RTLRepo(subset)
    else:
        agg_column = None
        if benchmark == "VerilogEval S2R":
            agg_column = "Agg VerilogEval S2R"
        elif benchmark == "VerilogEval MC":
            agg_column = "Agg VerilogEval MC"
        elif benchmark == "RTLLM":
            agg_column = "Agg RTLLM"
        elif benchmark == "VeriGen":
            agg_column = "Agg VeriGen"

        return filter_bench(subset, df_agg, agg_column)


def update_benchmarks_by_task(task):
    if task == "Spec-to-RTL":
        new_benchmarks = ["All"] + s2r_benchs
@@ -76,59 +74,90 @@ def update_benchmarks_by_task(task):
    else:
        new_benchmarks = ["All"] + benchmarks
    benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
    filtered = filter_leaderboard(
        task,
        benchmark_value,
        model_type_dropdown.value,
        search_box.value,
        params_slider.value,
    )
    return gr.update(value=benchmark_value, choices=new_benchmarks), filtered


def generate_scatter_plot(benchmark, metric):
    benchmark, metric = handle_special_cases(benchmark, metric)

    subset = df[df["Benchmark"] == benchmark]
    if benchmark == "RTL-Repo":
        subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
        detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
        detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
    else:
        detailed_scores = subset.pivot_table(
            index="Model", columns="Metric", values="Score"
        ).reset_index()

    details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
    scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
        subset=["Params", metric]
    )

    scatter_data["x"] = scatter_data["Params"]
    scatter_data["y"] = scatter_data[metric]
    scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40

    type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
    scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")

    y_axis_limits = {
        "Functionality (FNC)": [5, 90],
        "Syntax (STX)": [20, 100],
        "Synthesis (SYN)": [5, 90],
        "Power": [0, 50],
        "Performance": [0, 50],
        "Area": [0, 50],
        "Exact Matching (EM)": [0, 50],
    }
    y_range = y_axis_limits.get(metric, [0, 80])

    fig = px.scatter(
        scatter_data,
        x="x",
        y="y",
        log_x=True,
        size="size",
        color="Model Type",
        text="Model",
        hover_data={metric: ":.2f"},
        title=f"Params vs. {metric} for {benchmark}",
        labels={"x": "# Params (Log Scale)", "y": metric},
        template="plotly_white",
        height=600,
        width=1200,
    )

    fig.update_traces(
        textposition="top center",
        textfont_size=10,
        marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
    )
    fig.update_layout(
        xaxis=dict(
            showgrid=True,
            type="log",
            tickmode="array",
            tickvals=[8, 14, 32, 72, 200, 700],
            ticktext=["8", "14", "32", "72", "200", "700"],
        ),
        showlegend=False,
        yaxis=dict(range=y_range),
        margin=dict(l=50, r=50, t=50, b=50),
        plot_bgcolor="white",
    )

    return fig


js_func = """
function refresh() {
    const url = new URL(window.location);
@@ -139,24 +168,36 @@ function refresh() {
    }
}
"""

with gr.Blocks(
    css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)
) as app:
    df, benchmarks, metrics, default_metric = read_data()
    df_agg = parse_agg("./results/aggregated_scores.csv")
    tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
    s2r_benchs = ["VerilogEval S2R", "RTLLM"]
    cc_benchs = ["VerilogEval MC", "VeriGen"]
    lc_benchs = ["RTL-Repo"]
    non_rtl_metrics = [
        "Syntax (STX)",
        "Functionality (FNC)",
        "Synthesis (SYN)",
        "Power",
        "Performance",
        "Area",
    ]
    rtl_metrics = ["Exact Matching (EM)"]
    model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]

    gr.HTML(
        """
        <p align="center" style="margin-bottom: -10px;">
            <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/> <br/>
        </p>
        """
    )
    gr.HTML(
        """
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
        <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
        <div style="text-align: center; margin-bottom: 15px;">
@@ -184,60 +225,99 @@
            <a href="mailto:[email protected]">[email protected]</a>
        </p>
        </div>
        """
    )
    with gr.Tabs():
        with gr.Tab("Leaderboard"):
            with gr.Row(equal_height=True):
                with gr.Column():
                    task_radio = gr.Radio(
                        choices=tasks, label="Select Task", value="Spec-to-RTL"
                    )
                with gr.Column():
                    benchmark_radio = gr.Radio(
                        choices=["All"] + s2r_benchs,
                        label="Select Benchmark",
                        value="All",
                    )

            with gr.Row(equal_height=True):
                search_box = gr.Textbox(
                    label="Search Model",
                    placeholder="Type model name...",
                    scale=2,
                )
                model_type_dropdown = gr.Radio(
                    choices=model_types,
                    label="Select Model Type",
                    value="All",
                    scale=3,
                )
                params_slider = gr.Slider(
                    minimum=df["Params"].min(),
                    maximum=700,
                    value=700,
                    label="Max Params",
                    step=1,
                    scale=2,
                )

            leaderboard = gr.DataFrame(
                value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
                headers="first row",
                show_row_numbers=True,
                wrap=True,
                datatype=[
                    "markdown",
                    "html",
                ],
                interactive=False,
                column_widths=[
                    "7%",
                    "25%",
                    "10%",
                    "17%",
                    "6%",
                    "6%",
                    "6%",
                    "6%",
                    "6%",
                    "7%",
                ],
            )

        with gr.Tab("Plot View"):
            with gr.Row(equal_height=True):
                default_benchmark = s2r_benchs[0]
                bubble_benchmark = gr.Dropdown(
                    choices=benchmarks,
                    label="Select Benchmark",
                    value=default_benchmark,
                    elem_classes="gr-dropdown",
                )
                default_metric = non_rtl_metrics[0]
                bubble_metric = gr.Dropdown(
                    choices=non_rtl_metrics,
                    label="Select Metric",
                    value=default_metric,
                )
            with gr.Row(equal_height=True):
                scatter_plot = gr.Plot(
                    value=generate_scatter_plot(default_benchmark, default_metric),
                    label="Bubble Chart",
                    elem_id="full-width-plot",
                )

        with gr.Tab("Metrics Information"):
            with open("./static/metrics.md", "r") as file:
                gr.Markdown(
                    file.read(),
                    latex_delimiters=[
                        {"left": "$$", "right": "$$", "display": True},
                        {"left": "$", "right": "$", "display": False},
                    ],
                    elem_classes="metrics-page",
                )
        with gr.Tab("About Us"):
            gr.HTML(
                """
@@ -267,7 +347,7 @@
                </div>
                """
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
@@ -277,21 +357,69 @@
                elem_id="citation-button",
                show_copy_button=True,
            )

    # event handlers, ugly way but it works
    task_radio.change(
        fn=update_benchmarks_by_task,
        inputs=[task_radio],
        outputs=[benchmark_radio, leaderboard],
    )
    benchmark_radio.change(
        fn=filter_leaderboard,
        inputs=[
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
        ],
        outputs=leaderboard,
    )
    model_type_dropdown.change(
        fn=filter_leaderboard,
        inputs=[
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
        ],
        outputs=leaderboard,
    )
    search_box.change(
        fn=filter_leaderboard,
        inputs=[
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
        ],
        outputs=leaderboard,
    )
    params_slider.change(
        fn=filter_leaderboard,
        inputs=[
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
        ],
        outputs=leaderboard,
    )

    def on_benchmark_change(benchmark, _):
        if benchmark == "RTL-Repo":
            metric = "Exact Matching (EM)"
            return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
                benchmark, metric
            )
        else:
            metric = non_rtl_metrics[0]
            return gr.update(
                choices=non_rtl_metrics[:-1], value=metric
            ), generate_scatter_plot(benchmark, metric)

    def on_metric_change(benchmark, metric):
        benchmark, metric = handle_special_cases(benchmark, metric)
@@ -299,7 +427,7 @@
        return gr.update(value=benchmark), fig

    bubble_benchmark.change(
        fn=on_benchmark_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_metric, scatter_plot],
        js=""" // this is to avoid resetting user scroll each time a plot is re-generated
@@ -312,7 +440,8 @@
        observer.observe(document.getElementById('full-width-plot'), { childList: true });
        return [benchmark, metric];
    }
    """,
    )

    bubble_metric.change(
        fn=on_metric_change,
@@ -328,7 +457,14 @@
        observer.observe(document.getElementById('full-width-plot'), { childList: true });
        return [benchmark, metric];
    }
    """,
    )


app.launch(
    allowed_paths=[
        "logo.png",
        "hpai_logo_grad.png",
        "bsc-logo.png",
    ]
)
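The commit wires the same filter_leaderboard callback to four controls one registration at a time (the "# event handlers, ugly way but it works" block above). A hypothetical tightening of that block, kept here only as a sketch: the component names come from the diff, the loop itself is not part of the commit.

shared_inputs = [
    task_radio,
    benchmark_radio,
    model_type_dropdown,
    search_box,
    params_slider,
]
# Register the same callback on every control that should refresh the table.
for control in (benchmark_radio, model_type_dropdown, search_box, params_slider):
    control.change(
        fn=filter_leaderboard,
        inputs=shared_inputs,
        outputs=leaderboard,
    )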
aggregated_scores.csv → results/aggregated_scores.csv
RENAMED
File without changes
parse.py → results/parse.py
RENAMED
@@ -1,35 +1,99 @@
import csv
import json
import locale
from typing import Dict, Union

import pandas as pd

model_details = {
    "DeepSeek R1": ("https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General"),
    "Llama 3.1 405B": (
        "https://huggingface.co/meta-llama/Llama-3.1-405B",
        406,
        "General",
    ),
    "Llama 3.(1-3) 70B": (
        "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
        70.6,
        "General",
    ),
    "Qwen2.5 72B": (
        "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
        72.7,
        "General",
    ),
    "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General"),
    "StarChat2 15B v0.1": (
        "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
        16,
        "General",
    ),
    "DeepSeek R1 Distill Qwen 14B": (
        "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        14.8,
        "General",
    ),
    "CodeLlama 70B": (
        "https://huggingface.co/codellama/CodeLlama-70b-hf",
        69,
        "Coding",
    ),
    "QwenCoder 2.5 32B": (
        "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
        32.5,
        "Coding",
    ),
    "DeepSeek Coder 33B": (
        "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
        33.3,
        "Coding",
    ),
    "QwenCoder 2.5 14B": (
        "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
        14.7,
        "Coding",
    ),
    "OpenCoder 8B": (
        "https://huggingface.co/infly/OpenCoder-8B-Instruct",
        7.77,
        "Coding",
    ),
    "QwenCoder 2.5 7B": (
        "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
        7.61,
        "Coding",
    ),
    "DeepSeek Coder 6,7B": (
        "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
        6.74,
        "Coding",
    ),
    "HaVen-CodeQwen": (
        "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
        7.25,
        "RTL-Specific",
    ),
    "CodeV-CL-7B": ("https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific"),
    "CodeV-QW-7B": ("https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific"),
    "CodeV-DS-6.7B": (
        "https://huggingface.co/yang-z/CodeV-DS-6.7B",
        6.74,
        "RTL-Specific",
    ),
    "RTLCoder Mistral": (
        "https://huggingface.co/ishorn5/RTLCoder-v1.1",
        7.24,
        "RTL-Specific",
    ),
    "RTLCoder DeepSeek": (
        "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
        6.74,
        "RTL-Specific",
    ),
    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific"),
}


def get_headers(reader, agg=False) -> Union[list, list]:
    metrics, benchs = [], []
    for i, row in enumerate(reader):
@@ -42,6 +106,7 @@ def get_headers(reader, agg=False) -> Union[list, list]:
        return metrics
    return metrics, benchs


def get_model_params_and_url(model) -> Union[str, str, float]:
    if model not in model_details:
        return "-", "-", "-"
@@ -50,6 +115,7 @@ def get_model_params_and_url(model) -> Union[str, str, float]:
    type = model_details[model][2]
    return url, params, type


def parse_results(csv_path: str) -> list[dict]:
    """
    Each row has the following format:
@@ -57,8 +123,8 @@ def parse_results(csv_path: str) -> list[dict]:
    """
    dataset = []
    models = []
    with open(csv_path, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        metrics, benchs = get_headers(reader)
        for i, row in enumerate(reader):
            model = row[0]
@@ -69,12 +135,12 @@ def parse_results(csv_path: str) -> list[dict]:
            for metric, bench in zip(metrics, benchs):
                if metric == "EM":
                    metric = "Exact Matching (EM)"
                record = {}
                record["Model"] = model
                record["Model Type"] = type
                record["Benchmark"] = bench
                record["Task"] = metric
                record["Result"] = float(row[ctr].replace(",", "."))
                record["Model URL"] = url
                record["Params"] = params
                dataset.append(record)
@@ -82,32 +148,47 @@ def parse_results(csv_path: str) -> list[dict]:
    print(models)
    return dataset


def parse_agg(csv_path: str) -> list[dict]:
    """
    Each row has the following format:
    MODEL | BENCHMARK | TASK | METRIC | RESULT
    """
    return pd.read_csv("results/aggregated_scores.csv")


def writeJson(data: list):
    with open("results/results.json", "w") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print("Done")


def read_json():
    json_path = "results/results.json"
    with open(json_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data


def read_data() -> Union[pd.DataFrame, list, list, str]:
    data = read_json()
    df = pd.DataFrame(data)
    df.rename(
        columns={
            "Model": "Model",
            "Benchmark": "Benchmark",
            "Task": "Metric",
            "Result": "Score",
            "EM": "Exact Matching (EM)",
        },
        inplace=True,
    )
    df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
    benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
    metrics = df["Metric"].unique().tolist()
    default_metric = (
        "Functionality (FNC)" if "Functionality (FNC)" in metrics else metrics[0]
    )
    return df, benchmarks, metrics, default_metric
results.csv → results/results.csv
RENAMED
File without changes

results.json → results/results.json
RENAMED
File without changes

about.py → static/about.py
RENAMED
File without changes

metrics.md → static/metrics.md
RENAMED
File without changes

css_html_js.py → style/css_html_js.py
RENAMED
File without changes
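With parse.py now living under results/, a minimal usage sketch of the relocated parser (assuming the Space root is the working directory so results/results.json and results/aggregated_scores.csv are reachable; the column names below follow the rename map in read_data):

from results.parse import parse_agg, read_data

df, benchmarks, metrics, default_metric = read_data()
print(df.columns.tolist())   # Model, Model Type, Benchmark, Metric, Score, Model URL, Params
print(default_metric)        # "Functionality (FNC)" when present, otherwise the first metric

# Note: parse_agg ignores its csv_path argument and always reads the
# hard-coded results/aggregated_scores.csv path.
df_agg = parse_agg("./results/aggregated_scores.csv")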