ggcristian committed on
Commit 711a69b · 1 Parent(s): 2c02057

Re-organize code

app.py CHANGED
@@ -1,71 +1,69 @@
1
- import json
2
- from typing import Union
3
 
4
  import gradio as gr
5
- import numpy as np
6
  import pandas as pd
7
  import plotly.express as px
8
- import plotly.graph_objects as go
9
  from gradio.themes.utils import colors
10
- from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
11
 
12
- from about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
13
- from css_html_js import custom_css, trigger_plot
14
- from parse import parse_agg, read_data, read_json
15
- from utils import (filter_bench, filter_bench_all, filter_RTLRepo,
16
- handle_special_cases, model_hyperlink, type_emoji)
17
 
18
 
19
  def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
20
  subset = df.copy()
21
-
22
  # Filter by task specific benchmarks when 'All' benchmarks is selected
23
  if task == "Spec-to-RTL":
24
  valid_benchmarks = s2r_benchs
25
- if benchmark == 'All':
26
- subset = subset[subset['Benchmark'].isin(valid_benchmarks)]
27
  elif task == "Code Completion":
28
  valid_benchmarks = cc_benchs
29
- if benchmark == 'All':
30
- subset = subset[subset['Benchmark'].isin(valid_benchmarks)]
31
  elif task == "Line Completion":
32
  valid_benchmarks = lc_benchs
33
- if benchmark == 'All':
34
- subset = subset[subset['Benchmark'].isin(valid_benchmarks)]
35
-
36
- if benchmark != 'All':
37
- subset = df[df['Benchmark'] == benchmark]
38
-
39
- if model_type != 'All':
40
  # without emojis
41
- subset = subset[subset['Model Type'] == model_type.split(" ")[0]]
42
  if search_query:
43
- subset = subset[subset['Model'].str.contains(search_query, case=False, na=False)]
 
 
44
  max_params = float(max_params)
45
- subset = subset[subset['Params'] <= max_params]
46
-
47
- if benchmark == 'All':
48
- if task == 'Spec-to-RTL':
49
- return filter_bench_all(subset, df_agg, agg_column='Agg S2R')
50
- elif task == 'Code Completion':
51
- return filter_bench_all(subset, df_agg, agg_column='Agg MC')
52
- elif task == 'Line Completion':
53
  return filter_RTLRepo(subset)
54
- elif benchmark == 'RTL-Repo':
55
  return filter_RTLRepo(subset)
56
  else:
57
  agg_column = None
58
- if benchmark == 'VerilogEval S2R':
59
- agg_column = 'Agg VerilogEval S2R'
60
- elif benchmark == 'VerilogEval MC':
61
- agg_column = 'Agg VerilogEval MC'
62
- elif benchmark == 'RTLLM':
63
- agg_column = 'Agg RTLLM'
64
- elif benchmark == 'VeriGen':
65
- agg_column = 'Agg VeriGen'
66
-
67
  return filter_bench(subset, df_agg, agg_column)
68
 
 
69
  def update_benchmarks_by_task(task):
70
  if task == "Spec-to-RTL":
71
  new_benchmarks = ["All"] + s2r_benchs
@@ -76,59 +74,90 @@ def update_benchmarks_by_task(task):
76
  else:
77
  new_benchmarks = ["All"] + benchmarks
78
  benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
79
- filtered = filter_leaderboard(task, benchmark_value, model_type_dropdown.value, search_box.value, params_slider.value)
80
  return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
81
 
 
82
  def generate_scatter_plot(benchmark, metric):
83
  benchmark, metric = handle_special_cases(benchmark, metric)
84
-
85
- subset = df[df['Benchmark'] == benchmark]
86
  if benchmark == "RTL-Repo":
87
- subset = subset[subset['Metric'].str.contains('EM', case=False, na=False)]
88
- detailed_scores = subset.groupby('Model', as_index=False)['Score'].mean()
89
- detailed_scores.rename(columns={'Score': 'Exact Matching (EM)'}, inplace=True)
90
  else:
91
- detailed_scores = subset.pivot_table(index='Model', columns='Metric', values='Score').reset_index()
92
-
93
- details = df[['Model', 'Params', 'Model Type']].drop_duplicates('Model')
94
- scatter_data = pd.merge(detailed_scores, details, on='Model', how='left').dropna(subset=['Params', metric])
95
 
96
- scatter_data['x'] = scatter_data['Params']
97
- scatter_data['y'] = scatter_data[metric]
98
- scatter_data['size'] = (scatter_data['x'] ** 0.3) * 40
99
 
100
  type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
101
- scatter_data['color'] = scatter_data['Model Type'].map(type_colors).fillna('gray')
102
 
103
  y_axis_limits = {
104
- 'Functionality (FNC)': [5, 90], 'Syntax (STX)': [20, 100], 'Synthesis (SYN)': [5, 90],
105
- 'Power': [0, 50], 'Performance': [0, 50], 'Area': [0, 50], 'Exact Matching (EM)': [0, 50]
106
  }
107
  y_range = y_axis_limits.get(metric, [0, 80])
108
 
109
  fig = px.scatter(
110
- scatter_data, x='x', y='y', log_x=True, size='size', color='Model Type', text='Model',
111
- hover_data={metric: ':.2f'}, title=f'Params vs. {metric} for {benchmark}',
112
- labels={'x': '# Params (Log Scale)', 'y': metric}, template="plotly_white",
113
- height=600, width=1200
114
  )
115
 
116
  fig.update_traces(
117
- textposition='top center', textfont_size=10,
118
- marker=dict(opacity=0.8, line=dict(width=0.5, color='black'))
 
119
  )
120
  fig.update_layout(
121
  xaxis=dict(
122
- showgrid=True, type='log', tickmode='array',
 
 
123
  tickvals=[8, 14, 32, 72, 200, 700],
124
- ticktext=['8', '14', '32', '72', '200', '700']
125
  ),
126
- showlegend=False, yaxis=dict(range=y_range),
127
- margin=dict(l=50, r=50, t=50, b=50), plot_bgcolor='white'
 
 
128
  )
129
 
130
  return fig
131
 
 
132
  js_func = """
133
  function refresh() {
134
  const url = new URL(window.location);
@@ -139,24 +168,36 @@ function refresh() {
139
  }
140
  }
141
  """
142
-
143
- with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)) as app:
 
 
144
  df, benchmarks, metrics, default_metric = read_data()
145
- df_agg = parse_agg("./aggregated_scores.csv")
146
  tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
147
  s2r_benchs = ["VerilogEval S2R", "RTLLM"]
148
  cc_benchs = ["VerilogEval MC", "VeriGen"]
149
  lc_benchs = ["RTL-Repo"]
150
- non_rtl_metrics = ["Syntax (STX)", "Functionality (FNC)", "Synthesis (SYN)", "Power", "Performance", "Area"]
151
  rtl_metrics = ["Exact Matching (EM)"]
152
- model_types = ['All', 'General 🟢', 'Coding 🔵', 'RTL-Specific 🔴']
153
-
154
- gr.HTML("""
 
155
  <p align="center" style="margin-bottom: -10px;">
156
  <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/> <br/>
157
  </p>
158
- """)
159
- gr.HTML("""
 
 
160
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
161
  <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
162
  <div style="text-align: center; margin-bottom: 15px;">
@@ -184,60 +225,99 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
184
  <a href="mailto:[email protected]">[email protected]</a>
185
  </p>
186
  </div>
187
- """)
 
188
  with gr.Tabs():
189
  with gr.Tab("Leaderboard"):
190
  with gr.Row(equal_height=True):
191
  with gr.Column():
192
- task_radio = gr.Radio(choices=tasks, label="Select Task", value='Spec-to-RTL')
193
- with gr.Column():
194
- benchmark_radio = gr.Radio(choices=["All"] + s2r_benchs, label="Select Benchmark", value='All')
195
-
196
- with gr.Row(equal_height=True):
197
- search_box = gr.Textbox(
198
- label="Search Model",
199
- placeholder="Type model name...",
200
- scale=2,
201
- )
202
- model_type_dropdown = gr.Radio(
203
- choices=model_types,
204
- label="Select Model Type",
205
- value='All',
206
- scale=3,
207
  )
208
- params_slider = gr.Slider(
209
- minimum=df['Params'].min(),
210
- maximum=700,
211
- value=700,
212
- label="Max Params",
213
- step=1,
214
- scale=2,
215
  )
216
-
217
  leaderboard = gr.DataFrame(
218
- value=filter_leaderboard('Spec-to-RTL', 'All', 'All', "", 700),
219
  headers="first row",
220
  show_row_numbers=True,
221
  wrap=True,
222
- datatype=["markdown", "html",],
223
  interactive=False,
224
- column_widths=["7%", "25%", "10%", "17%", "6%", "6%", "6%", "6%", "6%", "7%"])
225
-
226
  with gr.Tab("Plot View"):
227
  with gr.Row(equal_height=True):
228
  default_benchmark = s2r_benchs[0]
229
- bubble_benchmark = gr.Dropdown(choices=benchmarks, label="Select Benchmark", value=default_benchmark, elem_classes="gr-dropdown")
230
  default_metric = non_rtl_metrics[0]
231
- bubble_metric = gr.Dropdown(choices=non_rtl_metrics[:-1], label="Select Metric", value=default_metric)
232
  with gr.Row(equal_height=True):
233
- scatter_plot = gr.Plot(value=generate_scatter_plot(default_benchmark, default_metric), label="Bubble Chart", elem_id="full-width-plot")
234
 
235
  with gr.Tab("Metrics Information"):
236
- with open("metrics.md", "r") as file:
237
- gr.Markdown(file.read(), latex_delimiters=[
238
- {"left": "$$", "right": "$$", "display": True},
239
- {"left": "$", "right": "$", "display": False}
240
- ], elem_classes="metrics-page")
241
  with gr.Tab("About Us"):
242
  gr.HTML(
243
  """
@@ -267,7 +347,7 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
267
  </div>
268
  """
269
  )
270
-
271
  with gr.Row():
272
  with gr.Accordion("📙 Citation", open=False):
273
  citation_button = gr.Textbox(
@@ -277,21 +357,69 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
277
  elem_id="citation-button",
278
  show_copy_button=True,
279
  )
280
-
281
  # event handlers, ugly way but it works
282
- task_radio.change(fn=update_benchmarks_by_task, inputs=[task_radio], outputs=[benchmark_radio, leaderboard])
283
- benchmark_radio.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
284
- model_type_dropdown.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
285
- search_box.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
286
- params_slider.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
287
 
288
  def on_benchmark_change(benchmark, _):
289
  if benchmark == "RTL-Repo":
290
  metric = "Exact Matching (EM)"
291
- return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(benchmark, metric)
 
 
292
  else:
293
  metric = non_rtl_metrics[0]
294
- return gr.update(choices=non_rtl_metrics[:-1], value=metric), generate_scatter_plot(benchmark, metric)
 
 
295
 
296
  def on_metric_change(benchmark, metric):
297
  benchmark, metric = handle_special_cases(benchmark, metric)
@@ -299,7 +427,7 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
299
  return gr.update(value=benchmark), fig
300
 
301
  bubble_benchmark.change(
302
- fn=on_benchmark_change,
303
  inputs=[bubble_benchmark, bubble_metric],
304
  outputs=[bubble_metric, scatter_plot],
305
  js=""" // this is to avoid resetting user scroll each time a plot is re-generated
@@ -312,7 +440,8 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
312
  observer.observe(document.getElementById('full-width-plot'), { childList: true });
313
  return [benchmark, metric];
314
  }
315
- """)
 
316
 
317
  bubble_metric.change(
318
  fn=on_metric_change,
@@ -328,7 +457,14 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
328
  observer.observe(document.getElementById('full-width-plot'), { childList: true });
329
  return [benchmark, metric];
330
  }
331
- """)
332
-
 
333
 
334
- app.launch(allowed_paths=["logo.png", "hpai_logo_grad.png", "bsc-logo.png"])
1
+ import sys
 
2
 
3
  import gradio as gr
 
4
  import pandas as pd
5
  import plotly.express as px
 
6
  from gradio.themes.utils import colors
 
7
 
8
+ from results.parse import parse_agg, read_data
9
+ from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
10
+ from style.css_html_js import custom_css
11
+ from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases
 
12
 
13
 
14
  def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
15
  subset = df.copy()
16
+
17
  # Filter by task specific benchmarks when 'All' benchmarks is selected
18
  if task == "Spec-to-RTL":
19
  valid_benchmarks = s2r_benchs
20
+ if benchmark == "All":
21
+ subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
22
  elif task == "Code Completion":
23
  valid_benchmarks = cc_benchs
24
+ if benchmark == "All":
25
+ subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
26
  elif task == "Line Completion":
27
  valid_benchmarks = lc_benchs
28
+ if benchmark == "All":
29
+ subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
30
+
31
+ if benchmark != "All":
32
+ subset = df[df["Benchmark"] == benchmark]
33
+
34
+ if model_type != "All":
35
  # without emojis
36
+ subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
37
  if search_query:
38
+ subset = subset[
39
+ subset["Model"].str.contains(search_query, case=False, na=False)
40
+ ]
41
  max_params = float(max_params)
42
+ subset = subset[subset["Params"] <= max_params]
43
+
44
+ if benchmark == "All":
45
+ if task == "Spec-to-RTL":
46
+ return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
47
+ elif task == "Code Completion":
48
+ return filter_bench_all(subset, df_agg, agg_column="Agg MC")
49
+ elif task == "Line Completion":
50
  return filter_RTLRepo(subset)
51
+ elif benchmark == "RTL-Repo":
52
  return filter_RTLRepo(subset)
53
  else:
54
  agg_column = None
55
+ if benchmark == "VerilogEval S2R":
56
+ agg_column = "Agg VerilogEval S2R"
57
+ elif benchmark == "VerilogEval MC":
58
+ agg_column = "Agg VerilogEval MC"
59
+ elif benchmark == "RTLLM":
60
+ agg_column = "Agg RTLLM"
61
+ elif benchmark == "VeriGen":
62
+ agg_column = "Agg VeriGen"
63
+
64
  return filter_bench(subset, df_agg, agg_column)
65
 
66
+
67
  def update_benchmarks_by_task(task):
68
  if task == "Spec-to-RTL":
69
  new_benchmarks = ["All"] + s2r_benchs
 
74
  else:
75
  new_benchmarks = ["All"] + benchmarks
76
  benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
77
+ filtered = filter_leaderboard(
78
+ task,
79
+ benchmark_value,
80
+ model_type_dropdown.value,
81
+ search_box.value,
82
+ params_slider.value,
83
+ )
84
  return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
85
 
86
+
87
  def generate_scatter_plot(benchmark, metric):
88
  benchmark, metric = handle_special_cases(benchmark, metric)
89
+
90
+ subset = df[df["Benchmark"] == benchmark]
91
  if benchmark == "RTL-Repo":
92
+ subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
93
+ detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
94
+ detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
95
  else:
96
+ detailed_scores = subset.pivot_table(
97
+ index="Model", columns="Metric", values="Score"
98
+ ).reset_index()
99
+
100
+ details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
101
+ scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
102
+ subset=["Params", metric]
103
+ )
104
 
105
+ scatter_data["x"] = scatter_data["Params"]
106
+ scatter_data["y"] = scatter_data[metric]
107
+ scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40
108
 
109
  type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
110
+ scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")
111
 
112
  y_axis_limits = {
113
+ "Functionality (FNC)": [5, 90],
114
+ "Syntax (STX)": [20, 100],
115
+ "Synthesis (SYN)": [5, 90],
116
+ "Power": [0, 50],
117
+ "Performance": [0, 50],
118
+ "Area": [0, 50],
119
+ "Exact Matching (EM)": [0, 50],
120
  }
121
  y_range = y_axis_limits.get(metric, [0, 80])
122
 
123
  fig = px.scatter(
124
+ scatter_data,
125
+ x="x",
126
+ y="y",
127
+ log_x=True,
128
+ size="size",
129
+ color="Model Type",
130
+ text="Model",
131
+ hover_data={metric: ":.2f"},
132
+ title=f"Params vs. {metric} for {benchmark}",
133
+ labels={"x": "# Params (Log Scale)", "y": metric},
134
+ template="plotly_white",
135
+ height=600,
136
+ width=1200,
137
  )
138
 
139
  fig.update_traces(
140
+ textposition="top center",
141
+ textfont_size=10,
142
+ marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
143
  )
144
  fig.update_layout(
145
  xaxis=dict(
146
+ showgrid=True,
147
+ type="log",
148
+ tickmode="array",
149
  tickvals=[8, 14, 32, 72, 200, 700],
150
+ ticktext=["8", "14", "32", "72", "200", "700"],
151
  ),
152
+ showlegend=False,
153
+ yaxis=dict(range=y_range),
154
+ margin=dict(l=50, r=50, t=50, b=50),
155
+ plot_bgcolor="white",
156
  )
157
 
158
  return fig
159
 
160
+
161
  js_func = """
162
  function refresh() {
163
  const url = new URL(window.location);
 
168
  }
169
  }
170
  """
171
+
172
+ with gr.Blocks(
173
+ css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)
174
+ ) as app:
175
  df, benchmarks, metrics, default_metric = read_data()
176
+ df_agg = parse_agg("./results/aggregated_scores.csv")
177
  tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
178
  s2r_benchs = ["VerilogEval S2R", "RTLLM"]
179
  cc_benchs = ["VerilogEval MC", "VeriGen"]
180
  lc_benchs = ["RTL-Repo"]
181
+ non_rtl_metrics = [
182
+ "Syntax (STX)",
183
+ "Functionality (FNC)",
184
+ "Synthesis (SYN)",
185
+ "Power",
186
+ "Performance",
187
+ "Area",
188
+ ]
189
  rtl_metrics = ["Exact Matching (EM)"]
190
+ model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
191
+
192
+ gr.HTML(
193
+ """
194
  <p align="center" style="margin-bottom: -10px;">
195
  <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/> <br/>
196
  </p>
197
+ """
198
+ )
199
+ gr.HTML(
200
+ """
201
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
202
  <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
203
  <div style="text-align: center; margin-bottom: 15px;">
 
225
  <a href="mailto:[email protected]">[email protected]</a>
226
  </p>
227
  </div>
228
+ """
229
+ )
230
  with gr.Tabs():
231
  with gr.Tab("Leaderboard"):
232
  with gr.Row(equal_height=True):
233
  with gr.Column():
234
+ task_radio = gr.Radio(
235
+ choices=tasks, label="Select Task", value="Spec-to-RTL"
236
  )
237
+ with gr.Column():
238
+ benchmark_radio = gr.Radio(
239
+ choices=["All"] + s2r_benchs,
240
+ label="Select Benchmark",
241
+ value="All",
 
 
242
  )
243
+
244
+ with gr.Row(equal_height=True):
245
+ search_box = gr.Textbox(
246
+ label="Search Model",
247
+ placeholder="Type model name...",
248
+ scale=2,
249
+ )
250
+ model_type_dropdown = gr.Radio(
251
+ choices=model_types,
252
+ label="Select Model Type",
253
+ value="All",
254
+ scale=3,
255
+ )
256
+ params_slider = gr.Slider(
257
+ minimum=df["Params"].min(),
258
+ maximum=700,
259
+ value=700,
260
+ label="Max Params",
261
+ step=1,
262
+ scale=2,
263
+ )
264
+
265
  leaderboard = gr.DataFrame(
266
+ value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
267
  headers="first row",
268
  show_row_numbers=True,
269
  wrap=True,
270
+ datatype=[
271
+ "markdown",
272
+ "html",
273
+ ],
274
  interactive=False,
275
+ column_widths=[
276
+ "7%",
277
+ "25%",
278
+ "10%",
279
+ "17%",
280
+ "6%",
281
+ "6%",
282
+ "6%",
283
+ "6%",
284
+ "6%",
285
+ "7%",
286
+ ],
287
+ )
288
+
289
  with gr.Tab("Plot View"):
290
  with gr.Row(equal_height=True):
291
  default_benchmark = s2r_benchs[0]
292
+ bubble_benchmark = gr.Dropdown(
293
+ choices=benchmarks,
294
+ label="Select Benchmark",
295
+ value=default_benchmark,
296
+ elem_classes="gr-dropdown",
297
+ )
298
  default_metric = non_rtl_metrics[0]
299
+ bubble_metric = gr.Dropdown(
300
+ choices=non_rtl_metrics,
301
+ label="Select Metric",
302
+ value=default_metric,
303
+ )
304
  with gr.Row(equal_height=True):
305
+ scatter_plot = gr.Plot(
306
+ value=generate_scatter_plot(default_benchmark, default_metric),
307
+ label="Bubble Chart",
308
+ elem_id="full-width-plot",
309
+ )
310
 
311
  with gr.Tab("Metrics Information"):
312
+ with open("./static/metrics.md", "r") as file:
313
+ gr.Markdown(
314
+ file.read(),
315
+ latex_delimiters=[
316
+ {"left": "$$", "right": "$$", "display": True},
317
+ {"left": "$", "right": "$", "display": False},
318
+ ],
319
+ elem_classes="metrics-page",
320
+ )
321
  with gr.Tab("About Us"):
322
  gr.HTML(
323
  """
 
347
  </div>
348
  """
349
  )
350
+
351
  with gr.Row():
352
  with gr.Accordion("📙 Citation", open=False):
353
  citation_button = gr.Textbox(
 
357
  elem_id="citation-button",
358
  show_copy_button=True,
359
  )
360
+
361
  # event handlers, ugly way but it works
362
+ task_radio.change(
363
+ fn=update_benchmarks_by_task,
364
+ inputs=[task_radio],
365
+ outputs=[benchmark_radio, leaderboard],
366
+ )
367
+ benchmark_radio.change(
368
+ fn=filter_leaderboard,
369
+ inputs=[
370
+ task_radio,
371
+ benchmark_radio,
372
+ model_type_dropdown,
373
+ search_box,
374
+ params_slider,
375
+ ],
376
+ outputs=leaderboard,
377
+ )
378
+ model_type_dropdown.change(
379
+ fn=filter_leaderboard,
380
+ inputs=[
381
+ task_radio,
382
+ benchmark_radio,
383
+ model_type_dropdown,
384
+ search_box,
385
+ params_slider,
386
+ ],
387
+ outputs=leaderboard,
388
+ )
389
+ search_box.change(
390
+ fn=filter_leaderboard,
391
+ inputs=[
392
+ task_radio,
393
+ benchmark_radio,
394
+ model_type_dropdown,
395
+ search_box,
396
+ params_slider,
397
+ ],
398
+ outputs=leaderboard,
399
+ )
400
+ params_slider.change(
401
+ fn=filter_leaderboard,
402
+ inputs=[
403
+ task_radio,
404
+ benchmark_radio,
405
+ model_type_dropdown,
406
+ search_box,
407
+ params_slider,
408
+ ],
409
+ outputs=leaderboard,
410
+ )
411
 
412
  def on_benchmark_change(benchmark, _):
413
  if benchmark == "RTL-Repo":
414
  metric = "Exact Matching (EM)"
415
+ return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
416
+ benchmark, metric
417
+ )
418
  else:
419
  metric = non_rtl_metrics[0]
420
+ return gr.update(
421
+ choices=non_rtl_metrics[:-1], value=metric
422
+ ), generate_scatter_plot(benchmark, metric)
423
 
424
  def on_metric_change(benchmark, metric):
425
  benchmark, metric = handle_special_cases(benchmark, metric)
 
427
  return gr.update(value=benchmark), fig
428
 
429
  bubble_benchmark.change(
430
+ fn=on_benchmark_change,
431
  inputs=[bubble_benchmark, bubble_metric],
432
  outputs=[bubble_metric, scatter_plot],
433
  js=""" // this is to avoid resetting user scroll each time a plot is re-generated
 
440
  observer.observe(document.getElementById('full-width-plot'), { childList: true });
441
  return [benchmark, metric];
442
  }
443
+ """,
444
+ )
445
 
446
  bubble_metric.change(
447
  fn=on_metric_change,
 
457
  observer.observe(document.getElementById('full-width-plot'), { childList: true });
458
  return [benchmark, metric];
459
  }
460
+ """,
461
+ )
462
+
463
 
464
+ app.launch(
465
+ allowed_paths=[
466
+ "logo.png",
467
+ "hpai_logo_grad.png",
468
+ "bsc-logo.png",
469
+ ]
470
+ )
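
As an aside on the app.py listing above: the block under "# event handlers, ugly way but it works" registers the same filter_leaderboard callback on four controls with identical inputs and outputs. A short sketch of the same wiring done in a loop is shown below; it assumes the surrounding gr.Blocks context and the component names from the diff, and is an illustration only, not part of this commit.

    # Inside the same `with gr.Blocks(...) as app:` block as in app.py.
    filter_inputs = [
        task_radio,
        benchmark_radio,
        model_type_dropdown,
        search_box,
        params_slider,
    ]
    # Register the shared leaderboard filter once per control instead of
    # repeating the .change(...) call four times.
    for control in (benchmark_radio, model_type_dropdown, search_box, params_slider):
        control.change(fn=filter_leaderboard, inputs=filter_inputs, outputs=leaderboard)
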
aggregated_scores.csv → results/aggregated_scores.csv RENAMED
File without changes
parse.py → results/parse.py RENAMED
@@ -1,35 +1,99 @@
1
- import json
2
- import pandas as pd
3
  import csv
4
- from typing import Dict, Union
5
  import locale
6
 
7
  model_details = {
8
  "DeepSeek R1": ("https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General"),
9
- "Llama 3.1 405B": ("https://huggingface.co/meta-llama/Llama-3.1-405B", 406, "General"),
10
- "Llama 3.(1-3) 70B": ("https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", 70.6, "General"),
11
- "Qwen2.5 72B": ("https://huggingface.co/Qwen/Qwen2.5-72B-Instruct", 72.7, "General"),
12
  "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General"),
13
- "StarChat2 15B v0.1": ("https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1", 16, "General"),
14
- "DeepSeek R1 Distill Qwen 14B": ("https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", 14.8, "General"),
15
-
16
- "CodeLlama 70B": ("https://huggingface.co/codellama/CodeLlama-70b-hf", 69, "Coding"),
17
- "QwenCoder 2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct", 32.5, "Coding"),
18
- "DeepSeek Coder 33B": ("https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct", 33.3, "Coding"),
19
- "QwenCoder 2.5 14B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct", 14.7, "Coding"),
20
- "OpenCoder 8B": ("https://huggingface.co/infly/OpenCoder-8B-Instruct", 7.77, "Coding"),
21
- "QwenCoder 2.5 7B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct", 7.61, "Coding"),
22
- "DeepSeek Coder 6,7B": ("https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct", 6.74, "Coding"),
23
-
24
- "HaVen-CodeQwen": ("https://huggingface.co/yangyiyao/HaVen-CodeQwen", 7.25, "RTL-Specific"),
25
  "CodeV-CL-7B": ("https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific"),
26
  "CodeV-QW-7B": ("https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific"),
27
- "CodeV-DS-6.7B": ("https://huggingface.co/yang-z/CodeV-DS-6.7B", 6.74, "RTL-Specific"),
28
- "RTLCoder Mistral": ("https://huggingface.co/ishorn5/RTLCoder-v1.1", 7.24, "RTL-Specific"),
29
- "RTLCoder DeepSeek": ("https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1", 6.74, "RTL-Specific"),
30
- "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific")
31
  }
32
 
 
33
  def get_headers(reader, agg=False) -> Union[list, list]:
34
  metrics, benchs = [], []
35
  for i, row in enumerate(reader):
@@ -42,6 +106,7 @@ def get_headers(reader, agg=False) -> Union[list, list]:
42
  return metrics
43
  return metrics, benchs
44
 
 
45
  def get_model_params_and_url(model) -> Union[str, str, float]:
46
  if model not in model_details:
47
  return "-", "-", "-"
@@ -50,6 +115,7 @@ def get_model_params_and_url(model) -> Union[str, str, float]:
50
  type = model_details[model][2]
51
  return url, params, type
52
 
 
53
  def parse_results(csv_path: str) -> list[dict]:
54
  """
55
  Each row has the following format:
@@ -57,8 +123,8 @@ def parse_results(csv_path: str) -> list[dict]:
57
  """
58
  dataset = []
59
  models = []
60
- with open(csv_path, newline='') as csvfile:
61
- reader = csv.reader(csvfile, delimiter=',')
62
  metrics, benchs = get_headers(reader)
63
  for i, row in enumerate(reader):
64
  model = row[0]
@@ -69,12 +135,12 @@ def parse_results(csv_path: str) -> list[dict]:
69
  for metric, bench in zip(metrics, benchs):
70
  if metric == "EM":
71
  metric = "Exact Matching (EM)"
72
- record = {}
73
  record["Model"] = model
74
  record["Model Type"] = type
75
  record["Benchmark"] = bench
76
  record["Task"] = metric
77
- record["Result"] = float(row[ctr].replace(',','.'))
78
  record["Model URL"] = url
79
  record["Params"] = params
80
  dataset.append(record)
@@ -82,32 +148,47 @@ def parse_results(csv_path: str) -> list[dict]:
82
  print(models)
83
  return dataset
84
 
 
85
  def parse_agg(csv_path: str) -> list[dict]:
86
  """
87
  Each row has the following format:
88
  MODEL | BENCHMARK | TASK | METRIC | RESULT
89
  """
90
- return pd.read_csv("aggregated_scores.csv")
 
91
 
92
  def writeJson(data: list):
93
- with open('results.json', 'w') as f:
94
  json.dump(data, f, indent=4, ensure_ascii=False)
95
  print("Done")
96
 
 
97
  def read_json():
98
- json_path = "./results.json"
99
  with open(json_path, "r", encoding="utf-8") as file:
100
  data = json.load(file)
101
  return data
102
 
 
103
  def read_data() -> Union[pd.DataFrame, list, list, str]:
104
  data = read_json()
105
  df = pd.DataFrame(data)
106
- df.rename(columns={'Model': 'Model', 'Benchmark': 'Benchmark', 'Task': 'Metric', 'Result': 'Score', 'EM': 'Exact Matching (EM)'}, inplace=True)
107
- df['Params'] = pd.to_numeric(df['Params'], errors='coerce')
108
- benchmarks = sorted(df['Benchmark'].unique().tolist(), reverse=True)
109
- metrics = df['Metric'].unique().tolist()
110
- default_metric = 'Functionality (FNC)' if 'Functionality (FNC)' in metrics else metrics[0]
111
  return df, benchmarks, metrics, default_metric
112
 
113
1
  import csv
2
+ import json
3
  import locale
4
+ from typing import Dict, Union
5
+
6
+ import pandas as pd
7
 
8
  model_details = {
9
  "DeepSeek R1": ("https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General"),
10
+ "Llama 3.1 405B": (
11
+ "https://huggingface.co/meta-llama/Llama-3.1-405B",
12
+ 406,
13
+ "General",
14
+ ),
15
+ "Llama 3.(1-3) 70B": (
16
+ "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
17
+ 70.6,
18
+ "General",
19
+ ),
20
+ "Qwen2.5 72B": (
21
+ "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
22
+ 72.7,
23
+ "General",
24
+ ),
25
  "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General"),
26
+ "StarChat2 15B v0.1": (
27
+ "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
28
+ 16,
29
+ "General",
30
+ ),
31
+ "DeepSeek R1 Distill Qwen 14B": (
32
+ "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
33
+ 14.8,
34
+ "General",
35
+ ),
36
+ "CodeLlama 70B": (
37
+ "https://huggingface.co/codellama/CodeLlama-70b-hf",
38
+ 69,
39
+ "Coding",
40
+ ),
41
+ "QwenCoder 2.5 32B": (
42
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
43
+ 32.5,
44
+ "Coding",
45
+ ),
46
+ "DeepSeek Coder 33B": (
47
+ "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
48
+ 33.3,
49
+ "Coding",
50
+ ),
51
+ "QwenCoder 2.5 14B": (
52
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
53
+ 14.7,
54
+ "Coding",
55
+ ),
56
+ "OpenCoder 8B": (
57
+ "https://huggingface.co/infly/OpenCoder-8B-Instruct",
58
+ 7.77,
59
+ "Coding",
60
+ ),
61
+ "QwenCoder 2.5 7B": (
62
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
63
+ 7.61,
64
+ "Coding",
65
+ ),
66
+ "DeepSeek Coder 6,7B": (
67
+ "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
68
+ 6.74,
69
+ "Coding",
70
+ ),
71
+ "HaVen-CodeQwen": (
72
+ "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
73
+ 7.25,
74
+ "RTL-Specific",
75
+ ),
76
  "CodeV-CL-7B": ("https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific"),
77
  "CodeV-QW-7B": ("https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific"),
78
+ "CodeV-DS-6.7B": (
79
+ "https://huggingface.co/yang-z/CodeV-DS-6.7B",
80
+ 6.74,
81
+ "RTL-Specific",
82
+ ),
83
+ "RTLCoder Mistral": (
84
+ "https://huggingface.co/ishorn5/RTLCoder-v1.1",
85
+ 7.24,
86
+ "RTL-Specific",
87
+ ),
88
+ "RTLCoder DeepSeek": (
89
+ "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
90
+ 6.74,
91
+ "RTL-Specific",
92
+ ),
93
+ "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific"),
94
  }
95
 
96
+
97
  def get_headers(reader, agg=False) -> Union[list, list]:
98
  metrics, benchs = [], []
99
  for i, row in enumerate(reader):
 
106
  return metrics
107
  return metrics, benchs
108
 
109
+
110
  def get_model_params_and_url(model) -> Union[str, str, float]:
111
  if model not in model_details:
112
  return "-", "-", "-"
 
115
  type = model_details[model][2]
116
  return url, params, type
117
 
118
+
119
  def parse_results(csv_path: str) -> list[dict]:
120
  """
121
  Each row has the following format:
 
123
  """
124
  dataset = []
125
  models = []
126
+ with open(csv_path, newline="") as csvfile:
127
+ reader = csv.reader(csvfile, delimiter=",")
128
  metrics, benchs = get_headers(reader)
129
  for i, row in enumerate(reader):
130
  model = row[0]
 
135
  for metric, bench in zip(metrics, benchs):
136
  if metric == "EM":
137
  metric = "Exact Matching (EM)"
138
+ record = {}
139
  record["Model"] = model
140
  record["Model Type"] = type
141
  record["Benchmark"] = bench
142
  record["Task"] = metric
143
+ record["Result"] = float(row[ctr].replace(",", "."))
144
  record["Model URL"] = url
145
  record["Params"] = params
146
  dataset.append(record)
 
148
  print(models)
149
  return dataset
150
 
151
+
152
  def parse_agg(csv_path: str) -> list[dict]:
153
  """
154
  Each row has the following format:
155
  MODEL | BENCHMARK | TASK | METRIC | RESULT
156
  """
157
+ return pd.read_csv("results/aggregated_scores.csv")
158
+
159
 
160
  def writeJson(data: list):
161
+ with open("results/results.json", "w") as f:
162
  json.dump(data, f, indent=4, ensure_ascii=False)
163
  print("Done")
164
 
165
+
166
  def read_json():
167
+ json_path = "results/results.json"
168
  with open(json_path, "r", encoding="utf-8") as file:
169
  data = json.load(file)
170
  return data
171
 
172
+
173
  def read_data() -> Union[pd.DataFrame, list, list, str]:
174
  data = read_json()
175
  df = pd.DataFrame(data)
176
+ df.rename(
177
+ columns={
178
+ "Model": "Model",
179
+ "Benchmark": "Benchmark",
180
+ "Task": "Metric",
181
+ "Result": "Score",
182
+ "EM": "Exact Matching (EM)",
183
+ },
184
+ inplace=True,
185
+ )
186
+ df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
187
+ benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
188
+ metrics = df["Metric"].unique().tolist()
189
+ default_metric = (
190
+ "Functionality (FNC)" if "Functionality (FNC)" in metrics else metrics[0]
191
+ )
192
  return df, benchmarks, metrics, default_metric
193
 
194
 
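
One detail visible in the results/parse.py hunk above: parse_agg accepts a csv_path argument but ignores it and always reads "results/aggregated_scores.csv" (app.py passes "./results/aggregated_scores.csv", which only resolves to the same file when the app is launched from the repository root); its return annotation list[dict] also does not match the DataFrame it returns. A minimal sketch of a variant that honours the argument, offered as an assumption about the intent rather than part of the commit:

    import pandas as pd

    def parse_agg(csv_path: str) -> pd.DataFrame:
        # Read the caller-supplied path instead of a hard-coded location.
        return pd.read_csv(csv_path)
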
results.csv → results/results.csv RENAMED
File without changes
results.json → results/results.json RENAMED
File without changes
about.py → static/about.py RENAMED
File without changes
metrics.md → static/metrics.md RENAMED
File without changes
css_html_js.py → style/css_html_js.py RENAMED
File without changes
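
Taken together, the renames above and the updated imports in app.py imply roughly the following layout after this commit (utils.py and the image assets logo.png, hpai_logo_grad.png and bsc-logo.png are assumed to stay at the repository root, since only their import and launch references appear in the diff):

    .
    ├── app.py
    ├── utils.py
    ├── logo.png, hpai_logo_grad.png, bsc-logo.png
    ├── results/
    │   ├── parse.py
    │   ├── aggregated_scores.csv
    │   ├── results.csv
    │   └── results.json
    ├── static/
    │   ├── about.py
    │   └── metrics.md
    └── style/
        └── css_html_js.py
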