import os
import json
from collections import Counter

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from apscheduler.schedulers.background import BackgroundScheduler

from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION

TOKEN = os.environ.get("TOKEN", None)

OWNER = "Online-Mind2Web"
YEAR_VERSION = "2025"
LOCAL_DEBUG = True


def get_dataframe_from_results(eval_path):
    # Load a leaderboard CSV, rank entries by average success rate, and format
    # the numeric columns to one decimal place for display.
    df = pd.read_csv(eval_path)
    df = df.sort_values(by=["Average SR"], ascending=False)
    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
        df[format_column] = df[format_column].map('{:.1f}'.format)
    return df


auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]


def refresh():
    # Re-read both leaderboard CSVs so the "Refresh" button picks up newly
    # added results without restarting the app.
    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
    return auto_eval_dataframe_test, human_eval_dataframe_test


def plot_heatmap_with_performance_bar(json_file):
    with open(json_file, "r") as f:
        data = json.load(f)

    # Every key ending in "_human_label" holds one agent's per-task judgment.
    agents = [k for k in data[0].keys() if k.endswith("_human_label")]
    records = []
    original_ids = [task["task_id"] for task in data]

    # Flatten the per-task labels into (task, agent, success) records,
    # treating anything other than a clean "1" as a failure.
    for task in data:
        task_id = task["task_id"]
        for agent in agents:
            raw_val = task.get(agent, "0")
            try:
                val = int(raw_val)
            except ValueError:
                val = 0
            val = 1 if val == 1 else 0
            records.append({
                "Task ID": task_id,
                "Agent": agent.replace("_human_label", ""),
                "Success": val
            })

    df = pd.DataFrame(records)
    pivot = df.pivot_table(index="Agent", columns="Task ID", values="Success", aggfunc="max")

    # Make sure every task appears as a column, in the original task order.
    for task_id in original_ids:
        if task_id not in pivot.columns:
            pivot[task_id] = 0
    pivot = pivot[original_ids]

    # Sort agents by overall success rate (highest first).
    agent_success_rate = pivot.sum(axis=1) / pivot.shape[1]
    pivot["SuccessRate"] = agent_success_rate
    pivot = pivot.sort_values(by="SuccessRate", ascending=False)
    pivot = pivot.drop(columns=["SuccessRate"])

    agent_name_map = {
        "Operator": "Operator",
        "Agent-E": "Agent-E",
        "Browser_Use": "Browser Use",
        "Claude_Computer_Use": "Claude Computer Use",
        "SeeAct": "SeeAct"
    }
    sorted_agents = pivot.index.tolist()
    pivot.index = [
        f"{agent_name_map.get(agent, agent)} ({agent_success_rate[agent]*100:.1f}%)"
        for agent in sorted_agents
    ]

    custom_labels = [["Success" if val == 1 else "Failure" for val in row] for row in pivot.values]
    # "Any agent": tasks solved by at least one agent; "Best agent": tasks
    # solved by the single strongest agent.
    any_agent_solved = pivot.max(axis=0).sum()
    best_agent_solved = pivot.sum(axis=1).max()
    total_tasks = len(original_ids)

    # Top row: per-task heatmap; bottom row: aggregate coverage bars.
    fig = make_subplots(
        rows=2, cols=1,
        row_heights=[0.8, 0.2],
        vertical_spacing=0.08,
        subplot_titles=("TASK ID", ""),
        shared_xaxes=False
    )

    fig.add_trace(go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale=[[0, "white"], [1, "skyblue"]],
        zmin=0,
        zmax=1,
        showscale=False,
        customdata=custom_labels,
        hovertemplate="Agent: %{y}<br>Task ID: %{x}<br>Completion: %{customdata}<extra></extra>"
    ), row=1, col=1)

    fig.add_trace(go.Bar(
        y=["Any agent", "Best agent"],
        x=[any_agent_solved, best_agent_solved],
        orientation='h',
        marker_color=["dodgerblue", "mediumseagreen"],
        text=[
            f"{int(any_agent_solved)}/{total_tasks} ({any_agent_solved / total_tasks:.1%})",
            f"{int(best_agent_solved)}/{total_tasks} ({best_agent_solved / total_tasks:.1%})"
        ],
        textposition="auto",
        showlegend=False
    ), row=2, col=1)

    # Invisible scatter traces exist only to supply the Success/Failure legend entries.
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=10, color='skyblue'),
        name='Success'
    ))
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=10, color='white', line=dict(width=1, color='black')),
        name='Failure'
    ))

    fig.update_xaxes(range=[0, total_tasks], row=2, col=1)
    fig.update_layout(
        height=600,
        xaxis=dict(showticklabels=False),
        yaxis=dict(title="Agent"),
        yaxis2=dict(title=""),
        margin=dict(t=60)
    )
    return fig


def gradio_plot_wrapper(json_file):
    # Wrapper for a Gradio file input: the uploaded file object exposes its
    # temp path via .name, which the plotting function expects.
    return plot_heatmap_with_performance_bar(json_file.name)
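

# A minimal usage sketch (assumption, not wired up in this app): the wrapper is
# intended to be attached to a gr.File upload, along the lines of
#
#     with gr.Blocks() as upload_demo:
#         label_file = gr.File(label="human_label.json")
#         plot_out = gr.Plot()
#         label_file.change(gradio_plot_wrapper, inputs=label_file, outputs=plot_out)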


demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")

with demo:
    gr.HTML(TITLE)
    gr.HTML(LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            )

    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.Dataframe(
            value=human_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=False
        )
        gr.Markdown("### Visualization")
        gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. 'Any agent': a task is counted as successful if at least one agent completes it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
        fig = plot_heatmap_with_performance_bar("./human_label.json")
        gr.Plot(fig)
        gr.Markdown(EVALUATION_DETAILS)

    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        auto_leaderboard_table_test = gr.Dataframe(
            value=auto_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=False
        )

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[auto_leaderboard_table_test, human_leaderboard_table_test],
    )


scheduler = BackgroundScheduler()
scheduler.start()
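
# Note: the scheduler is started with no jobs registered. A minimal sketch
# (an assumption, not part of the original app) of how a periodic refresh of
# the leaderboard data could be scheduled with APScheduler:
#
#     scheduler.add_job(refresh, "interval", minutes=30)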


if __name__ == "__main__":
    demo.launch(debug=True)