WeijianQi1999's picture
update human label
dbd7a03
import os
import gradio as gr
import pandas as pd
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from apscheduler.schedulers.background import BackgroundScheduler
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
# Module-level configuration. None of these names is referenced again in the
# visible portion of this file.

# Value of the TOKEN environment variable, or None when it is not set.
TOKEN = os.environ.get("TOKEN")

OWNER = "Online-Mind2Web"
YEAR_VERSION = "2025"
LOCAL_DEBUG = True
def get_dataframe_from_results(eval_path):
    """Load a leaderboard CSV, rank it and format the score columns.

    Args:
        eval_path: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            containing at least the columns 'Easy', 'Medium', 'Hard' and
            'Average SR'.

    Returns:
        A DataFrame sorted by 'Average SR' in descending order in which the
        four score columns have been converted to strings with exactly one
        decimal place.
    """
    score_columns = ('Easy', 'Medium', 'Hard', 'Average SR')

    # Rank on the numeric values first; the columns become text afterwards.
    ranked = pd.read_csv(eval_path).sort_values(by=["Average SR"], ascending=False)

    one_decimal = '{:.1f}'.format
    as_text = {column: ranked[column].map(one_decimal) for column in score_columns}
    return ranked.assign(**as_text)
# Both leaderboard tables are read once, when the module is imported.
auto_eval_dataframe_test = get_dataframe_from_results(
    './auto_Mind2Web-Online - Leaderboard_data.csv'
)
human_eval_dataframe_test = get_dataframe_from_results(
    './human_Mind2Web-Online - Leaderboard_data.csv'
)

# Per-column datatypes passed to gr.Dataframe: four text columns, four
# numeric columns, then one more text column.
TYPES = ["str"] * 4 + ["number"] * 4 + ["str"]
def refresh():
    """Re-read both leaderboard CSV files from disk.

    The reloaded tables are only returned; the module-level
    ``auto_eval_dataframe_test`` / ``human_eval_dataframe_test`` names are
    not rebound (there is no ``global`` statement here).

    Returns:
        A tuple ``(auto_table, human_table)`` of freshly loaded DataFrames.
    """
    leaderboard_files = (
        './auto_Mind2Web-Online - Leaderboard_data.csv',
        './human_Mind2Web-Online - Leaderboard_data.csv',
    )
    auto_table, human_table = map(get_dataframe_from_results, leaderboard_files)
    return auto_table, human_table
def _label_to_success(raw_val):
    """Map one raw human label to 1 (success) or 0 (anything else)."""
    try:
        return 1 if int(raw_val) == 1 else 0
    except (TypeError, ValueError, OverflowError):
        # JSON null (None), nested lists/dicts, non-numeric strings, NaN and
        # infinities cannot be read as an integer. Count them as failures
        # instead of letting one bad label abort the whole figure.
        return 0


def plot_heatmap_with_performance_bar(json_file):
    """Build the agent-by-task success heatmap with a summary bar chart.

    The JSON file must contain a non-empty list of task objects. Each task
    needs a "task_id" key; every key of the first task that ends in
    "_human_label" is treated as one agent's label column. A label counts as
    a success only when it converts to the integer 1; missing or malformed
    labels count as failures.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        A plotly figure with two rows: a heatmap (agents x tasks, agents
        ordered by descending success rate) and a horizontal bar chart with
        the number of tasks solved by any agent and by the best agent.

    Raises:
        IndexError: If the JSON list is empty.
        KeyError: If a task has no "task_id" key.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Agent columns are discovered from the first task only.
    agents = [k for k in data[0].keys() if k.endswith("_human_label")]
    original_ids = [task["task_id"] for task in data]

    records = []
    for task in data:
        task_id = task["task_id"]
        for agent in agents:
            records.append({
                "Task ID": task_id,
                "Agent": agent.replace("_human_label", ""),
                "Success": _label_to_success(task.get(agent, "0")),
            })

    df = pd.DataFrame(records)
    pivot = df.pivot_table(index="Agent", columns="Task ID", values="Success", aggfunc="max")

    # Restore the task order of the input file (pivot_table sorts columns)
    # and make sure every task has a column.
    for task_id in original_ids:
        if task_id not in pivot.columns:
            pivot[task_id] = 0
    pivot = pivot[original_ids]

    # Order agents by success rate; the helper column is only needed for
    # sorting and is removed again before plotting.
    agent_success_rate = pivot.sum(axis=1) / pivot.shape[1]
    pivot["SuccessRate"] = agent_success_rate
    pivot = pivot.sort_values(by="SuccessRate", ascending=False)
    pivot = pivot.drop(columns=["SuccessRate"])

    # Display names for known agents; unknown agents keep their raw name.
    agent_name_map = {
        "Operator": "Operator",
        "Agent-E": "Agent-E",
        "Browser_Use": "Browser Use",
        "Claude_Computer_Use": "Claude Computer Use",
        "SeeAct": "SeeAct"
    }

    sorted_agents = pivot.index.tolist()
    pivot.index = [
        f"{agent_name_map.get(agent, agent)} ({agent_success_rate[agent]*100:.1f}%)"
        for agent in sorted_agents
    ]

    # Hover text for each heatmap cell.
    custom_labels = [["Success" if val == 1 else "Failure" for val in row] for row in pivot.values]

    any_agent_solved = pivot.max(axis=0).sum()
    best_agent_solved = pivot.sum(axis=1).max()
    total_tasks = len(original_ids)

    fig = make_subplots(
        rows=2, cols=1,
        row_heights=[0.8, 0.2],
        vertical_spacing=0.08,
        subplot_titles=("TASK ID", ""),
        shared_xaxes=False
    )

    fig.add_trace(go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale=[[0, "white"], [1, "skyblue"]],
        zmin=0,
        zmax=1,
        showscale=False,
        customdata=custom_labels,
        hovertemplate="Agent: %{y}<br>Task ID: %{x}<br>Completion: %{customdata}<extra></extra>"
    ), row=1, col=1)

    fig.add_trace(go.Bar(
        y=["Any agent", "Best agent"],
        x=[any_agent_solved, best_agent_solved],
        orientation='h',
        marker_color=["dodgerblue", "mediumseagreen"],
        text=[
            f"{int(any_agent_solved)}/{total_tasks} ({any_agent_solved / total_tasks:.1%})",
            f"{int(best_agent_solved)}/{total_tasks} ({best_agent_solved / total_tasks:.1%})"
        ],
        textposition="auto",
        showlegend=False
    ), row=2, col=1)

    # Data-less scatter traces whose only purpose is to add "Success" and
    # "Failure" entries to the legend (the heatmap has its colour scale
    # hidden).
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=10, color='skyblue'),
        name='Success'
    ))
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=10, color='white', line=dict(width=1, color='black')),
        name='Failure'
    ))

    fig.update_xaxes(range=[0, total_tasks], row=2, col=1)
    fig.update_layout(
        height=600,
        xaxis=dict(showticklabels=False),
        yaxis=dict(title="Agent"),
        yaxis2=dict(title=""),
        margin=dict(t=60)
    )
    return fig
def gradio_plot_wrapper(json_file):
    """Plot the heatmap for a file-like object that exposes its path as ``.name``.

    Args:
        json_file: Object whose ``name`` attribute is the path of the JSON
            file to visualise.

    Returns:
        Whatever ``plot_heatmap_with_performance_bar`` returns for that path.
    """
    json_path = json_file.name
    return plot_heatmap_with_performance_bar(json_path)
# Gradio UI: page header, citation box, three tabs (human evaluation,
# automatic evaluation, submission guideline) and a refresh button.
# NOTE(review): the CSS rule below targets #human-leaderboard-table, but no
# component in this block sets elem_id="human-leaderboard-table", so the rule
# currently matches nothing -- confirm whether the human table should carry
# that id.
demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")

with demo:
    gr.HTML(TITLE)
    gr.HTML(LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        # The label's emoji was stored as mis-decoded UTF-8 bytes; it is
        # written here as the intended character.
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            )

    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.Dataframe(
            value=human_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=False
        )
        gr.Markdown("### Visualization")
        gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
        # The heatmap is rendered once, while the UI is being built.
        fig = plot_heatmap_with_performance_bar("./human_label.json")
        gr.Plot(fig)
        gr.Markdown(EVALUATION_DETAILS)

    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        auto_leaderboard_table_test = gr.Dataframe(
            value=auto_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=False
        )

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    # refresh() returns (auto, human), matching the order of `outputs`.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[auto_leaderboard_table_test, human_leaderboard_table_test],
    )
# Start an APScheduler background scheduler as soon as the module is
# imported. No jobs are registered on it in the visible portion of this file.
scheduler = BackgroundScheduler()
scheduler.start()
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)