Commit 2dba94f · 1 Parent(s): c6b576c

initialize
- README.md +8 -5
- app.py +89 -0
- auto_Mind2Web-Online - Leaderboard_data.csv +6 -0
- content.py +84 -0
- dataset_readme.md +19 -0
- human_Mind2Web-Online - Leaderboard_data.csv +6 -0
- requirements.txt +5 -0
- scorer.py +104 -0
README.md
CHANGED
```diff
@@ -1,13 +1,16 @@
 ---
 title: Online Mind2Web Leaderboard
-emoji:
+emoji: π
 colorFrom: yellow
-colorTo:
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.22.0
 app_file: app.py
-pinned:
+pinned: true
+license: apache-2.0
+hf_oauth: true
+failure_strategy: rollback
+tags:
+- leaderboard
-
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
app.py
ADDED
```python
import os

import gradio as gr
import pandas as pd
import numpy as np

from apscheduler.schedulers.background import BackgroundScheduler

# InfoStrings
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION

TOKEN = os.environ.get("TOKEN", None)

OWNER = "Online Mind2Web"
# api = HfApi()

YEAR_VERSION = "2024"

LOCAL_DEBUG = True

# Display the results: load a leaderboard CSV, sort by average success rate,
# and format the numeric columns to one decimal place.
def get_dataframe_from_results(eval_path):
    df = pd.read_csv(eval_path)
    df = df.sort_values(by=["Average SR"], ascending=False)
    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
        df[format_column] = df[format_column].map('{:.1f}'.format)
    # df["Average SR"] = df["Average SR"].map('{:.1f}'.format)
    return df

auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')


# One Gradio datatype per leaderboard column.
TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]

# Re-read both CSVs so the Refresh button picks up newly added rows.
def refresh():
    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
    return auto_eval_dataframe_test, human_eval_dataframe_test

def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.HTML(LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            )

    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.components.Dataframe(
            value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
        )
    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        auto_leaderboard_table_test = gr.components.Dataframe(
            value=auto_eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
        )

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            auto_leaderboard_table_test,
            human_leaderboard_table_test,
        ],
    )

scheduler = BackgroundScheduler()
scheduler.start()
demo.launch(debug=True)
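```

The BackgroundScheduler above is started with no jobs registered, so nothing actually runs in the background yet. A minimal sketch of how it could be put to use, assuming the common leaderboard-Space pattern of periodically restarting the Space to reload freshly pushed data; the `repo_id` is a hypothetical placeholder, not named anywhere in this commit:

```python
# Hypothetical wiring, not part of this commit: restart the Space hourly so
# newly pushed leaderboard CSVs are reloaded. Reuses the TOKEN read above.
from huggingface_hub import HfApi

api = HfApi(token=TOKEN)

def restart_space():
    # Placeholder repo id; the commit does not identify the Space.
    api.restart_space(repo_id="owner/online-mind2web-leaderboard")

scheduler.add_job(restart_space, "interval", hours=1)
```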
auto_Mind2Web-Online - Leaderboard_data.csv
ADDED
```csv
Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
Operator,Unknown,OpenAI,OSU NLP,80.3,73.4,59,71.8,2025-3-22
SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,65.1,36.1,18.5,39.8,2025-3-22
Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,58.6,37.5,24.3,40.1,2025-3-22
Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,61.9,28.1,21.2,35.8,2025-3-22
Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,57.4,31.9,14.4,34.7,2025-3-22
```
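These nine columns pair one-to-one with the `TYPES` list in app.py. A small, hedged sanity check of that pairing, assuming the CSV sits in the working directory:

```python
import pandas as pd

# TYPES copied from app.py: one Gradio datatype per leaderboard column.
TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]

df = pd.read_csv("auto_Mind2Web-Online - Leaderboard_data.csv")
assert len(df.columns) == len(TYPES), "each leaderboard column needs a datatype"
print(df.sort_values("Average SR", ascending=False).to_string(index=False))
```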
content.py
ADDED
````python
TITLE = """<h1 align="center" id="space-title">π Online Mind2Web Leaderboard</h1>"""
LINKS = """
<div align="center">
<a href="#">Blog</a> |
<a href="#">Paper</a> |
<a href="https://github.com/OSU-NLP-Group/Online-Mind2Web">Code</a> |
<a href="https://huggingface.co/datasets/osunlp/Online-Mind2Web">Data</a>
</div>
"""

INTRODUCTION_TEXT = """
Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites.


## Tasks
Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks to evaluate agents' performance in real-world environments.

Tasks are categorized into three difficulty levels based on the number of steps human annotators need:
- Easy: 1 - 5
- Medium: 6 - 10
- Hard: 11 +

## Leaderboard
"""

SUBMISSION_TEXT = """
## Submissions
Participants are invited to submit their agent's trajectories for evaluation. Submissions will be evaluated with our auto-eval.

### Format of submission
Submissions must include a sequence of images (i.e., the screenshots in the trajectory) and a result.json file for each task. The JSON file should contain the fields: "Task", "Task_id", and "action_history". You can refer to an example of the submission files.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
Online Mind2Web"""

SUBMIT_INTRODUCTION = """
## Please submit the trajectory files in the following format:
Each task is stored in a folder named after its `task_id`, containing:

- `trajectory/`: Stores screenshots of each step.
- `result.json`: Task metadata and action history.

**Structure:**
```
main_directory/
└── task_id/
    ├── result.json
    └── trajectory/
        ├── 0_screenshot.png
        ├── 1_screenshot.png
        └── ...
```

**`result.json` format:**
```json
{
  "task_id": 123,
  "task": "abc",
  "action_history": ["abc", "xyz", "..."]
}
```
Please send your agent's name, model family, and organization via email to [email protected], along with the trajectory directory attached.

We will run the auto-evaluation. If you have conducted your own human evaluation, please also attach your human eval results; we will spot-check these before adding them to the human-eval table.

"""
DATA_DATASET = """## More Statistics for Online Mind2Web Benchmark
"""


def format_error(msg):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"

def format_warning(msg):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"

def format_log(msg):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"

def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
````
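The layout and `result.json` schema in SUBMIT_INTRODUCTION are only described in prose, and the commit is inconsistent about key casing (SUBMISSION_TEXT says "Task"/"Task_id" while the JSON example uses "task_id"/"task"). Below is a hedged validator sketch a submitter could run before emailing a directory; the function name and checks are illustrative assumptions that follow the JSON example's lowercase keys:

```python
import json
from pathlib import Path

# Keys taken from the result.json example above (lowercase variant).
REQUIRED_KEYS = {"task_id", "task", "action_history"}

def validate_submission(main_directory: str) -> list[str]:
    """Collect human-readable problems found in a submission directory."""
    problems = []
    for task_dir in sorted(Path(main_directory).iterdir()):
        if not task_dir.is_dir():
            continue
        result_file = task_dir / "result.json"
        if not result_file.exists():
            problems.append(f"{task_dir.name}: missing result.json")
        else:
            result = json.loads(result_file.read_text())
            missing = REQUIRED_KEYS - result.keys()
            if missing:
                problems.append(f"{task_dir.name}: result.json missing {sorted(missing)}")
        trajectory = task_dir / "trajectory"
        if not trajectory.is_dir() or not any(trajectory.glob("*_screenshot.png")):
            problems.append(f"{task_dir.name}: no screenshots in trajectory/")
    return problems

if __name__ == "__main__":
    print(validate_submission("main_directory") or "submission looks well-formed")
```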
dataset_readme.md
ADDED
```markdown
## Dataset Description
- **Homepage:**
- **Repository:**
- **Paper:**
- **Point of Contact:**

### Dataset Summary
Mind2Web-Online is the online version of [Mind2Web](https://osu-nlp-group.github.io/Mind2Web/): a more diverse and user-centric dataset that includes 300 high-quality tasks from 136 popular websites across various domains. The dataset covers a diverse set of user tasks, such as clothing, food, housing, and transportation, to evaluate web agents' performance in a real-world online environment.

### Data Fields
- "task_id" (str): Unique id for each task.
- "website" (str): Website URL.
- "task_description" (str): Task description.
- "reference_length" (int): Number of steps required for a human annotator to complete the task.

### Disclaimer
This dataset was collected and released solely for research purposes, with the goal of making the web more accessible via language technologies. The authors strongly oppose any potentially harmful use of the data or technology against any party.

### Citation Information
```
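Given the fields above, here is a hedged sketch of loading the dataset from the Hub and bucketing tasks by the Easy/Medium/Hard cut-offs used by the leaderboard; the split name `test` is an assumption to verify against the dataset card:

```python
from datasets import load_dataset

# Assumption: the data lives in a "test" split; check the dataset card.
ds = load_dataset("osunlp/Online-Mind2Web", split="test")

def difficulty(reference_length: int) -> str:
    # Cut-offs from the leaderboard description: 1-5 easy, 6-10 medium, 11+ hard.
    if reference_length <= 5:
        return "Easy"
    if reference_length <= 10:
        return "Medium"
    return "Hard"

for task in ds.select(range(3)):
    print(task["task_id"], task["website"], difficulty(task["reference_length"]))
```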
human_Mind2Web-Online - Leaderboard_data.csv
ADDED
```csv
Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
Operator,Unknown,OpenAI,OSU NLP,83.1,58.0,43.2,61.3,2025-3-22
SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,60.2,25.2,8.1,30.7,2025-3-22
Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,55.4,26.6,8.1,30.0,2025-3-22
Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,14.9,29.0,2025-3-22
Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
```
requirements.txt
ADDED
```text
datasets
gradio
huggingface-hub
numpy
APScheduler
```
scorer.py
ADDED
```python
import json
import re
import string
import warnings

import numpy as np


def normalize_number_str(number_str: str) -> float:
    # we replace these common units and commas to allow
    # conversion to float
    for char in ["$", "%", ","]:
        number_str = number_str.replace(char, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    pattern = f"[{''.join(char_list)}]"
    return re.split(pattern, s)


def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    def is_float(element: any) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False

    if model_answer is None:
        model_answer = "None"

    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)

    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g. for seagull vs. sea gull
    no_spaces = re.sub(r"\s", "", input_str)

    # Remove punctuation, if specified.
    if remove_punct:
        translator = str.maketrans("", "", string.punctuation)
        return no_spaces.lower().translate(translator)
    else:
        return no_spaces.lower()
```
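For reference, a short usage sketch exercising the three ground-truth branches of `question_scorer` (number, list, plain string); the answers are made-up illustrations, not benchmark data:

```python
from scorer import question_scorer

# Numeric ground truth: "$1,000" normalizes to 1000.0 before comparison.
assert question_scorer("$1,000", "1000") is True

# List ground truth: elements are split on , or ; and compared pairwise
# after whitespace/case normalization (punctuation is kept).
assert question_scorer("Apple, banana", "apple,banana") is True

# Plain-string ground truth: spaces, case, and punctuation are stripped.
assert question_scorer("Sea gull!", "seagull") is True
```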