WeijianQi1999 committed
Commit 2dba94f · 1 Parent(s): c6b576c

initialize
README.md CHANGED
@@ -1,13 +1,16 @@
  ---
  title: Online Mind2Web Leaderboard
- emoji: 🦀
+ emoji: 🌐
  colorFrom: yellow
- colorTo: purple
+ colorTo: indigo
  sdk: gradio
- sdk_version: 5.22.0
  app_file: app.py
- pinned: false
- short_description: osunlp/Online_Mind2Web_Leaderboard
+ pinned: true
+ license: apache-2.0
+ hf_oauth: true
+ failure_strategy: rollback
+ tags:
+ - leaderboard
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,89 @@
+ import os
+
+ import gradio as gr
+ import pandas as pd
+ import numpy as np
+
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ # InfoStrings
+ from scorer import question_scorer
+ from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
+
+ TOKEN = os.environ.get("TOKEN", None)
+
+ OWNER = "Online Mind2Web"
+ # api = HfApi()
+
+ YEAR_VERSION = "2024"
+
+ LOCAL_DEBUG = True
+
+ # Display the results
+ def get_dataframe_from_results(eval_path):
+     df = pd.read_csv(eval_path)
+     df = df.sort_values(by=["Average SR"], ascending=False)
+     for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
+         df[format_column] = df[format_column].map('{:.1f}'.format)
+     # df["Average SR"] = df["Average SR"].map('{:.1f}'.format)
+     return df
+
+ auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+ human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
+
+
+ TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
+
+ def refresh():
+     auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+     human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
+     return auto_eval_dataframe_test, human_eval_dataframe_test
+
+ def upload_file(files):
+     file_paths = [file.name for file in files]
+     return file_paths
+
+
+ demo = gr.Blocks()
+ with demo:
+     gr.HTML(TITLE)
+     gr.HTML(LINKS)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id="citation-button",
+                 lines=10,
+             )
+
+     with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
+         human_leaderboard_table_test = gr.components.Dataframe(
+             value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
+             column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
+         )
+     with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
+         auto_leaderboard_table_test = gr.components.Dataframe(
+             value=auto_eval_dataframe_test, datatype=TYPES, interactive=False,
+             column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
+         )
+
+     with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
+         with gr.Row():
+             gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
+
+     refresh_button = gr.Button("Refresh")
+     refresh_button.click(
+         refresh,
+         inputs=[],
+         outputs=[
+             auto_leaderboard_table_test,
+             human_leaderboard_table_test,
+         ],
+     )
+
+ scheduler = BackgroundScheduler()
+ scheduler.start()
+ demo.launch(debug=True)
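Note that app.py starts a `BackgroundScheduler` without registering any job. A minimal sketch of how a job could be attached, assuming the common leaderboard-Space pattern of periodically restarting the Space so freshly uploaded result CSVs are reloaded; `LEADERBOARD_PATH`, the restart interval, and the use of `HfApi.restart_space` are assumptions, not part of this commit:

```python
# Hypothetical sketch (not committed code): give the scheduler a periodic job.
import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

TOKEN = os.environ.get("TOKEN", None)  # same env var app.py reads
LEADERBOARD_PATH = "osunlp/Online_Mind2Web_Leaderboard"  # assumed Space id
api = HfApi(token=TOKEN)

def restart_space():
    # Restarting the Space re-runs app.py, which re-reads both leaderboard CSVs.
    api.restart_space(repo_id=LEADERBOARD_PATH)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)  # hourly; the interval is arbitrary
scheduler.start()
```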
auto_Mind2Web-Online - Leaderboard_data.csv ADDED
@@ -0,0 +1,6 @@
+ Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+ Operator,Unknown,OpenAI,OSU NLP,80.3,73.4,59,71.8,2025-3-22
+ SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,65.1,36.1,18.5,39.8,2025-3-22
+ Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,58.6,37.5,24.3,40.1,2025-3-22
+ Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,61.9,28.1,21.2,35.8,2025-3-22
+ Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,57.4,31.9,14.4,34.7,2025-3-22
content.py ADDED
@@ -0,0 +1,84 @@
+ TITLE = """<h1 align="center" id="space-title">🏆 Online Mind2Web Leaderboard</h1>"""
+ LINKS = """
+ <div align="center">
+ <a href="#">Blog</a> |
+ <a href="#">Paper</a> |
+ <a href="https://github.com/OSU-NLP-Group/Online-Mind2Web">Code</a> |
+ <a href="https://huggingface.co/datasets/osunlp/Online-Mind2Web">Data</a>
+ </div>
+ """
+
+ INTRODUCTION_TEXT = """
+ Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites.
+
+
+ ## Tasks
+ Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks to evaluate agents' performance in real-world environments.
+
+ Tasks are categorized into three difficulty levels based on the number of steps human annotators need:
+ - Easy: 1 - 5
+ - Medium: 6 - 10
+ - Hard: 11 +
+
+ ## Leaderboard
+ """
+
+ SUBMISSION_TEXT = """
+ ## Submissions
+ Participants are invited to submit their agent's trajectories for evaluation. Submissions are scored with our auto-eval.
+
+ ### Format of submission
+ Submissions must include a sequence of images (i.e., the screenshots in the trajectory) and a result.json file for each task. The JSON file should contain the fields "task_id", "task", and "action_history". You can refer to the example submission files.
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""
+ Online Mind2Web"""
+
+ SUBMIT_INTRODUCTION = """
+ ## ⚠️ Please submit trajectory files in the following format:
+ Each task is stored in a folder named after its `task_id`, containing:
+
+ - `trajectory/`: Stores screenshots of each step.
+ - `result.json`: Task metadata and action history.
+
+ **Structure:**
+ ```
+ main_directory/
+ └── task_id/
+     ├── result.json
+     └── trajectory/
+         ├── 0_screenshot.png
+         ├── 1_screenshot.png
+         └── ...
+ ```
+
+ **`result.json` format:**
+ ```json
+ {
+     "task_id": 123,
+     "task": "abc",
+     "action_history": ["abc", "xyz", "..."]
+ }
+ ```
+ Please send your agent's name, model family, and organization via email to [email protected], with the trajectory directory attached.
+
+ We will run the auto-evaluation. If you have conducted your own human evaluation, please also attach your human eval results; we will spot-check these before adding them to the human-eval table.
+
+ """
+ DATA_DATASET = """## More Statistics for Online Mind2Web Benchmark
+ """
+
+
+ def format_error(msg):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def format_warning(msg):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def format_log(msg):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
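For reference, a minimal sketch of assembling one task's submission folder in the layout described by SUBMIT_INTRODUCTION above. The field names and file layout follow that guideline; the helper itself (`write_submission`) and its arguments are illustrative, not part of the commit:

```python
# Hypothetical helper (not committed code): write one task's submission folder.
import json
from pathlib import Path

def write_submission(root: str, task_id: int, task: str, actions: list[str], screenshots: list[bytes]) -> None:
    task_dir = Path(root) / str(task_id)
    traj_dir = task_dir / "trajectory"
    traj_dir.mkdir(parents=True, exist_ok=True)

    # result.json holds the task metadata and the action history.
    result = {"task_id": task_id, "task": task, "action_history": actions}
    (task_dir / "result.json").write_text(json.dumps(result, indent=2))

    # Screenshots are numbered in step order: 0_screenshot.png, 1_screenshot.png, ...
    for step, image_bytes in enumerate(screenshots):
        (traj_dir / f"{step}_screenshot.png").write_bytes(image_bytes)
```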
dataset_readme.md ADDED
@@ -0,0 +1,19 @@
+ ## Dataset Description
+ - **Homepage:**
+ - **Repository:**
+ - **Paper:**
+ - **Point of Contact:**
+
+ ### Dataset Summary
+ Mind2Web-Online is the online version of [Mind2Web](https://osu-nlp-group.github.io/Mind2Web/): a more diverse and user-centric dataset that includes 300 high-quality tasks from 136 popular websites across various domains. It covers a diverse set of user tasks in domains such as clothing, food, housing, and transportation, to evaluate web agents' performance in a real-world online environment.
+
+ ### Data Fields
+ - "task_id" (str): Unique id for each task.
+ - "website" (str): Website URL.
+ - "task_description" (str): Task description.
+ - "reference_length" (int): Number of steps required for a human annotator to complete the task.
+
+ ### Disclaimer
+ This dataset was collected and released solely for research purposes, with the goal of making the web more accessible via language technologies. The authors strongly oppose any potentially harmful use of the data or technology against any party.
+
+ ### Citation Information
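To make the Data Fields above concrete, a single record might look like the following; the values are made up for illustration and are not drawn from the dataset:

```python
# Illustrative record matching the fields listed in dataset_readme.md (hypothetical values).
example_task = {
    "task_id": "a1b2c3",                  # str: unique id for each task
    "website": "https://www.example.com", # str: website URL
    "task_description": "Find the cheapest one-way flight from Columbus to Seattle.",  # str
    "reference_length": 7,                # int: steps a human annotator needs
}
```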
human_Mind2Web-Online - Leaderboard_data.csv ADDED
@@ -0,0 +1,6 @@
+ Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+ Operator,Unknown,OpenAI,OSU NLP,83.1,58.0,43.2,61.3,2025-3-22
+ SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,60.2,25.2,8.1,30.7,2025-3-22
+ Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,55.4,26.6,8.1,30.0,2025-3-22
+ Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,14.9,29.0,2025-3-22
+ Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ datasets
+ gradio
+ huggingface-hub
+ numpy
+ APScheduler
scorer.py ADDED
@@ -0,0 +1,104 @@
+ import json
+ import re
+ import string
+ import warnings
+
+ import numpy as np
+
+
+ def normalize_number_str(number_str: str) -> float:
+     # we replace these common units and commas to allow
+     # conversion to float
+     for char in ["$", "%", ","]:
+         number_str = number_str.replace(char, "")
+     try:
+         return float(number_str)
+     except ValueError:
+         print(f"String {number_str} cannot be normalized to number str.")
+         return float("inf")
+
+
+ def split_string(
+     s: str,
+     char_list: list[str] = [",", ";"],
+ ) -> list[str]:
+     pattern = f"[{''.join(char_list)}]"
+     return re.split(pattern, s)
+
+
+ def question_scorer(
+     model_answer: str,
+     ground_truth: str,
+ ) -> bool:
+     def is_float(element: any) -> bool:
+         try:
+             float(element)
+             return True
+         except ValueError:
+             return False
+
+     if model_answer is None:
+         model_answer = "None"
+
+     # if gt is a number
+     if is_float(ground_truth):
+         print(f"Evaluating {model_answer} as a number.")
+         normalized_answer = normalize_number_str(model_answer)
+         return normalized_answer == float(ground_truth)
+
+     # if gt is a list
+     elif any(char in ground_truth for char in [",", ";"]):
+         print(f"Evaluating {model_answer} as a comma separated list.")
+         # question with the fish: normalization removes punct
+
+         gt_elems = split_string(ground_truth)
+         ma_elems = split_string(model_answer)
+
+         # check length is the same
+         if len(gt_elems) != len(ma_elems):
+             warnings.warn(
+                 "Answer lists have different lengths, returning False.", UserWarning
+             )
+             return False
+
+         # compare each element as float or str
+         comparisons = []
+         for ma_elem, gt_elem in zip(ma_elems, gt_elems):
+             if is_float(gt_elem):
+                 normalized_ma_elem = normalize_number_str(ma_elem)
+                 comparisons.append(normalized_ma_elem == float(gt_elem))
+             else:
+                 # we do not remove punct since comparisons can include punct
+                 comparisons.append(
+                     normalize_str(ma_elem, remove_punct=False)
+                     == normalize_str(gt_elem, remove_punct=False)
+                 )
+         return all(comparisons)
+
+     # if gt is a str
+     else:
+         print(f"Evaluating {model_answer} as a string.")
+         return normalize_str(model_answer) == normalize_str(ground_truth)
+
+
+ def normalize_str(input_str, remove_punct=True) -> str:
+     """
+     Normalize a string by:
+     - Removing all white spaces
+     - Optionally removing punctuation (if remove_punct is True)
+     - Converting to lowercase
+     Parameters:
+     - input_str: str, the string to normalize
+     - remove_punct: bool, whether to remove punctuation (default: True)
+     Returns:
+     - str, the normalized string
+     """
+     # Remove all white spaces. Required e.g for seagull vs. sea gull
+     no_spaces = re.sub(r"\s", "", input_str)
+
+     # Remove punctuation, if specified.
+     if remove_punct:
+         translator = str.maketrans("", "", string.punctuation)
+         return no_spaces.lower().translate(translator)
+     else:
+         return no_spaces.lower()
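A quick example of how `question_scorer` behaves on the three ground-truth types it handles (number, delimited list, plain string). The inputs are illustrative only and are not part of the committed code:

```python
# Hypothetical usage of scorer.question_scorer.
from scorer import question_scorer

print(question_scorer("$1,234", "1234"))                  # True: numeric comparison after stripping $, %, and commas
print(question_scorer("apple; banana", "apple, banana"))  # True: element-wise comparison of the delimited lists
print(question_scorer("Sea Gull", "seagull"))             # True: whitespace, punctuation, and case are normalized away
```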