Commit 2dba94f · 1 Parent(s): c6b576c

initialize
- README.md +8 -5
- app.py +89 -0
- auto_Mind2Web-Online - Leaderboard_data.csv +6 -0
- content.py +84 -0
- dataset_readme.md +19 -0
- human_Mind2Web-Online - Leaderboard_data.csv +6 -0
- requirements.txt +5 -0
- scorer.py +104 -0
README.md
CHANGED
```diff
@@ -1,13 +1,16 @@
 ---
 title: Online Mind2Web Leaderboard
-emoji:
+emoji: π
 colorFrom: yellow
-colorTo:
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.22.0
 app_file: app.py
-pinned:
+pinned: true
+license: apache-2.0
+hf_oauth: true
+failure_strategy: rollback
+tags:
+- leaderboard
-
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
app.py
ADDED
```python
import os

import gradio as gr
import pandas as pd
import numpy as np

from apscheduler.schedulers.background import BackgroundScheduler

# InfoStrings
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION

TOKEN = os.environ.get("TOKEN", None)

OWNER = "Online Mind2Web"
# api = HfApi()

YEAR_VERSION = "2024"

LOCAL_DEBUG = True

# Display the results: load a leaderboard CSV, sort by average success rate,
# and format the numeric columns to one decimal place.
def get_dataframe_from_results(eval_path):
    df = pd.read_csv(eval_path)
    df = df.sort_values(by=["Average SR"], ascending=False)
    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
        df[format_column] = df[format_column].map('{:.1f}'.format)
    # df["Average SR"] = df["Average SR"].map('{:.1f}'.format)
    return df

auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')


# One Gradio datatype per leaderboard column.
TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]

# Re-read both CSVs so the Refresh button picks up newly added rows.
def refresh():
    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
    return auto_eval_dataframe_test, human_eval_dataframe_test

def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.HTML(LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            )

    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.components.Dataframe(
            value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
        )
    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        auto_leaderboard_table_test = gr.components.Dataframe(
            value=auto_eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
        )

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            auto_leaderboard_table_test,
            human_leaderboard_table_test,
        ],
    )

scheduler = BackgroundScheduler()
scheduler.start()
demo.launch(debug=True)
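```

The BackgroundScheduler above is started with no jobs registered, so nothing actually runs in the background yet. A minimal sketch of how it could be put to use, assuming the common leaderboard-Space pattern of periodically restarting the Space to reload freshly pushed data; the `repo_id` is a hypothetical placeholder, not named anywhere in this commit:

```python
# Hypothetical wiring, not part of this commit: restart the Space hourly so
# newly pushed leaderboard CSVs are reloaded. Reuses the TOKEN read above.
from huggingface_hub import HfApi

api = HfApi(token=TOKEN)

def restart_space():
    # Placeholder repo id; the commit does not identify the Space.
    api.restart_space(repo_id="owner/online-mind2web-leaderboard")

scheduler.add_job(restart_space, "interval", hours=1)
```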
auto_Mind2Web-Online - Leaderboard_data.csv
ADDED
```csv
Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
Operator,Unknown,OpenAI,OSU NLP,80.3,73.4,59,71.8,2025-3-22
SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,65.1,36.1,18.5,39.8,2025-3-22
Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,58.6,37.5,24.3,40.1,2025-3-22
Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,61.9,28.1,21.2,35.8,2025-3-22
Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,57.4,31.9,14.4,34.7,2025-3-22
```
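These nine columns pair one-to-one with the `TYPES` list in app.py. A small, hedged sanity check of that pairing, assuming the CSV sits in the working directory:

```python
import pandas as pd

# TYPES copied from app.py: one Gradio datatype per leaderboard column.
TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]

df = pd.read_csv("auto_Mind2Web-Online - Leaderboard_data.csv")
assert len(df.columns) == len(TYPES), "each leaderboard column needs a datatype"
print(df.sort_values("Average SR", ascending=False).to_string(index=False))
```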
content.py
ADDED
````python
TITLE = """<h1 align="center" id="space-title">π Online Mind2Web Leaderboard</h1>"""
LINKS = """
<div align="center">
<a href="#">Blog</a> |
<a href="#">Paper</a> |
<a href="https://github.com/OSU-NLP-Group/Online-Mind2Web">Code</a> |
<a href="https://huggingface.co/datasets/osunlp/Online-Mind2Web">Data</a>
</div>
"""

INTRODUCTION_TEXT = """
Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites.


## Tasks
Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks to evaluate agents' performance in real-world environments.

Tasks are categorized into three difficulty levels based on the number of steps human annotators need:
- Easy: 1 - 5
- Medium: 6 - 10
- Hard: 11 +

## Leaderboard
"""

SUBMISSION_TEXT = """
## Submissions
Participants are invited to submit their agent's trajectories for evaluation. Submissions will be evaluated with our auto-eval.

### Format of submission
Submissions must include a sequence of images (i.e., the screenshots in the trajectory) and a result.json file for each task. The JSON file should contain the fields: "Task", "Task_id", and "action_history". You can refer to an example of the submission files.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
Online Mind2Web"""

SUBMIT_INTRODUCTION = """
## Please submit the trajectory files in the following format:
Each task is stored in a folder named after its `task_id`, containing:

- `trajectory/`: Stores screenshots of each step.
- `result.json`: Task metadata and action history.

**Structure:**
```
main_directory/
└── task_id/
    ├── result.json
    └── trajectory/
        ├── 0_screenshot.png
        ├── 1_screenshot.png
        └── ...
```

**`result.json` format:**
```json
{
  "task_id": 123,
  "task": "abc",
  "action_history": ["abc", "xyz", "..."]
}
```
Please send your agent's name, model family, and organization via email to [email protected], along with the trajectory directory attached.

We will run the auto-evaluation. If you have conducted your own human evaluation, please also attach your human eval results; we will spot-check these before adding them to the human-eval table.

"""
DATA_DATASET = """## More Statistics for Online Mind2Web Benchmark
"""


def format_error(msg):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"

def format_warning(msg):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"

def format_log(msg):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"

def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
````
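The layout and `result.json` schema in SUBMIT_INTRODUCTION are only described in prose, and the commit is inconsistent about key casing (SUBMISSION_TEXT says "Task"/"Task_id" while the JSON example uses "task_id"/"task"). Below is a hedged validator sketch a submitter could run before emailing a directory; the function name and checks are illustrative assumptions that follow the JSON example's lowercase keys:

```python
import json
from pathlib import Path

# Keys taken from the result.json example above (lowercase variant).
REQUIRED_KEYS = {"task_id", "task", "action_history"}

def validate_submission(main_directory: str) -> list[str]:
    """Collect human-readable problems found in a submission directory."""
    problems = []
    for task_dir in sorted(Path(main_directory).iterdir()):
        if not task_dir.is_dir():
            continue
        result_file = task_dir / "result.json"
        if not result_file.exists():
            problems.append(f"{task_dir.name}: missing result.json")
        else:
            result = json.loads(result_file.read_text())
            missing = REQUIRED_KEYS - result.keys()
            if missing:
                problems.append(f"{task_dir.name}: result.json missing {sorted(missing)}")
        trajectory = task_dir / "trajectory"
        if not trajectory.is_dir() or not any(trajectory.glob("*_screenshot.png")):
            problems.append(f"{task_dir.name}: no screenshots in trajectory/")
    return problems

if __name__ == "__main__":
    print(validate_submission("main_directory") or "submission looks well-formed")
```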
dataset_readme.md
ADDED
```markdown
## Dataset Description
- **Homepage:**
- **Repository:**
- **Paper:**
- **Point of Contact:**

### Dataset Summary
Mind2Web-Online is the online version of [Mind2Web](https://osu-nlp-group.github.io/Mind2Web/): a more diverse and user-centric dataset that includes 300 high-quality tasks from 136 popular websites across various domains. The dataset covers a diverse set of user tasks, such as clothing, food, housing, and transportation, to evaluate web agents' performance in a real-world online environment.

### Data Fields
- "task_id" (str): Unique id for each task.
- "website" (str): Website URL.
- "task_description" (str): Task description.
- "reference_length" (int): Number of steps required for a human annotator to complete the task.

### Disclaimer
This dataset was collected and released solely for research purposes, with the goal of making the web more accessible via language technologies. The authors strongly oppose any potentially harmful use of the data or technology against any party.

### Citation Information
```
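Given the fields above, here is a hedged sketch of loading the dataset from the Hub and bucketing tasks by the Easy/Medium/Hard cut-offs used by the leaderboard; the split name `test` is an assumption to verify against the dataset card:

```python
from datasets import load_dataset

# Assumption: the data lives in a "test" split; check the dataset card.
ds = load_dataset("osunlp/Online-Mind2Web", split="test")

def difficulty(reference_length: int) -> str:
    # Cut-offs from the leaderboard description: 1-5 easy, 6-10 medium, 11+ hard.
    if reference_length <= 5:
        return "Easy"
    if reference_length <= 10:
        return "Medium"
    return "Hard"

for task in ds.select(range(3)):
    print(task["task_id"], task["website"], difficulty(task["reference_length"]))
```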
human_Mind2Web-Online - Leaderboard_data.csv
ADDED
```csv
Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
Operator,Unknown,OpenAI,OSU NLP,83.1,58.0,43.2,61.3,2025-3-22
SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,60.2,25.2,8.1,30.7,2025-3-22
Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,55.4,26.6,8.1,30.0,2025-3-22
Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,14.9,29.0,2025-3-22
Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
```
requirements.txt
ADDED
```text
datasets
gradio
huggingface-hub
numpy
APScheduler
```
scorer.py
ADDED
```python
import json
import re
import string
import warnings

import numpy as np


def normalize_number_str(number_str: str) -> float:
    # we replace these common units and commas to allow
    # conversion to float
    for char in ["$", "%", ","]:
        number_str = number_str.replace(char, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    pattern = f"[{''.join(char_list)}]"
    return re.split(pattern, s)


def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    def is_float(element: any) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False

    if model_answer is None:
        model_answer = "None"

    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)

    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g. for seagull vs. sea gull
    no_spaces = re.sub(r"\s", "", input_str)

    # Remove punctuation, if specified.
    if remove_punct:
        translator = str.maketrans("", "", string.punctuation)
        return no_spaces.lower().translate(translator)
    else:
        return no_spaces.lower()
```
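For reference, a short usage sketch exercising the three ground-truth branches of `question_scorer` (number, list, plain string); the answers are made-up illustrations, not benchmark data:

```python
from scorer import question_scorer

# Numeric ground truth: "$1,000" normalizes to 1000.0 before comparison.
assert question_scorer("$1,000", "1000") is True

# List ground truth: elements are split on , or ; and compared pairwise
# after whitespace/case normalization (punctuation is kept).
assert question_scorer("Apple, banana", "apple,banana") is True

# Plain-string ground truth: spaces, case, and punctuation are stripped.
assert question_scorer("Sea gull!", "seagull") is True
```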