xuetianci99 committed on
Commit
be69b6a
·
verified ·
1 Parent(s): 808487a

Upload content.py

Browse files
Files changed (1) hide show
  1. content.py +24 -14
content.py CHANGED
@@ -15,14 +15,13 @@ Based on the number of steps required by human annotators, tasks are divided int
15
 
16
  LEADERBOARD_TEXT = """
17
  ### Leaderboard
18
- We maintain two leaderboards—one for automated evaluation and another for human evaluation.
19
- All submissions will be auto-evaluated internally, and if human evaluation results are provided, a subset will be selected for rigorous spot-check verification.
20
  """
21
 
22
  SUBMISSION_TEXT = """
23
  ## Submissions
24
  Participants are invited to submit their agent's trajectories for testing. The submissions will be evaluated based on our auto-eval.
25
-
26
  ### Format of submission
27
  Submissions must include a sequence of images (i.e., screenshots in the trajectory) and a result.json file for each task. The JSON file should contain the fields: "Task", "Task_id", and "action_history". You can refer to an example of the submission files.
28
  """
@@ -37,7 +36,6 @@ CITATION_BUTTON_TEXT = r"""
37
  month = "Mar",
38
  url = "https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4"
39
  }
40
-
41
  @inproceedings{deng2023mind2web,
42
  author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
43
  booktitle = {Advances in Neural Information Processing Systems},
@@ -52,13 +50,15 @@ CITATION_BUTTON_TEXT = r"""
52
  """
53
 
54
  SUBMIT_INTRODUCTION = """
55
- ## Please submit the trajectory file with the following format:
56
-
57
- Each task is stored in a folder named after its `task_id`, containing:
58
 
 
 
59
  - `trajectory/`: Stores screenshots of each step.
60
  - `result.json`: Task metadata and action history.
61
-
62
  **Structure:**
63
  ```
64
  main_directory/
@@ -69,7 +69,6 @@ main_directory/
69
  ├── 1_screenshot.png
70
  └── ...
71
  ```
72
-
73
  **`result.json` format:**
74
  ```json
75
  {
@@ -78,11 +77,23 @@ main_directory/
78
  "action_history": ["abc", "xyz", "..."]
79
  }
80
  ```
81
- Please send your agent's name, model family, and organization via email to xue.[email protected], along with the trajectory directory attached.
82
-
83
- Here is an [example](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f) of the format. We encourage you to use the script provided in our GitHub repository to obtain evaluation results and submit them. To ensure the authenticity and reliability of the reported results, we will also conduct a verification.
84
- If you have conducted your own human evaluation, please also attach your human eval results—we will spot-check these before adding them to the human-eval table.
 
 
 
 
 
 
 
 
 
 
 
85
 
 
86
  """
87
  DATA_DATASET = """## More Statistics for Online-Mind2Web Benchmark
88
  """
@@ -99,4 +110,3 @@ def format_log(msg):
99
 
100
  def model_hyperlink(link, model_name):
101
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
102
-
 
15
 
16
  LEADERBOARD_TEXT = """
17
  ### Leaderboard
18
+ Our goal is to conduct a rigorous assessment of the current state of web agents. We maintain two leaderboards—one for automatic evaluation and another for human evaluation.
19
+ Please click "Submission Guideline" for details.
20
  """
21
 
22
  SUBMISSION_TEXT = """
23
  ## Submissions
24
  Participants are invited to submit their agent's trajectories for testing. The submissions will be evaluated based on our auto-eval.
 
25
  ### Format of submission
26
  Submissions must include a sequence of images (i.e., screenshots in the trajectory) and a result.json file for each task. The JSON file should contain the fields: "Task", "Task_id", and "action_history". You can refer to an example of the submission files.
27
  """
 
36
  month = "Mar",
37
  url = "https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4"
38
  }
 
39
  @inproceedings{deng2023mind2web,
40
  author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
41
  booktitle = {Advances in Neural Information Processing Systems},
 
50
  """
51
 
52
  SUBMIT_INTRODUCTION = """
53
+ You should use the script provided in our GitHub repository to obtain automatic evaluation results on your own and submit them along with all trajectories.
54
+ To ensure the authenticity and reliability of the reported results, we will also conduct a verification of auto-eval results.
55
+ If you have conducted your own human evaluation, please also attach your human-eval results. We will spot-check these before adding them to the human-eval table.
56
 
57
+ ## ⚠ Please submit the trajectory file with the following format:
58
+ The result of each task is stored in a folder named after its `task_id`, containing:
59
  - `trajectory/`: Stores screenshots of each step.
60
  - `result.json`: Task metadata and action history.
61
+ Here is an [example](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f) of the format.
62
  **Structure:**
63
  ```
64
  main_directory/
 
69
  ├── 1_screenshot.png
70
  └── ...
71
  ```
 
72
  **`result.json` format:**
73
  ```json
74
  {
 
77
  "action_history": ["abc", "xyz", "..."]
78
  }
79
  ```
80
+ **`human_result.json` format:**
81
+ ```json
82
+ [
83
+ {
84
+ "task_id": 123,
85
+ "task": "abc",
86
+ "human_label": 0 or 1 (failure or success)
87
+ },
88
+ {
89
+ "task_id": 456,
90
+ "task": "def",
91
+ "human_label": 0 or 1 (failure or success)
92
+ }
93
+ ]
94
+ ```
95
 
96
+ Please email your agent's name, model family, and organization to [email protected], and include the trajectory directory and auto-evaluation result file as attachments (optional: human evaluation result).
97
  """
98
  DATA_DATASET = """## More Statistics for Online-Mind2Web Benchmark
99
  """
 
110
 
111
  def model_hyperlink(link, model_name):
112
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'