[ { "benchmark": "WorkArena-L1", "score": 42.7, "std_err": 0.4, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", "reproducible": "Yes", "comments": "NA" }, { "benchmark": "WorkArena++-L2", "score": 3.0, "std_err": 0.6, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", "reproducible": "Yes", "comments": "NA" }, { "benchmark": "WorkArena++-L3", "score": 0.0, "std_err": 0.0, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", "reproducible": "Yes", "comments": "NA" }, { "benchmark": "MiniWoB", "score": 71.3, "std_err": 0.5, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", "reproducible": "Yes", "comments": "NA" }, { "benchmark": "WebArena", "score": 23.5, "std_err": 0.4, "benchmark_specific": "No", "benchmark_tuned": "No", "followed_evaluation_protocol": "Yes", "reproducible": "Yes", "comments": "NA" } ]