rename the github link
- ZeroEval-main/result_dirs/zebra-grid.summary.json +44 -0
- _about_us.md +1 -1
- _header.md +1 -1
- app.py +2 -2
- constants.py +1 -1
- data_utils.py +1 -1
- update_data.sh +3 -3
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
@@ -175,6 +175,17 @@
     "Total Puzzles": 1000,
     "Reason Lens": "855.72"
   },
+  {
+    "Model": "Qwen2.5-72B-Instruct",
+    "Mode": "greedy",
+    "Puzzle Acc": "26.60",
+    "Cell Acc": "40.92",
+    "No answer": "11.90",
+    "Easy Puzzle Acc": "76.43",
+    "Hard Puzzle Acc": "7.22",
+    "Total Puzzles": 1000,
+    "Reason Lens": "1795.90"
+  },
   {
     "Model": "gpt-4-turbo-2024-04-09",
     "Mode": "sampling",
@@ -186,6 +197,17 @@
     "Total Puzzles": 1000,
     "Reason Lens": "1165.90"
   },
+  {
+    "Model": "Qwen2.5-32B-Instruct",
+    "Mode": "greedy",
+    "Puzzle Acc": "26.10",
+    "Cell Acc": "43.39",
+    "No answer": "6.30",
+    "Easy Puzzle Acc": "77.50",
+    "Hard Puzzle Acc": "6.11",
+    "Total Puzzles": 1000,
+    "Reason Lens": "1333.07"
+  },
   {
     "Model": "gemini-1.5-pro-exp-0801",
     "Mode": "greedy",
@@ -472,6 +494,17 @@
     "Total Puzzles": 1000,
     "Reason Lens": "849.84"
   },
+  {
+    "Model": "Qwen2.5-7B-Instruct",
+    "Mode": "greedy",
+    "Puzzle Acc": "12.00",
+    "Cell Acc": "30.67",
+    "No answer": "9.50",
+    "Easy Puzzle Acc": "38.93",
+    "Hard Puzzle Acc": "1.53",
+    "Total Puzzles": 1000,
+    "Reason Lens": "850.93"
+  },
   {
     "Model": "Meta-Llama-3-8B-Instruct",
     "Mode": "greedy",
@@ -604,6 +637,17 @@
     "Total Puzzles": 1000,
     "Reason Lens": "718.43"
   },
+  {
+    "Model": "Qwen2.5-3B-Instruct",
+    "Mode": "greedy",
+    "Puzzle Acc": "4.80",
+    "Cell Acc": "11.44",
+    "No answer": "56.70",
+    "Easy Puzzle Acc": "17.14",
+    "Hard Puzzle Acc": "0.00",
+    "Total Puzzles": 1000,
+    "Reason Lens": "906.58"
+  },
   {
     "Model": "gemma-2-2b-it",
     "Mode": "greedy",
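Each added row uses the same flat record schema as the existing entries in zebra-grid.summary.json. As a minimal sketch (not part of this commit; the path is assumed from update_data.sh, and note that every metric except "Total Puzzles" is serialized as a string), the new Qwen2.5 rows can be loaded and ranked like this:

```python
import json

# Load the summary that update_data.sh places under ZeroEval-main/.
with open("ZeroEval-main/result_dirs/zebra-grid.summary.json") as f:
    summary = json.load(f)

# Metrics are stored as strings, so cast before sorting.
qwen_rows = [row for row in summary if row["Model"].startswith("Qwen2.5")]
for row in sorted(qwen_rows, key=lambda r: float(r["Puzzle Acc"]), reverse=True):
    print(f"{row['Model']}: {row['Puzzle Acc']}% puzzle acc, {row['No answer']}% no answer")
```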
_about_us.md
CHANGED
@@ -10,6 +10,6 @@ We are from [AllenAI](https://allenai.org/) (AI2), a non-profit research organiz
 ### Contact
 
 Please contact us in the following ways:
-- Github Issues/PRs: [https://github.com/
+- Github Issues/PRs: [https://github.com/WildEval/ZeroEval/](https://github.com/WildEval/ZeroEval/)
 - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
 
_header.md
CHANGED
@@ -2,5 +2,5 @@
 
 # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
 <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
-[📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/
+[📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
 
app.py
CHANGED
@@ -135,8 +135,8 @@ def _tab_explore():
 
 def _tab_submit():
     markdown_text = """
-    Please create an issue on our [Github](https://github.com/
-    If you would like to do local testing, please read our code [here](https://github.com/
+    Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
+    If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
     and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
     """
 
constants.py
CHANGED
@@ -4,7 +4,7 @@ from collections import OrderedDict
 DEFAULT_K = "∞"
 # DEFAULT_K = "1500"
 
-banner_url = "https://github.com/
+banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 
 # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
data_utils.py
CHANGED
@@ -49,7 +49,7 @@ def load_all_data():
         model_summary = json.load(f)
     model_names = [model["Model"] for model in model_summary]
     for model_name in model_names:
-        download_url = f"https://raw.githubusercontent.com/
+        download_url = f"https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
         output_file = os.path.join(result_dir, f"{model_name}.json")
         # mkdir -p result_dir if not exists
         os.makedirs(result_dir, exist_ok=True)
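For illustration, the new f-string template in data_utils.py resolves per model name as follows (a hypothetical snippet, not part of the commit):

```python
model_name = "Qwen2.5-72B-Instruct"  # one of the models added in this commit
download_url = f"https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
print(download_url)
# https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/Qwen2.5-72B-Instruct.json
```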
update_data.sh
CHANGED
@@ -1,5 +1,5 @@
-# download the file from https://raw.githubusercontent.com/
+# download the file from https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json
 # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
 mkdir -p ZeroEval-main/result_dirs/zebra-grid/
-wget https://raw.githubusercontent.com/
-wget https://raw.githubusercontent.com/
+wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
+wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json
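After running update_data.sh, a quick spot-check that the files landed where data_utils.py expects them — a hypothetical helper, assuming the paths above:

```python
import json
import os

# Both paths come from update_data.sh; adjust if your checkout differs.
summary_path = "ZeroEval-main/result_dirs/zebra-grid.summary.json"
model_path = "ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json"

for path in (summary_path, model_path):
    assert os.path.exists(path), f"missing download: {path}"

with open(summary_path) as f:
    models = {row["Model"] for row in json.load(f)}
print(f"summary OK: {len(models)} models listed")
```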