yuchenlin committed on
Commit
1f16e7a
•
1 Parent(s): db67fbd

rename the github link

Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json CHANGED
@@ -175,6 +175,17 @@
175
  "Total Puzzles": 1000,
176
  "Reason Lens": "855.72"
177
  },
 
 
 
 
 
 
 
 
 
 
 
178
  {
179
  "Model": "gpt-4-turbo-2024-04-09",
180
  "Mode": "sampling",
@@ -186,6 +197,17 @@
186
  "Total Puzzles": 1000,
187
  "Reason Lens": "1165.90"
188
  },
 
 
 
 
 
 
 
 
 
 
 
189
  {
190
  "Model": "gemini-1.5-pro-exp-0801",
191
  "Mode": "greedy",
@@ -472,6 +494,17 @@
472
  "Total Puzzles": 1000,
473
  "Reason Lens": "849.84"
474
  },
 
 
 
 
 
 
 
 
 
 
 
475
  {
476
  "Model": "Meta-Llama-3-8B-Instruct",
477
  "Mode": "greedy",
@@ -604,6 +637,17 @@
604
  "Total Puzzles": 1000,
605
  "Reason Lens": "718.43"
606
  },
 
 
 
 
 
 
 
 
 
 
 
607
  {
608
  "Model": "gemma-2-2b-it",
609
  "Mode": "greedy",
 
175
  "Total Puzzles": 1000,
176
  "Reason Lens": "855.72"
177
  },
178
+ {
179
+ "Model": "Qwen2.5-72B-Instruct",
180
+ "Mode": "greedy",
181
+ "Puzzle Acc": "26.60",
182
+ "Cell Acc": "40.92",
183
+ "No answer": "11.90",
184
+ "Easy Puzzle Acc": "76.43",
185
+ "Hard Puzzle Acc": "7.22",
186
+ "Total Puzzles": 1000,
187
+ "Reason Lens": "1795.90"
188
+ },
189
  {
190
  "Model": "gpt-4-turbo-2024-04-09",
191
  "Mode": "sampling",
 
197
  "Total Puzzles": 1000,
198
  "Reason Lens": "1165.90"
199
  },
200
+ {
201
+ "Model": "Qwen2.5-32B-Instruct",
202
+ "Mode": "greedy",
203
+ "Puzzle Acc": "26.10",
204
+ "Cell Acc": "43.39",
205
+ "No answer": "6.30",
206
+ "Easy Puzzle Acc": "77.50",
207
+ "Hard Puzzle Acc": "6.11",
208
+ "Total Puzzles": 1000,
209
+ "Reason Lens": "1333.07"
210
+ },
211
  {
212
  "Model": "gemini-1.5-pro-exp-0801",
213
  "Mode": "greedy",
 
494
  "Total Puzzles": 1000,
495
  "Reason Lens": "849.84"
496
  },
497
+ {
498
+ "Model": "Qwen2.5-7B-Instruct",
499
+ "Mode": "greedy",
500
+ "Puzzle Acc": "12.00",
501
+ "Cell Acc": "30.67",
502
+ "No answer": "9.50",
503
+ "Easy Puzzle Acc": "38.93",
504
+ "Hard Puzzle Acc": "1.53",
505
+ "Total Puzzles": 1000,
506
+ "Reason Lens": "850.93"
507
+ },
508
  {
509
  "Model": "Meta-Llama-3-8B-Instruct",
510
  "Mode": "greedy",
 
637
  "Total Puzzles": 1000,
638
  "Reason Lens": "718.43"
639
  },
640
+ {
641
+ "Model": "Qwen2.5-3B-Instruct",
642
+ "Mode": "greedy",
643
+ "Puzzle Acc": "4.80",
644
+ "Cell Acc": "11.44",
645
+ "No answer": "56.70",
646
+ "Easy Puzzle Acc": "17.14",
647
+ "Hard Puzzle Acc": "0.00",
648
+ "Total Puzzles": 1000,
649
+ "Reason Lens": "906.58"
650
+ },
651
  {
652
  "Model": "gemma-2-2b-it",
653
  "Mode": "greedy",
_about_us.md CHANGED
@@ -10,6 +10,6 @@ We are from [AllenAI](https://allenai.org/) (AI2), a non-profit research organiz
10
  ### Contact
11
 
12
  Please contact us in the following ways:
13
- - Github Issues/PRs: [https://github.com/yuchenlin/ZeroEval/](https://github.com/yuchenlin/ZeroEval/)
14
  - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
15
 
 
10
  ### Contact
11
 
12
  Please contact us in the following ways:
13
+ - Github Issues/PRs: [https://github.com/WildEval/ZeroEval/](https://github.com/WildEval/ZeroEval/)
14
  - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
15
 
_header.md CHANGED
@@ -2,5 +2,5 @@
2
 
3
  # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
4
  <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
5
- [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/yuchenlin/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
6
 
 
2
 
3
  # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
4
  <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
5
+ [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
6
 
app.py CHANGED
@@ -135,8 +135,8 @@ def _tab_explore():
135
 
136
  def _tab_submit():
137
  markdown_text = """
138
- Please create an issue on our [Github](https://github.com/yuchenlin/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
139
- If you would like to do local testing, please read our code [here](https://github.com/yuchenlin/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
140
  and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
141
  """
142
 
 
135
 
136
  def _tab_submit():
137
  markdown_text = """
138
+ Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
139
+ If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
140
  and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
141
  """
142
 
constants.py CHANGED
@@ -4,7 +4,7 @@ from collections import OrderedDict
4
  DEFAULT_K = "∞"
5
  # DEFAULT_K = "1500"
6
 
7
- banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
8
  BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
9
 
10
  # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
 
4
  DEFAULT_K = "∞"
5
  # DEFAULT_K = "1500"
6
 
7
+ banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
8
  BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
9
 
10
  # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
data_utils.py CHANGED
@@ -49,7 +49,7 @@ def load_all_data():
49
  model_summary = json.load(f)
50
  model_names = [model["Model"] for model in model_summary]
51
  for model_name in model_names:
52
- download_url = f"https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
53
  output_file = os.path.join(result_dir, f"{model_name}.json")
54
  # mkdir -p result_dir if not exists
55
  os.makedirs(result_dir, exist_ok=True)
 
49
  model_summary = json.load(f)
50
  model_names = [model["Model"] for model in model_summary]
51
  for model_name in model_names:
52
+ download_url = f"https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
53
  output_file = os.path.join(result_dir, f"{model_name}.json")
54
  # mkdir -p result_dir if not exists
55
  os.makedirs(result_dir, exist_ok=True)
update_data.sh CHANGED
@@ -1,5 +1,5 @@
1
- # download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
2
  # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
3
  mkdir -p ZeroEval-main/result_dirs/zebra-grid/
4
- wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
5
- wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json
 
1
+ # download the file from https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json
2
  # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
3
  mkdir -p ZeroEval-main/result_dirs/zebra-grid/
4
+ wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
5
+ wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json