Thanks to RedHatAI for the recipe.
vllm (pretrained=/root/autodl-tmp/Harbinger-24B,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.932 | ± | 0.016 |
| | | strict-match | 5 | exact_match | ↑ | 0.932 | ± | 0.016 |

vllm (pretrained=/root/autodl-tmp/Harbinger-24B,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.918 | ± | 0.0123 |
| | | strict-match | 5 | exact_match | ↑ | 0.916 | ± | 0.0124 |


| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7965 | ± | 0.0133 |
| - humanities | 2 | none | | acc | ↑ | 0.8410 | ± | 0.0253 |
| - other | 2 | none | | acc | ↑ | 0.8051 | ± | 0.0281 |
| - social sciences | 2 | none | | acc | ↑ | 0.8500 | ± | 0.0263 |
| - stem | 2 | none | | acc | ↑ | 0.7263 | ± | 0.0254 |

vllm (pretrained=/root/autodl-tmp/86-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.904 | ± | 0.0187 |
| | | strict-match | 5 | exact_match | ↑ | 0.904 | ± | 0.0187 |

vllm (pretrained=/root/autodl-tmp/86-256-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.896 | ± | 0.0193 |
| | | strict-match | 5 | exact_match | ↑ | 0.892 | ± | 0.0197 |

vllm (pretrained=/root/autodl-tmp/87-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.912 | ± | 0.018 |
| | | strict-match | 5 | exact_match | ↑ | 0.912 | ± | 0.018 |

vllm (pretrained=/root/autodl-tmp/87-128-3096,add_bos_token=true,max_model_len=3048,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7719 | ± | 0.0137 |
| - humanities | 2 | none | | acc | ↑ | 0.8103 | ± | 0.0262 |
| - other | 2 | none | | acc | ↑ | 0.8051 | ± | 0.0278 |
| - social sciences | 2 | none | | acc | ↑ | 0.8222 | ± | 0.0282 |
| - stem | 2 | none | | acc | ↑ | 0.6912 | ± | 0.0263 |

vllm (pretrained=/root/autodl-tmp/87-256-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.916 | ± | 0.0176 |
| | | strict-match | 5 | exact_match | ↑ | 0.916 | ± | 0.0176 |

vllm (pretrained=/root/autodl-tmp/87-256-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.896 | ± | 0.0137 |
| | | strict-match | 5 | exact_match | ↑ | 0.888 | ± | 0.0141 |

vllm (pretrained=/root/autodl-tmp/87-256-3096,add_bos_token=true,max_model_len=3048,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7731 | ± | 0.0136 |
| - humanities | 2 | none | | acc | ↑ | 0.8051 | ± | 0.0261 |
| - other | 2 | none | | acc | ↑ | 0.8051 | ± | 0.0282 |
| - social sciences | 2 | none | | acc | ↑ | 0.8278 | ± | 0.0273 |
| - stem | 2 | none | | acc | ↑ | 0.6947 | ± | 0.0260 |

vllm (pretrained=/root/autodl-tmp/875-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.916 | ± | 0.0176 |
| | | strict-match | 5 | exact_match | ↑ | 0.912 | ± | 0.0180 |

vllm (pretrained=/root/autodl-tmp/875-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.902 | ± | 0.0133 |
| | | strict-match | 5 | exact_match | ↑ | 0.898 | ± | 0.0135 |

vllm (pretrained=/root/autodl-tmp/88-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.924 | ± | 0.0168 |
| | | strict-match | 5 | exact_match | ↑ | 0.924 | ± | 0.0168 |

vllm (pretrained=/root/autodl-tmp/88-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.914 | ± | 0.0126 |
| | | strict-match | 5 | exact_match | ↑ | 0.912 | ± | 0.0127 |


| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7731 | ± | 0.0138 |
| - humanities | 2 | none | | acc | ↑ | 0.8000 | ± | 0.0269 |
| - other | 2 | none | | acc | ↑ | 0.7897 | ± | 0.0286 |
| - social sciences | 2 | none | | acc | ↑ | 0.8333 | ± | 0.0273 |
| - stem | 2 | none | | acc | ↑ | 0.7053 | ± | 0.0263 |

vllm (pretrained=/root/autodl-tmp/885-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.936 | ± | 0.0155 |
| | | strict-match | 5 | exact_match | ↑ | 0.932 | ± | 0.0160 |

vllm (pretrained=/root/autodl-tmp/885-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.918 | ± | 0.0123 |
| | | strict-match | 5 | exact_match | ↑ | 0.910 | ± | 0.0128 |

vllm (pretrained=/root/autodl-tmp/885-128-3096,add_bos_token=true,max_model_len=3048,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7719 | ± | 0.0137 |
| - humanities | 2 | none | | acc | ↑ | 0.7897 | ± | 0.0269 |
| - other | 2 | none | | acc | ↑ | 0.8205 | ± | 0.0271 |
| - social sciences | 2 | none | | acc | ↑ | 0.8278 | ± | 0.0275 |
| - stem | 2 | none | | acc | ↑ | 0.6912 | ± | 0.0264 |

vllm (pretrained=/root/autodl-tmp/885-256-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.916 | ± | 0.0176 |
| | | strict-match | 5 | exact_match | ↑ | 0.916 | ± | 0.0176 |

vllm (pretrained=/root/autodl-tmp/885-256-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.910 | ± | 0.0128 |
| | | strict-match | 5 | exact_match | ↑ | 0.904 | ± | 0.0132 |


| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7848 | ± | 0.0135 |
| - humanities | 2 | none | | acc | ↑ | 0.7949 | ± | 0.0269 |
| - other | 2 | none | | acc | ↑ | 0.8154 | ± | 0.0277 |
| - social sciences | 2 | none | | acc | ↑ | 0.8222 | ± | 0.0282 |
| - stem | 2 | none | | acc | ↑ | 0.7333 | ± | 0.0252 |

vllm (pretrained=/root/autodl-tmp/89-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.912 | ± | 0.018 |
| | | strict-match | 5 | exact_match | ↑ | 0.912 | ± | 0.018 |

vllm (pretrained=/root/autodl-tmp/90-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.932 | ± | 0.016 |
| | | strict-match | 5 | exact_match | ↑ | 0.932 | ± | 0.016 |

vllm (pretrained=/root/autodl-tmp/90-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.916 | ± | 0.0124 |
| | | strict-match | 5 | exact_match | ↑ | 0.914 | ± | 0.0126 |


| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7977 | ± | 0.0132 |
| - humanities | 2 | none | | acc | ↑ | 0.8410 | ± | 0.0253 |
| - other | 2 | none | | acc | ↑ | 0.8154 | ± | 0.0274 |
| - social sciences | 2 | none | | acc | ↑ | 0.8278 | ± | 0.0275 |
| - stem | 2 | none | | acc | ↑ | 0.7368 | ± | 0.0250 |