|
{ |
|
"config_general": { |
|
"lighteval_sha": "?", |
|
"num_fewshot_seeds": 1, |
|
"max_samples": null, |
|
"job_id": 0, |
|
"start_time": 1697.241612831, |
|
"end_time": 2750.279500025, |
|
"total_evaluation_time_secondes": "1053.0378871939997", |
|
"model_name": "Qwen/Qwen3-4B", |
|
"model_sha": "82d62bb073771e7a1ea59435f548908540217d1f", |
|
"model_dtype": null, |
|
"model_size": "14.98 GB", |
|
"generation_parameters": { |
|
"early_stopping": null, |
|
"repetition_penalty": null, |
|
"frequency_penalty": null, |
|
"length_penalty": null, |
|
"presence_penalty": null, |
|
"max_new_tokens": null, |
|
"min_new_tokens": null, |
|
"seed": null, |
|
"stop_tokens": null, |
|
"temperature": null, |
|
"top_k": null, |
|
"min_p": null, |
|
"top_p": null, |
|
"truncate_prompt": null, |
|
"response_format": null |
|
} |
|
}, |
|
"results": { |
|
"leaderboard|truthfulqa:mc|0": { |
|
"truthfulqa_mc1": 0.3671970624235006, |
|
"truthfulqa_mc1_stderr": 0.01687480500145318, |
|
"truthfulqa_mc2": 0.5481532716523813, |
|
"truthfulqa_mc2_stderr": 0.015802558799916978 |
|
}, |
|
"all": { |
|
"truthfulqa_mc1": 0.3671970624235006, |
|
"truthfulqa_mc1_stderr": 0.01687480500145318, |
|
"truthfulqa_mc2": 0.5481532716523813, |
|
"truthfulqa_mc2_stderr": 0.015802558799916978 |
|
} |
|
}, |
|
"versions": { |
|
"leaderboard|truthfulqa:mc|0": 0 |
|
}, |
|
"config_tasks": { |
|
"leaderboard|truthfulqa:mc": { |
|
"name": "truthfulqa:mc", |
|
"prompt_function": "truthful_qa_multiple_choice", |
|
"hf_repo": "truthful_qa", |
|
"hf_subset": "multiple_choice", |
|
"metric": [ |
|
{ |
|
"metric_name": [ |
|
"truthfulqa_mc1", |
|
"truthfulqa_mc2" |
|
], |
|
"higher_is_better": { |
|
"truthfulqa_mc1": true, |
|
"truthfulqa_mc2": true |
|
}, |
|
"category": "8", |
|
"use_case": "1", |
|
"sample_level_fn": "truthfulqa_mc_metrics", |
|
"corpus_level_fn": { |
|
"truthfulqa_mc1": "mean", |
|
"truthfulqa_mc2": "mean" |
|
} |
|
} |
|
], |
|
"hf_revision": null, |
|
"hf_filter": null, |
|
"hf_avail_splits": [ |
|
"validation" |
|
], |
|
"trust_dataset": true, |
|
"evaluation_splits": [ |
|
"validation" |
|
], |
|
"few_shots_split": null, |
|
"few_shots_select": null, |
|
"generation_size": -1, |
|
"generation_grammar": null, |
|
"stop_sequence": [ |
|
"\n" |
|
], |
|
"num_samples": null, |
|
"suite": [ |
|
"leaderboard" |
|
], |
|
"original_num_docs": 817, |
|
"effective_num_docs": 817, |
|
"must_remove_duplicate_docs": false, |
|
"version": 0 |
|
} |
|
}, |
|
"summary_tasks": { |
|
"leaderboard|truthfulqa:mc|0": { |
|
"hashes": { |
|
"hash_examples": "36a6d90e75d92d4a", |
|
"hash_full_prompts": "36a6d90e75d92d4a", |
|
"hash_input_tokens": "38466dd693501854", |
|
"hash_cont_tokens": "cceeffa697353554" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 817, |
|
"padded": 9186, |
|
"non_padded": 810, |
|
"effective_few_shots": 0.0, |
|
"num_truncated_few_shots": 0 |
|
} |
|
}, |
|
"summary_general": { |
|
"hashes": { |
|
"hash_examples": "aed1dfc67e53d0f2", |
|
"hash_full_prompts": "aed1dfc67e53d0f2", |
|
"hash_input_tokens": "48256f08799e5a5e", |
|
"hash_cont_tokens": "ed734856e6e7940f" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 817, |
|
"padded": 9186, |
|
"non_padded": 810, |
|
"num_truncated_few_shots": 0 |
|
} |
|
} |