OpenEvalsModelDetails / experiments.json
Linker1907's picture
init
c5bf87e
{
"gpt-4o": {
"display_name": "gpt 4o",
"provider": "openai",
"open": false,
"size": "?B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025-02-26T10-14-16.106571"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025-02-26T10-14-16.106571"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025-02-26T10-14-16.106571"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025-02-26T10-14-16.106571"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": [
"prompt_level_strict_acc"
],
"tags": {
"latest": "2025-02-26T10-14-16.106571"
}
}
}
},
"claude-3-7-sonnet-20250219": {
"display_name": "Claude 3.7 Sonnet",
"provider": "anthropic",
"open": false,
"size": "?B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": [
"extractive_match"
],
"tags": {
"default": "2025-02-25T14-35-15.137825",
"thinking": "2025-03-05T10-14-44.802711"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": [
"extractive_match"
],
"tags": {
"default": "2025-02-25T12-43-49.294245",
"thinking": "2025-03-05T15-37-37.180318"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": [
"extractive_match"
],
"tags": {
"default": "2025-02-25T12-37-52.771787",
"thinking": "2025-03-05T12-39-13.627801"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": [
"extractive_match"
],
"tags": {
"default": "2025-02-25T12-37-52.771787",
"thinking": "2025-03-05T12-39-13.627801"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": [
"prompt_level_strict_acc"
],
"tags": {
"default": "2025-02-25T12-24-45.750753",
"thinking": "2025-03-05T15-37-37.180318"
}
}
}
},
"o3-mini-2025-01-31": {
"display_name": "o3-mini",
"provider": "openai",
"open": false,
"size": "?B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025-02-26T11-37-01.193437"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025-02-26T11-37-01.193437"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025-02-26T11-37-01.193437"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025-02-26T11-37-01.193437"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": [
"prompt_level_strict_acc"
],
"tags": {
"latest": "2025-02-26T11-37-01.193437"
}
}
}
},
"moonshotai/Moonlight-16B-A3B-Instruct": {
"display_name": "Moonlight",
"provider": "moonshotai",
"open": true,
"size": "16B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025_02_26T13_32_06.104265"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025_02_26T13_32_06.104265"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025_02_26T13_32_06.104265"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": [
"extractive_match"
],
"tags": {
"latest": "2025_02_26T13_32_06.104265"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": [
"prompt_level_strict_acc"
],
"tags": {
"latest": "2025_02_26T13_32_06.104265"
}
}
}
},
"meta-llama/Llama-3.3-70B-Instruct": {
"display_name": "Llama 3.3 70B",
"provider": "meta",
"open": true,
"size": "70B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-26T17-13-13.448521"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-26T17-13-13.448521"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-26T17-13-13.448521"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-26T17-13-13.448521"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": ["prompt_level_strict_acc"],
"tags": {
"latest": "2025-02-26T17-13-13.448521"
}
}
}
},
"deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {
"display_name": "DeepSeek Llama 70B",
"provider": "deepseek",
"open": true,
"size": "70B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-27T11-09-04.037858"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-27T11-09-04.037858"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-27T11-09-04.037858"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-27T11-09-04.037858"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": ["prompt_level_strict_acc"],
"tags": {
"latest": "2025-02-27T14-02-02.414381"
}
}
}
},
"qihoo360/TinyR1-32B-Preview": {
"display_name": "TinyR1 32B",
"provider": "qihoo360",
"open": true,
"size": "32B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-27T13-32-41.564652"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-27T13-32-41.564652"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-27T13-32-41.564652"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-02-27T13-32-41.564652"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": ["prompt_level_strict_acc"],
"tags": {
"latest": "2025-02-27T13-32-41.564652"
}
}
}
},
"openai/gpt-4.5-preview-2025-02-27": {
"display_name": "gpt 4.5",
"provider": "openai",
"open": false,
"size": "?B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-03T11-17-20.767980"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-03T11-35-34.241611"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-03T11-15-32.836958"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-03T11-15-32.836958"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": ["prompt_level_strict_acc"],
"tags": {
"latest": "2025-03-03T11-17-20.767980"
}
}
}
},
"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": {
"display_name": "DeepSeek Qwen 32B",
"provider": "deepseek",
"open": true,
"size": "32B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-03T14-51-09.849491"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-03T14-51-09.849491"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-03T14-51-09.849491"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-03T14-51-09.849491"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": ["prompt_level_strict_acc"],
"tags": {
"latest": "2025-03-03T15-06-10.838105"
}
}
}
},
"openai/deepseek-ai/DeepSeek-R1": {
"display_name": "DeepSeek R1",
"provider": "deepseek",
"open": true,
"size": "671B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-04T17-06-33.124766"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-04T17-06-33.124766"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-04T14-52-35.594174"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-04T14-25-05.009799"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": ["prompt_level_strict_acc"],
"tags": {
"latest": "2025-03-04T15-24-42.488745"
}
}
}
},
"Qwen/QwQ-32B": {
"display_name": "QwQ 32B",
"provider": "Qwen",
"open": true,
"size": "32B",
"benchmarks": {
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-10T11-47-46.303371"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-10T11-47-46.303371"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-10T10-36-07.886033"
}
},
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-10T10-36-07.886033"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": ["prompt_level_strict_acc"],
"tags": {
"latest": "2025-03-10T12-21-36.862202"
}
}
}
},
"google/gemma-3-1b-it": {
"display_name": "Gemma 3",
"provider": "google",
"open": true,
"size": "1B",
"benchmarks": {
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-18T14-25-56.178612"
}
}
}
},
"google/gemma-3-12b-it": {
"display_name": "Gemma 3 12B",
"provider": "google",
"open": true,
"size": "12B",
"benchmarks": {
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-18T14-36-23.368081"
}
}
}
},
"google/gemma-3-27b-it": {
"display_name": "Gemma 3 27B",
"provider": "google",
"open": true,
"size": "27B",
"benchmarks": {
"aime_25": {
"subset": "lighteval|aime25|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-18T14-41-33.181467"
}
},
"aime_24": {
"subset": "lighteval|aime24|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-18T15-11-34.174477"
}
},
"ifeval": {
"subset": "extended|ifeval|0",
"metrics": ["prompt_level_strict_acc"],
"tags": {
"latest": "2025-03-18T15-20-14.979833"
}
},
"gpqa_diamond": {
"subset": "lighteval|gpqa:diamond|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-18T15-20-14.979833"
}
},
"math_500": {
"subset": "lighteval|math_500|0",
"metrics": ["extractive_match"],
"tags": {
"latest": "2025-03-18T15-20-14.979833"
}
}
}
}
}