{"created_at": "2025-05-08T23:34:26.557449", "global_step": 2000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4257679180887372, "acc_stderr,none": 0.0144494642788688, "acc_norm,none": 0.4658703071672355, "acc_norm_stderr,none": 0.014577311315231099}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7289562289562289, "acc_stderr,none": 0.0091209197417606, "acc_norm,none": 0.7146464646464646, "acc_norm_stderr,none": 0.00926628058499775}, "boolq": {"alias": "boolq", "acc,none": 0.7318042813455657, "acc_stderr,none": 0.007748469592030345}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2989352989352989, "acc_stderr,none": 0.01310653028279809}, "copa": {"alias": "copa", "acc,none": 0.8, "acc_stderr,none": 0.040201512610368445}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4737104162517427, "acc_stderr,none": 0.004982879340691398, "acc_norm,none": 0.6406094403505278, "acc_norm_stderr,none": 0.004788412062375702}, "mmlu": {"acc,none": 0.34959407491810285, "acc_stderr,none": 0.003986813763121511, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.32539851222104144, "acc_stderr,none": 0.0067554212334649385, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235172}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.37575757575757573, "acc_stderr,none": 0.037818873532059816}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.44607843137254904, "acc_stderr,none": 0.03488845451304974}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5063291139240507, "acc_stderr,none": 0.03254462010767859}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.04481137755942469}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04668408033024931}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.035590395316173425}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3208092485549133, "acc_stderr,none": 0.0251310002336479}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2212290502793296, "acc_stderr,none": 0.013882164598887288}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.36977491961414793, "acc_stderr,none": 0.027417996705631005}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.39197530864197533, "acc_stderr,none": 0.02716368603827124}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30834419817470665, "acc_stderr,none": 0.011794833789715334}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3742690058479532, "acc_stderr,none": 0.037116011853894806}, "mmlu_other": {"acc,none": 0.38461538461538464, "acc_stderr,none": 0.0087012710587326, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3849056603773585, "acc_stderr,none": 0.02994649856769995}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3815028901734104, "acc_stderr,none": 0.03703851193099521}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.45739910313901344, "acc_stderr,none": 0.03343577705583065}, "mmlu_management": {"alias": " - management", "acc,none": 0.4563106796116505, "acc_stderr,none": 0.049318019942204146}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4358974358974359, "acc_stderr,none": 0.03248577511578401}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.46, "acc_stderr,none": 0.05009082659620332}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3895274584929757, "acc_stderr,none": 0.017438082556264594}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.027684181883302898}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.30141843971631205, "acc_stderr,none": 0.027374128882631146}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3602941176470588, "acc_stderr,none": 0.029163128570670736}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.037400593820293204}, "mmlu_social_sciences": {"acc,none": 0.3737406564835879, "acc_stderr,none": 0.008670878696396921, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.0433913832257986}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.37823834196891193, "acc_stderr,none": 0.03499807276193339}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.33076923076923076, "acc_stderr,none": 0.023854795680971128}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3403361344537815, "acc_stderr,none": 0.030778057422931673}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.42752293577981654, "acc_stderr,none": 0.02121091020430043}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.366412213740458, "acc_stderr,none": 0.04225875451969638}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.019431775677037313}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.04494290866252088}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.30612244897959184, "acc_stderr,none": 0.029504896454595947}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.527363184079602, "acc_stderr,none": 0.035302355173346824}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_stem": {"acc,none": 0.3276244846178243, "acc_stderr,none": 0.008277320689787858, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.375, "acc_stderr,none": 0.039397364351956274}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4236111111111111, "acc_stderr,none": 0.04132125019723369}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252606}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993177}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.45, "acc_stderr,none": 0.05}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534443}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2751322751322751, "acc_stderr,none": 0.023000086859068642}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.43870967741935485, "acc_stderr,none": 0.028229497320317216}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678241}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.024388430433987657}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763744}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.03350991604696042}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.042878587513404544}, "mmlu_pro": {"exact_match,custom-extract": 0.18201462765957446, "exact_match_stderr,custom-extract": 0.0034771449900885703, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3291492329149233, "exact_match_stderr,custom-extract": 0.017561146780265928}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16983523447401774, "exact_match_stderr,custom-extract": 0.013376205653007208}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1068904593639576, "exact_match_stderr,custom-extract": 0.009187355756744656}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.22195121951219512, "exact_match_stderr,custom-extract": 0.020548045890068298}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2476303317535545, "exact_match_stderr,custom-extract": 0.014866330095923884}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.12899896800825594, "exact_match_stderr,custom-extract": 0.010773697418009065}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22860635696821516, "exact_match_stderr,custom-extract": 0.014691669532004209}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1968503937007874, "exact_match_stderr,custom-extract": 0.020397388648694077}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1444141689373297, "exact_match_stderr,custom-extract": 0.010598401112152015}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14507772020725387, "exact_match_stderr,custom-extract": 0.009585103230059955}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1634199134199134, "exact_match_stderr,custom-extract": 0.01217041531796006}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16633266533066132, "exact_match_stderr,custom-extract": 0.01668670139852614}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15473441108545036, "exact_match_stderr,custom-extract": 0.010038127358043917}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2781954887218045, "exact_match_stderr,custom-extract": 0.015872877950292862}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.01996610354027947, "acc_norm,none": 0.386, "acc_norm_stderr,none": 0.02179352921928117}, "piqa": {"alias": "piqa", "acc,none": 0.720892274211099, "acc_stderr,none": 0.010465657948498228, "acc_norm,none": 0.7230685527747551, "acc_norm_stderr,none": 0.01044049996933454}, "race": {"alias": "race", "acc,none": 0.36650717703349284, "acc_stderr,none": 0.014912890943719231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45138178096212894, "acc_stderr,none": 0.01126045668162444}, "winogrande": {"alias": "winogrande", "acc,none": 0.6503551696921863, "acc_stderr,none": 0.013402073680850503}} {"created_at": "2025-05-09T01:28:41.634251", "global_step": 4000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4206484641638225, "acc_stderr,none": 0.014426211252508396, "acc_norm,none": 0.45307167235494883, "acc_norm_stderr,none": 0.01454689205200563}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7196969696969697, "acc_stderr,none": 0.009216306864088033, "acc_norm,none": 0.7079124579124579, "acc_norm_stderr,none": 0.00933070561656907}, "boolq": {"alias": "boolq", "acc,none": 0.7685015290519878, "acc_stderr,none": 0.007377156064425054}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.3202293202293202, "acc_stderr,none": 0.013357704926272657}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4737104162517427, "acc_stderr,none": 0.0049828793406913995, "acc_norm,none": 0.6374228241386178, "acc_norm_stderr,none": 0.004797616754372304}, "mmlu": {"acc,none": 0.36227033186155816, "acc_stderr,none": 0.004005752094084227, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3324123273113709, "acc_stderr,none": 0.0067679612722847775, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.03395490020856111}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.37575757575757573, "acc_stderr,none": 0.03781887353205982}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.45588235294117646, "acc_stderr,none": 0.03495624522015473}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5527426160337553, "acc_stderr,none": 0.03236564251614192}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4297520661157025, "acc_stderr,none": 0.04519082021319773}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.047500773411999854}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3128834355828221, "acc_stderr,none": 0.036429145782924055}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.02519018132760841}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.22905027932960895, "acc_stderr,none": 0.014054314935614553}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.40192926045016075, "acc_stderr,none": 0.02784647600593048}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.02723741509459248}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3135593220338983, "acc_stderr,none": 0.011849234291459329}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03615507630310935}, "mmlu_other": {"acc,none": 0.3994206630189894, "acc_stderr,none": 0.008747914827807462, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.4377358490566038, "acc_stderr,none": 0.030533338430467512}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3872832369942196, "acc_stderr,none": 0.03714325906302065}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4484304932735426, "acc_stderr,none": 0.033378837362550984}, "mmlu_management": {"alias": " - management", "acc,none": 0.4563106796116505, "acc_stderr,none": 0.049318019942204146}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.46153846153846156, "acc_stderr,none": 0.032659033811861936}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.384418901660281, "acc_stderr,none": 0.01739568874281962}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.39869281045751637, "acc_stderr,none": 0.02803609227389177}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.02746470844202214}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.4227941176470588, "acc_stderr,none": 0.030008562845003476}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.39759036144578314, "acc_stderr,none": 0.038099730845402184}, "mmlu_social_sciences": {"acc,none": 0.40201494962625933, "acc_stderr,none": 0.008753318520333115, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.04227054451232199}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35353535353535354, "acc_stderr,none": 0.03406086723547153}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.42487046632124353, "acc_stderr,none": 0.035674713352125395}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3564102564102564, "acc_stderr,none": 0.0242831405294673}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3697478991596639, "acc_stderr,none": 0.031357095996135904}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.5119266055045871, "acc_stderr,none": 0.021431223617362227}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3709150326797386, "acc_stderr,none": 0.01954210156485412}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.38181818181818183, "acc_stderr,none": 0.046534298079135075}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3183673469387755, "acc_stderr,none": 0.02982253379398207}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5373134328358209, "acc_stderr,none": 0.03525675167467974}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_stem": {"acc,none": 0.33143038376149697, "acc_stderr,none": 0.008305453682363425, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3925925925925926, "acc_stderr,none": 0.04218506215368879}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3618421052631579, "acc_stderr,none": 0.03910525752849724}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4513888888888889, "acc_stderr,none": 0.04161402398403279}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.04533838195929775}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.43, "acc_stderr,none": 0.049756985195624284}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.37872340425531914, "acc_stderr,none": 0.03170995606040655}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277696}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633345}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.44516129032258067, "acc_stderr,none": 0.028272410186214906}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655078}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969654}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.375, "acc_stderr,none": 0.033016908987210894}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.1815159574468085, "exact_match_stderr,custom-extract": 0.0034691711332705353, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3333333333333333, "exact_match_stderr,custom-extract": 0.017617214086056418}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1596958174904943, "exact_match_stderr,custom-extract": 0.013049741978046024}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10247349823321555, "exact_match_stderr,custom-extract": 0.009017748507579058}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.20243902439024392, "exact_match_stderr,custom-extract": 0.019868606646141387}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2381516587677725, "exact_match_stderr,custom-extract": 0.014670579907447287}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.14654282765737875, "exact_match_stderr,custom-extract": 0.011366728093938227}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2567237163814181, "exact_match_stderr,custom-extract": 0.015282595032542022}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.2125984251968504, "exact_match_stderr,custom-extract": 0.02098873976311752}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12806539509536785, "exact_match_stderr,custom-extract": 0.010075381773702268}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.1391561806069578, "exact_match_stderr,custom-extract": 0.009419905559243678}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17532467532467533, "exact_match_stderr,custom-extract": 0.01251590249750937}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16432865731462926, "exact_match_stderr,custom-extract": 0.016605797464661214}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.14857582755966128, "exact_match_stderr,custom-extract": 0.009872103972550979}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2756892230576441, "exact_match_stderr,custom-extract": 0.01582862563189321}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.019966103540279462, "acc_norm,none": 0.386, "acc_norm_stderr,none": 0.021793529219281165}, "piqa": {"alias": "piqa", "acc,none": 0.719804134929271, "acc_stderr,none": 0.010478122015577086, "acc_norm,none": 0.720892274211099, "acc_norm_stderr,none": 0.01046565794849823}, "race": {"alias": "race", "acc,none": 0.3712918660287081, "acc_stderr,none": 0.014953126515089411}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45752302968270214, "acc_stderr,none": 0.011273168825920714}, "winogrande": {"alias": "winogrande", "acc,none": 0.6464088397790055, "acc_stderr,none": 0.013436541262599954}} {"created_at": "2025-05-09T03:17:03.325029", "global_step": 6000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.41552901023890787, "acc_stderr,none": 0.014401366641216396, "acc_norm,none": 0.4513651877133106, "acc_norm_stderr,none": 0.014542104569955262}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7239057239057239, "acc_stderr,none": 0.009173559873835264, "acc_norm,none": 0.7138047138047138, "acc_norm_stderr,none": 0.009274470774627726}, "boolq": {"alias": "boolq", "acc,none": 0.7596330275229358, "acc_stderr,none": 0.007473634518428275}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.3071253071253071, "acc_stderr,none": 0.013207032181990902}, "copa": {"alias": "copa", "acc,none": 0.8, "acc_stderr,none": 0.040201512610368445}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47679745070703045, "acc_stderr,none": 0.004984405935541095, "acc_norm,none": 0.6385182234614618, "acc_norm_stderr,none": 0.0047944784263826154}, "mmlu": {"acc,none": 0.34824099131177894, "acc_stderr,none": 0.003980165292883981, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3343251859723698, "acc_stderr,none": 0.00678938399029298, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.03512207412302052}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4303030303030303, "acc_stderr,none": 0.03866225962879077}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4852941176470588, "acc_stderr,none": 0.03507793834791324}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5063291139240507, "acc_stderr,none": 0.03254462010767859}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.371900826446281, "acc_stderr,none": 0.044120158066245044}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.04691521224077742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3374233128834356, "acc_stderr,none": 0.03714908409935575}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.025070713719153172}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23016759776536314, "acc_stderr,none": 0.01407833925342582}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3633440514469453, "acc_stderr,none": 0.02731684767419272}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.41975308641975306, "acc_stderr,none": 0.027460099557005135}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.31747066492829207, "acc_stderr,none": 0.011888892068809312}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.38596491228070173, "acc_stderr,none": 0.03733756969066165}, "mmlu_other": {"acc,none": 0.38590280012874156, "acc_stderr,none": 0.008697860900854807, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3886792452830189, "acc_stderr,none": 0.030000485448675986}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3583815028901734, "acc_stderr,none": 0.036563436533531585}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4439461883408072, "acc_stderr,none": 0.03334625674242728}, "mmlu_management": {"alias": " - management", "acc,none": 0.44660194174757284, "acc_stderr,none": 0.04922424153458933}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.47435897435897434, "acc_stderr,none": 0.03271298896811159}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.45, "acc_stderr,none": 0.04999999999999999}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.41379310344827586, "acc_stderr,none": 0.017612204084663772}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.02736359328468494}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340461}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34191176470588236, "acc_stderr,none": 0.028814722422254177}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3855421686746988, "acc_stderr,none": 0.0378913442461155}, "mmlu_social_sciences": {"acc,none": 0.371140721481963, "acc_stderr,none": 0.0086640160229303, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537317}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30808080808080807, "acc_stderr,none": 0.032894773300986155}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.40932642487046633, "acc_stderr,none": 0.03548608168860806}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3282051282051282, "acc_stderr,none": 0.023807633198657266}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.02995382389188704}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.42568807339449544, "acc_stderr,none": 0.0211992359724708}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.01941253924203216}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.42727272727272725, "acc_stderr,none": 0.04738198703545483}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3183673469387755, "acc_stderr,none": 0.029822533793982073}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4925373134328358, "acc_stderr,none": 0.03535140084276719}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.309546463685379, "acc_stderr,none": 0.00816118500374012, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4148148148148148, "acc_stderr,none": 0.042561937679014075}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34868421052631576, "acc_stderr,none": 0.0387813988879761}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3819444444444444, "acc_stderr,none": 0.040629907841466674}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.04158307533083286}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3659574468085106, "acc_stderr,none": 0.031489558297455304}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491841}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.42258064516129035, "acc_stderr,none": 0.02810096472427264}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233483}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.025040443877000673}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24537037037037038, "acc_stderr,none": 0.029346665094372944}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04547960999764376}, "mmlu_pro": {"exact_match,custom-extract": 0.17993683510638298, "exact_match_stderr,custom-extract": 0.003457806017944553, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.33751743375174337, "exact_match_stderr,custom-extract": 0.01767171996958689}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.15462610899873258, "exact_match_stderr,custom-extract": 0.01287961022947728}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10335689045936396, "exact_match_stderr,custom-extract": 0.009052076648374286}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1878048780487805, "exact_match_stderr,custom-extract": 0.01931176502893171}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23459715639810427, "exact_match_stderr,custom-extract": 0.014594614234695612}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.14860681114551083, "exact_match_stderr,custom-extract": 0.01143264622099072}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.23471882640586797, "exact_match_stderr,custom-extract": 0.014827688336877391}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.2020997375328084, "exact_match_stderr,custom-extract": 0.02059991272477544}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11716621253405994, "exact_match_stderr,custom-extract": 0.009697154745523061}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14803849000740193, "exact_match_stderr,custom-extract": 0.009665635787320598}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1774891774891775, "exact_match_stderr,custom-extract": 0.012576387157546684}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15030060120240482, "exact_match_stderr,custom-extract": 0.016013945383577262}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15550423402617397, "exact_match_stderr,custom-extract": 0.010058483346469749}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2832080200501253, "exact_match_stderr,custom-extract": 0.015959533104077266}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566065, "acc_norm,none": 0.39, "acc_norm_stderr,none": 0.021834685869369215}, "piqa": {"alias": "piqa", "acc,none": 0.7257889009793254, "acc_stderr,none": 0.010408618664933382, "acc_norm,none": 0.7257889009793254, "acc_norm_stderr,none": 0.010408618664933384}, "race": {"alias": "race", "acc,none": 0.3626794258373206, "acc_stderr,none": 0.014879563111287502}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4524053224155578, "acc_stderr,none": 0.011262695440459564}, "winogrande": {"alias": "winogrande", "acc,none": 0.6495659037095501, "acc_stderr,none": 0.013409047676670173}} {"created_at": "2025-05-09T05:06:39.903679", "global_step": 8000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.41552901023890787, "acc_stderr,none": 0.014401366641216393, "acc_norm,none": 0.4564846416382253, "acc_norm_stderr,none": 0.014555949760496437}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7201178451178452, "acc_stderr,none": 0.009212077524656534, "acc_norm,none": 0.7015993265993266, "acc_norm_stderr,none": 0.009388855914040432}, "boolq": {"alias": "boolq", "acc,none": 0.744954128440367, "acc_stderr,none": 0.007623711520701574}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.33005733005733007, "acc_stderr,none": 0.013462744249660544}, "copa": {"alias": "copa", "acc,none": 0.79, "acc_stderr,none": 0.04093601807403326}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47649870543716394, "acc_stderr,none": 0.0049842665430531174, "acc_norm,none": 0.6412069308902609, "acc_norm_stderr,none": 0.004786660691181916}, "mmlu": {"acc,none": 0.3698903290129611, "acc_stderr,none": 0.004022127866581841, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3421891604675877, "acc_stderr,none": 0.006817614086755118, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392871}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.42424242424242425, "acc_stderr,none": 0.03859268142070262}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.49019607843137253, "acc_stderr,none": 0.03508637358630572}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5443037974683544, "acc_stderr,none": 0.03241920684693333}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4351851851851852, "acc_stderr,none": 0.04792898170907062}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30346820809248554, "acc_stderr,none": 0.024752411960917212}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225598}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.40514469453376206, "acc_stderr,none": 0.02788238379132595}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.42901234567901236, "acc_stderr,none": 0.027538925613470863}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30834419817470665, "acc_stderr,none": 0.011794833789715324}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.38596491228070173, "acc_stderr,none": 0.03733756969066165}, "mmlu_other": {"acc,none": 0.40521403282909557, "acc_stderr,none": 0.008759905614897584, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.44528301886792454, "acc_stderr,none": 0.03058805297427065}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3988439306358382, "acc_stderr,none": 0.037336266553835096}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4349775784753363, "acc_stderr,none": 0.03327283370271345}, "mmlu_management": {"alias": " - management", "acc,none": 0.5631067961165048, "acc_stderr,none": 0.04911147107365777}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4658119658119658, "acc_stderr,none": 0.03267942734081227}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.41762452107279696, "acc_stderr,none": 0.017635637326951517}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3758169934640523, "acc_stderr,none": 0.02773283435336394}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.31560283687943264, "acc_stderr,none": 0.027724989449509307}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.40441176470588236, "acc_stderr,none": 0.029812630701569743}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.37349397590361444, "acc_stderr,none": 0.037658451171688624}, "mmlu_social_sciences": {"acc,none": 0.40103997400065, "acc_stderr,none": 0.008755788150352054, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436695}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.03427308652999934}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.41968911917098445, "acc_stderr,none": 0.03561587327685884}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.36666666666666664, "acc_stderr,none": 0.02443301646605246}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.030388353551886838}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.48990825688073397, "acc_stderr,none": 0.021432956203453316}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3893129770992366, "acc_stderr,none": 0.04276486542814591}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3709150326797386, "acc_stderr,none": 0.019542101564854128}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.39090909090909093, "acc_stderr,none": 0.04673752333670239}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.33877551020408164, "acc_stderr,none": 0.03029950656215418}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5522388059701493, "acc_stderr,none": 0.03516184772952167}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_stem": {"acc,none": 0.3460196638122423, "acc_stderr,none": 0.008357071228593086, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.42962962962962964, "acc_stderr,none": 0.04276349494376599}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.03925523381052932}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4652777777777778, "acc_stderr,none": 0.04171115858181618}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.04488482852329017}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.46, "acc_stderr,none": 0.05009082659620333}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3829787234042553, "acc_stderr,none": 0.03177821250236922}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.037245636197746304}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4967741935483871, "acc_stderr,none": 0.02844341422643833}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.032406615658684086}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.025348097468097845}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526733}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.033247089118091176}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "mmlu_pro": {"exact_match,custom-extract": 0.18683510638297873, "exact_match_stderr,custom-extract": 0.003518469085328189, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32496513249651326, "exact_match_stderr,custom-extract": 0.017503503047556067}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16476552598225602, "exact_match_stderr,custom-extract": 0.013215216167850041}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11837455830388692, "exact_match_stderr,custom-extract": 0.009605941567355312}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1902439024390244, "exact_match_stderr,custom-extract": 0.019407555306945948}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23696682464454977, "exact_match_stderr,custom-extract": 0.014645415505512255}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15067079463364294, "exact_match_stderr,custom-extract": 0.011497803701434745}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.254278728606357, "exact_match_stderr,custom-extract": 0.01523464199611947}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1968503937007874, "exact_match_stderr,custom-extract": 0.020397388648694073}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1307901907356948, "exact_match_stderr,custom-extract": 0.010166080711813392}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.1680236861584012, "exact_match_stderr,custom-extract": 0.010175923911029904}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1699134199134199, "exact_match_stderr,custom-extract": 0.012361599999468283}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.18236472945891782, "exact_match_stderr,custom-extract": 0.017303563884617016}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1585835257890685, "exact_match_stderr,custom-extract": 0.010139048344815245}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2744360902255639, "exact_match_stderr,custom-extract": 0.015806266033802353}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.01996610354027946, "acc_norm,none": 0.386, "acc_norm_stderr,none": 0.021793529219281165}, "piqa": {"alias": "piqa", "acc,none": 0.719804134929271, "acc_stderr,none": 0.010478122015577077, "acc_norm,none": 0.7290533188248096, "acc_norm_stderr,none": 0.010369718937426843}, "race": {"alias": "race", "acc,none": 0.3770334928229665, "acc_stderr,none": 0.014999337089843356}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44421699078812693, "acc_stderr,none": 0.011243437088559821}, "winogrande": {"alias": "winogrande", "acc,none": 0.6582478295185478, "acc_stderr,none": 0.013330103018622847}} {"created_at": "2025-05-09T06:57:35.593473", "global_step": 10000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4189419795221843, "acc_stderr,none": 0.014418106953639008, "acc_norm,none": 0.454778156996587, "acc_norm_stderr,none": 0.014551507060836353}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.726010101010101, "acc_stderr,none": 0.009151805901544024, "acc_norm,none": 0.7041245791245792, "acc_norm_stderr,none": 0.009365854134140057}, "boolq": {"alias": "boolq", "acc,none": 0.7577981651376147, "acc_stderr,none": 0.0074930396182058085}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.3538083538083538, "acc_stderr,none": 0.013689412044271595}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4766978689504083, "acc_stderr,none": 0.004984359669951927, "acc_norm,none": 0.6406094403505278, "acc_norm_stderr,none": 0.004788412062375702}, "mmlu": {"acc,none": 0.3645492095143142, "acc_stderr,none": 0.0040190174218775535, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3419766206163656, "acc_stderr,none": 0.006837061780601579, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.03809523809523809}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.43636363636363634, "acc_stderr,none": 0.03872592983524754}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.49019607843137253, "acc_stderr,none": 0.03508637358630573}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5189873417721519, "acc_stderr,none": 0.03252375148090447}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04668408033024932}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.03623089915724145}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.33815028901734107, "acc_stderr,none": 0.025469770149400172}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2536312849162011, "acc_stderr,none": 0.014551553659369923}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3890675241157556, "acc_stderr,none": 0.027690337536485376}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.4166666666666667, "acc_stderr,none": 0.027431623722415005}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.31421121251629724, "acc_stderr,none": 0.011855911587048224}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.34502923976608185, "acc_stderr,none": 0.036459813773888065}, "mmlu_other": {"acc,none": 0.37560347602188604, "acc_stderr,none": 0.008656983343187723, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.4490566037735849, "acc_stderr,none": 0.030612730713641092}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3872832369942196, "acc_stderr,none": 0.03714325906302065}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.37668161434977576, "acc_stderr,none": 0.032521134899291884}, "mmlu_management": {"alias": " - management", "acc,none": 0.4854368932038835, "acc_stderr,none": 0.04948637324026637}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4230769230769231, "acc_stderr,none": 0.032366121762202014}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3793103448275862, "acc_stderr,none": 0.01735126811754445}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3562091503267974, "acc_stderr,none": 0.027420477662629252}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3120567375886525, "acc_stderr,none": 0.02764012054516993}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3786764705882353, "acc_stderr,none": 0.02946513363977613}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3493975903614458, "acc_stderr,none": 0.03711725190740747}, "mmlu_social_sciences": {"acc,none": 0.40558986025349364, "acc_stderr,none": 0.008780170112996219, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3508771929824561, "acc_stderr,none": 0.04489539350270698}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.0347327959083696}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43523316062176165, "acc_stderr,none": 0.035780381650085846}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3641025641025641, "acc_stderr,none": 0.02439667298509476}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3403361344537815, "acc_stderr,none": 0.030778057422931673}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.5082568807339449, "acc_stderr,none": 0.021434399918214334}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3435114503816794, "acc_stderr,none": 0.041649760719448786}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.019506291693954854}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.04554619617541054}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3469387755102041, "acc_stderr,none": 0.030472526026726492}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5472636815920398, "acc_stderr,none": 0.03519702717576916}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_stem": {"acc,none": 0.3472882968601332, "acc_stderr,none": 0.008371746526670592, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4222222222222222, "acc_stderr,none": 0.04266763404099582}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.4144736842105263, "acc_stderr,none": 0.04008973785779206}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4513888888888889, "acc_stderr,none": 0.04161402398403279}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993179}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3702127659574468, "acc_stderr,none": 0.03156564682236784}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.49032258064516127, "acc_stderr,none": 0.028438677998909558}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.31527093596059114, "acc_stderr,none": 0.03269080871970186}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.033384734032074016}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697626}, "mmlu_pro": {"exact_match,custom-extract": 0.18425864361702127, "exact_match_stderr,custom-extract": 0.003499606067873189, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3277545327754533, "exact_match_stderr,custom-extract": 0.017542108119930663}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.17110266159695817, "exact_match_stderr,custom-extract": 0.013415771307906003}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11837455830388692, "exact_match_stderr,custom-extract": 0.009605941567355312}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1902439024390244, "exact_match_stderr,custom-extract": 0.01940755530694595}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2381516587677725, "exact_match_stderr,custom-extract": 0.014670579907447287}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.14860681114551083, "exact_match_stderr,custom-extract": 0.011432646220990715}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2334963325183374, "exact_match_stderr,custom-extract": 0.014800831761454183}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.2125984251968504, "exact_match_stderr,custom-extract": 0.020988739763117506}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1262488646684832, "exact_match_stderr,custom-extract": 0.010014085027799643}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14877868245743894, "exact_match_stderr,custom-extract": 0.009685559549168662}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1764069264069264, "exact_match_stderr,custom-extract": 0.01254623184906949}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1903807615230461, "exact_match_stderr,custom-extract": 0.017592893278410036}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15935334872979215, "exact_match_stderr,custom-extract": 0.010158977410017724}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2669172932330827, "exact_match_stderr,custom-extract": 0.015668798035500312}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.278, "acc_stderr,none": 0.020055833888070907, "acc_norm,none": 0.39, "acc_norm_stderr,none": 0.02183468586936921}, "piqa": {"alias": "piqa", "acc,none": 0.7268770402611534, "acc_stderr,none": 0.010395730264453267, "acc_norm,none": 0.7252448313384113, "acc_norm_stderr,none": 0.010415033676676042}, "race": {"alias": "race", "acc,none": 0.36650717703349284, "acc_stderr,none": 0.014912890943719231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45291709314227224, "acc_stderr,none": 0.01126379679411243}, "winogrande": {"alias": "winogrande", "acc,none": 0.6479873717442778, "acc_stderr,none": 0.013422874824929707}} {"created_at": "2025-05-09T09:28:37.117280", "global_step": 12000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.41638225255972694, "acc_stderr,none": 0.014405618279436165, "acc_norm,none": 0.4564846416382253, "acc_norm_stderr,none": 0.014555949760496439}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7289562289562289, "acc_stderr,none": 0.0091209197417606, "acc_norm,none": 0.7133838383838383, "acc_norm_stderr,none": 0.009278551100969295}, "boolq": {"alias": "boolq", "acc,none": 0.7342507645259939, "acc_stderr,none": 0.007725929757288678}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.31367731367731366, "acc_stderr,none": 0.013283906711108124}, "copa": {"alias": "copa", "acc,none": 0.8, "acc_stderr,none": 0.04020151261036845}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47610037841067515, "acc_stderr,none": 0.0049840779062161064, "acc_norm,none": 0.643397729535949, "acc_norm_stderr,none": 0.004780169873332845}, "mmlu": {"acc,none": 0.3565019228030195, "acc_stderr,none": 0.003997294684127984, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.328586609989373, "acc_stderr,none": 0.006763654084066626, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.035122074123020534}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.40606060606060607, "acc_stderr,none": 0.03834816355401181}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.45098039215686275, "acc_stderr,none": 0.03492406104163613}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5189873417721519, "acc_stderr,none": 0.03252375148090447}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.04481137755942469}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.04691521224077742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3312883435582822, "acc_stderr,none": 0.03697983910025589}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3265895953757225, "acc_stderr,none": 0.025248264774242822}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961455}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3954983922829582, "acc_stderr,none": 0.027770918531427834}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.027339546640662734}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.29465449804432853, "acc_stderr,none": 0.011643576764069546}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3508771929824561, "acc_stderr,none": 0.03660298834049162}, "mmlu_other": {"acc,none": 0.38429353073704536, "acc_stderr,none": 0.008692556907232373, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.4339622641509434, "acc_stderr,none": 0.030503292013342592}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.37572254335260113, "acc_stderr,none": 0.036928207672648664}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.40358744394618834, "acc_stderr,none": 0.03292802819330314}, "mmlu_management": {"alias": " - management", "acc,none": 0.46601941747572817, "acc_stderr,none": 0.0493929144727348}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44871794871794873, "acc_stderr,none": 0.0325833464938688}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3793103448275862, "acc_stderr,none": 0.01735126811754445}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.027684181883302888}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340461004}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.39705882352941174, "acc_stderr,none": 0.02972215209928007}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3855421686746988, "acc_stderr,none": 0.03789134424611551}, "mmlu_social_sciences": {"acc,none": 0.39064023399415015, "acc_stderr,none": 0.008708259058446658, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35858585858585856, "acc_stderr,none": 0.03416903640391521}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.39896373056994816, "acc_stderr,none": 0.03533999094065696}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.34102564102564104, "acc_stderr,none": 0.024035489676335082}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.030283995525884396}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.5064220183486239, "acc_stderr,none": 0.02143555482001308}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677698}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3562091503267974, "acc_stderr,none": 0.0193733324207245}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.4, "acc_stderr,none": 0.0469237132203465}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3346938775510204, "acc_stderr,none": 0.030209235226242307}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.527363184079602, "acc_stderr,none": 0.035302355173346824}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.45, "acc_stderr,none": 0.05}, "mmlu_stem": {"acc,none": 0.33745639073897876, "acc_stderr,none": 0.008334016269999028, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4, "acc_stderr,none": 0.04232073695151589}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.03925523381052932}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4513888888888889, "acc_stderr,none": 0.041614023984032786}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993176}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.46, "acc_stderr,none": 0.05009082659620333}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3829787234042553, "acc_stderr,none": 0.03177821250236922}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.037245636197746304}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4806451612903226, "acc_stderr,none": 0.0284226874043121}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.031270907132976984}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275794}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.375, "acc_stderr,none": 0.033016908987210894}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.04327040932578728}, "mmlu_pro": {"exact_match,custom-extract": 0.18733377659574468, "exact_match_stderr,custom-extract": 0.0035214130064884605, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3333333333333333, "exact_match_stderr,custom-extract": 0.017617214086056415}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1761723700887199, "exact_match_stderr,custom-extract": 0.01357137861064442}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.12102473498233215, "exact_match_stderr,custom-extract": 0.00969826606799995}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2121951219512195, "exact_match_stderr,custom-extract": 0.020216937884754142}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2523696682464455, "exact_match_stderr,custom-extract": 0.01496057368158449}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15273477812177502, "exact_match_stderr,custom-extract": 0.011562213394878507}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2334963325183374, "exact_match_stderr,custom-extract": 0.014800831761454188}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1968503937007874, "exact_match_stderr,custom-extract": 0.020397388648694073}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12352406902815623, "exact_match_stderr,custom-extract": 0.009920862929791524}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.15840118430792005, "exact_match_stderr,custom-extract": 0.009937219564839428}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1774891774891775, "exact_match_stderr,custom-extract": 0.012576387157546684}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.18036072144288579, "exact_match_stderr,custom-extract": 0.017229302326024137}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1608929946112394, "exact_match_stderr,custom-extract": 0.010198584409468981}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2656641604010025, "exact_match_stderr,custom-extract": 0.015645328503718126}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.278, "acc_stderr,none": 0.020055833888070907, "acc_norm,none": 0.382, "acc_norm_stderr,none": 0.021750820591250834}, "piqa": {"alias": "piqa", "acc,none": 0.7263329706202394, "acc_stderr,none": 0.01040218420622921, "acc_norm,none": 0.7236126224156693, "acc_norm_stderr,none": 0.010434162388275615}, "race": {"alias": "race", "acc,none": 0.3588516746411483, "acc_stderr,none": 0.014845215125262313}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4498464687819857, "acc_stderr,none": 0.011257008360485697}, "winogrande": {"alias": "winogrande", "acc,none": 0.6393054459352802, "acc_stderr,none": 0.013496064394234038}} {"created_at": "2025-05-09T10:35:08.285167", "global_step": 14000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.40955631399317405, "acc_stderr,none": 0.014370358632472454, "acc_norm,none": 0.45307167235494883, "acc_norm_stderr,none": 0.01454689205200563}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7281144781144782, "acc_stderr,none": 0.009129795867310494, "acc_norm,none": 0.7209595959595959, "acc_norm_stderr,none": 0.009203588704032635}, "boolq": {"alias": "boolq", "acc,none": 0.7556574923547401, "acc_stderr,none": 0.0075154423789955075}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.3407043407043407, "acc_stderr,none": 0.013569036984855011}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909284}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4779924317864967, "acc_stderr,none": 0.004984945635998306, "acc_norm,none": 0.6415056761601274, "acc_norm_stderr,none": 0.004785781979354876}, "mmlu": {"acc,none": 0.3686796752599345, "acc_stderr,none": 0.004025621401912442, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.34155154091392137, "acc_stderr,none": 0.006835171260729971, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046734}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4121212121212121, "acc_stderr,none": 0.03843566993588717}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.47549019607843135, "acc_stderr,none": 0.035050931943487976}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5316455696202531, "acc_stderr,none": 0.032481974005110756}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.04481137755942469}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.047500773411999854}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3619631901840491, "acc_stderr,none": 0.037757007291414416}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3583815028901734, "acc_stderr,none": 0.025816756791584204}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2536312849162011, "acc_stderr,none": 0.014551553659369923}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3987138263665595, "acc_stderr,none": 0.0278093225857745}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.02733954664066273}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.303129074315515, "acc_stderr,none": 0.0117386699512543}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3508771929824561, "acc_stderr,none": 0.03660298834049162}, "mmlu_other": {"acc,none": 0.39716768587061474, "acc_stderr,none": 0.008747261459575715, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145633}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.45660377358490567, "acc_stderr,none": 0.030656748696739435}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3872832369942196, "acc_stderr,none": 0.03714325906302065}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4080717488789238, "acc_stderr,none": 0.03298574607842822}, "mmlu_management": {"alias": " - management", "acc,none": 0.4563106796116505, "acc_stderr,none": 0.049318019942204146}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.452991452991453, "acc_stderr,none": 0.0326109987309862}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956912}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.39208173690932313, "acc_stderr,none": 0.017458524050147636}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.39869281045751637, "acc_stderr,none": 0.028036092273891762}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.027464708442022135}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.41911764705882354, "acc_stderr,none": 0.029972807170464626}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3795180722891566, "acc_stderr,none": 0.03777798822748018}, "mmlu_social_sciences": {"acc,none": 0.4072148196295093, "acc_stderr,none": 0.008775353707287907, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3787878787878788, "acc_stderr,none": 0.03456088731993747}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.38860103626943004, "acc_stderr,none": 0.03517739796373131}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3769230769230769, "acc_stderr,none": 0.024570975364225995}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.38235294117647056, "acc_stderr,none": 0.03156663099215416}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.5174311926605505, "acc_stderr,none": 0.021424291871853157}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3435114503816794, "acc_stderr,none": 0.041649760719448786}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3660130718954248, "acc_stderr,none": 0.019488025745529675}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.04494290866252089}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3510204081632653, "acc_stderr,none": 0.03055531675557364}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5422885572139303, "acc_stderr,none": 0.035228658640995975}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_stem": {"acc,none": 0.3434823977164605, "acc_stderr,none": 0.008351756969469167, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.04244633238353229}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.03925523381052932}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4722222222222222, "acc_stderr,none": 0.04174752578923185}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04690650298201943}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.43, "acc_stderr,none": 0.04975698519562428}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3872340425531915, "acc_stderr,none": 0.03184389265339526}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.037528339580033376}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400168}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4774193548387097, "acc_stderr,none": 0.028414985019707868}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.024720713193952172}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4212962962962963, "acc_stderr,none": 0.03367462138896078}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "mmlu_pro": {"exact_match,custom-extract": 0.18359375, "exact_match_stderr,custom-extract": 0.0034914520817422474, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3263598326359833, "exact_match_stderr,custom-extract": 0.017522893735261193}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16476552598225602, "exact_match_stderr,custom-extract": 0.013215216167850041}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11925795053003534, "exact_match_stderr,custom-extract": 0.009636886279174636}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.20243902439024392, "exact_match_stderr,custom-extract": 0.01986860664614139}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.25355450236966826, "exact_match_stderr,custom-extract": 0.014983764115445292}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.14860681114551083, "exact_match_stderr,custom-extract": 0.011432646220990723}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2371638141809291, "exact_match_stderr,custom-extract": 0.014880887309855593}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1968503937007874, "exact_match_stderr,custom-extract": 0.020397388648694073}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11989100817438691, "exact_match_stderr,custom-extract": 0.00979411485319413}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.16062176165803108, "exact_match_stderr,custom-extract": 0.009993420477712803}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17532467532467533, "exact_match_stderr,custom-extract": 0.012515902497509368}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.17635270541082165, "exact_match_stderr,custom-extract": 0.017078394057537964}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.14318706697459585, "exact_match_stderr,custom-extract": 0.009722043433624261}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2694235588972431, "exact_match_stderr,custom-extract": 0.01571525582855951}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.019966103540279462, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.021728881438701712}, "piqa": {"alias": "piqa", "acc,none": 0.7241566920565833, "acc_stderr,none": 0.010427805502729115, "acc_norm,none": 0.7257889009793254, "acc_norm_stderr,none": 0.010408618664933382}, "race": {"alias": "race", "acc,none": 0.3569377990430622, "acc_stderr,none": 0.014827656367408905}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4570112589559877, "acc_stderr,none": 0.011272175462331422}, "winogrande": {"alias": "winogrande", "acc,none": 0.6393054459352802, "acc_stderr,none": 0.013496064394234038}}