|
{"created_at": "2025-08-15T04:34:15.014630", "global_step": 2000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883523}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.2865962955586537, "acc_stderr,none": 0.0045124716124155745, "acc_norm,none": 0.3067118103963354, "acc_norm_stderr,none": 0.0046018628072401905}, "mmlu": {"acc,none": 0.2296681384418174, "acc_stderr,none": 0.0035441282219406613, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24250797024442083, "acc_stderr,none": 0.006247831855040214, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733393}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.2375281622143547, "acc_stderr,none": 0.007613932035224903, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23243933588761176, "acc_stderr,none": 0.015104550008905706}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.02355083135199509}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.023157468308559328}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21871953201169972, "acc_stderr,none": 0.007447439920000241, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823944}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2134475103076435, "acc_stderr,none": 0.007285900658596516, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.02203721734026784}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.734, "acc_stderr,none": 0.013979965645145155, "acc_norm,none": 0.661, "acc_norm_stderr,none": 0.014976758771620347}} |
|
{"created_at": "2025-08-15T06:16:07.359443", "global_step": 4000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.011586881879177842}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3181637124078869, "acc_stderr,none": 0.004648115322328785, "acc_norm,none": 0.3679545907189803, "acc_norm_stderr,none": 0.004812633280078274}, "mmlu": {"acc,none": 0.24868252385700043, "acc_stderr,none": 0.003641672702176196, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24718384697130713, "acc_stderr,none": 0.0062881038079461195, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3412698412698413, "acc_stderr,none": 0.04240799327574925}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460295}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961464}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.022552447780478043}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.034010526201040885}, "mmlu_other": {"acc,none": 0.2574831026713872, "acc_stderr,none": 0.007809943165283216, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756193}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.03476599607516478}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.15, "acc_stderr,none": 0.0358870281282637}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2242152466367713, "acc_stderr,none": 0.027991534258519538}, "mmlu_management": {"alias": " - management", "acc,none": 0.3592233009708738, "acc_stderr,none": 0.04750458399041692}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.02891120880274948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22860791826309068, "acc_stderr,none": 0.015016884698539883}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341033}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22340425531914893, "acc_stderr,none": 0.024847921358063962}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.33088235294117646, "acc_stderr,none": 0.028582709753898438}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.2567435814104647, "acc_stderr,none": 0.007851568702424294, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.042663394431593955}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2828282828282828, "acc_stderr,none": 0.03208779558786751}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.18652849740932642, "acc_stderr,none": 0.028112091210117474}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02311936275823228}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361273}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3174311926605505, "acc_stderr,none": 0.019957152198460497}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728744}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.017077373377857}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579157}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.208955223880597, "acc_stderr,none": 0.028748298931728655}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.23437995559784333, "acc_stderr,none": 0.007552769045845687, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.035478541985608264}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03317672787533158}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.04617034827006717}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277696}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.02193587808118476}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22580645161290322, "acc_stderr,none": 0.023785577884181012}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2019704433497537, "acc_stderr,none": 0.028247350122180267}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.025348097468097845}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.035118075718047245}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02915752218460559}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952685}, "sciq": {"alias": "sciq", "acc,none": 0.825, "acc_stderr,none": 0.012021627157731987, "acc_norm,none": 0.756, "acc_norm_stderr,none": 0.013588548437881435}} |
|
{"created_at": "2025-08-15T07:59:19.018383", "global_step": 6000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2072072072072072, "acc_stderr,none": 0.011603856781422554}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3419637522405895, "acc_stderr,none": 0.004733980470799217, "acc_norm,none": 0.4108743278231428, "acc_norm_stderr,none": 0.004909870006388833}, "mmlu": {"acc,none": 0.24910981341689217, "acc_stderr,none": 0.0036474794496259818, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24803400637619555, "acc_stderr,none": 0.0063023441409652626, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.02782078198114968}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.040261875275912025}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.03259177392742178}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992005}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22508038585209003, "acc_stderr,none": 0.02372008851617903}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24967405475880053, "acc_stderr,none": 0.011054538377832327}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.03446296217088427}, "mmlu_other": {"acc,none": 0.2529771483746379, "acc_stderr,none": 0.0078007622696230905, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.026749899771241238}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483098}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.03063659134869979}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.041858325989283136}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.029343114798094462}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036843}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25287356321839083, "acc_stderr,none": 0.015543377313719681}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912248}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290403}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294264}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.23561910952226195, "acc_stderr,none": 0.007649827276453997, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517826}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.020660597485026928}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.018272575810231874}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596918}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987866}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.036942843353378}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.027212835884073153}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21890547263681592, "acc_stderr,none": 0.029239174636647}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.260069774817634, "acc_stderr,none": 0.0077851820080167885, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.03915450630414251}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23026315789473684, "acc_stderr,none": 0.03426059424403165}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.044405219061793275}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.1829787234042553, "acc_stderr,none": 0.025276041000449972}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3586206896551724, "acc_stderr,none": 0.03996629574876719}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2804232804232804, "acc_stderr,none": 0.02313528797432563}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.20967741935483872, "acc_stderr,none": 0.023157879349083515}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.03127090713297698}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.025348097468097838}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25, "acc_stderr,none": 0.029531221160930918}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.043642261558410445}, "sciq": {"alias": "sciq", "acc,none": 0.83, "acc_stderr,none": 0.011884495834541663, "acc_norm,none": 0.756, "acc_norm_stderr,none": 0.013588548437881431}} |
|
{"created_at": "2025-08-15T09:40:00.777396", "global_step": 8000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19901719901719903, "acc_stderr,none": 0.011430809442838398}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3529177454690301, "acc_stderr,none": 0.004769007545082276, "acc_norm,none": 0.4309898426608245, "acc_norm_stderr,none": 0.004942026200279592}, "mmlu": {"acc,none": 0.24818401937046006, "acc_stderr,none": 0.003640170840549424, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2507970244420829, "acc_stderr,none": 0.006323618078538124, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604673}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501947}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069415}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635463}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.044143436668549335}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212675}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123563}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25251396648044694, "acc_stderr,none": 0.014530330201468643}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2090032154340836, "acc_stderr,none": 0.023093140398374224}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.024383665531035447}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.258148631029987, "acc_stderr,none": 0.011176923719313397}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.27132281943997427, "acc_stderr,none": 0.007959564036721054, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.02661648298050171}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.03214737302029472}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3721973094170404, "acc_stderr,none": 0.03244305283008731}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260595}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431187}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2835249042145594, "acc_stderr,none": 0.01611731816683227}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843014}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3373493975903614, "acc_stderr,none": 0.03680783690727581}, "mmlu_social_sciences": {"acc,none": 0.23269418264543387, "acc_stderr,none": 0.007616216729602363, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18686868686868688, "acc_stderr,none": 0.027772533334218988}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.02977866303775296}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.021020672680827912}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2605042016806723, "acc_stderr,none": 0.02851025151234193}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618684}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823944}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17551020408163265, "acc_stderr,none": 0.024352800722970015}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409217}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.2366000634316524, "acc_stderr,none": 0.00755019293371535, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325438}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03459777606810535}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03708284662416546}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102984}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727771}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.02241804289111394}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22580645161290322, "acc_stderr,none": 0.02378557788418101}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3251231527093596, "acc_stderr,none": 0.032957975663112704}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2851851851851852, "acc_stderr,none": 0.027528599210340492}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.152317880794702, "acc_stderr,none": 0.029339068831498706}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.02649191472735516}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "sciq": {"alias": "sciq", "acc,none": 0.865, "acc_stderr,none": 0.010811655372416051, "acc_norm,none": 0.828, "acc_norm_stderr,none": 0.011939788882495321}} |
|
{"created_at": "2025-08-15T11:23:05.987016", "global_step": 10000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.01156983455153429}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3646683927504481, "acc_stderr,none": 0.004803533333364227, "acc_norm,none": 0.4478191595299741, "acc_norm_stderr,none": 0.004962534264751917}, "mmlu": {"acc,none": 0.241917105825381, "acc_stderr,none": 0.0036110195934736748, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23846971307120085, "acc_stderr,none": 0.0062121146075559165, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147125}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.20606060606060606, "acc_stderr,none": 0.0315841532404771}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.03132179803083292}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25316455696202533, "acc_stderr,none": 0.028304657943035286}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24916201117318434, "acc_stderr,none": 0.014465893829859924}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19935691318327975, "acc_stderr,none": 0.022691033780549656}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.022021366100220204}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23859191655801826, "acc_stderr,none": 0.010885929742002204}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.25555197940135177, "acc_stderr,none": 0.007825564798410494, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.02590789712240817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674043}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150193}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912258}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2867647058823529, "acc_stderr,none": 0.02747227447323382}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.03410646614071857}, "mmlu_social_sciences": {"acc,none": 0.23854403639909003, "acc_stderr,none": 0.007687784290264888, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813344}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.19696969696969696, "acc_stderr,none": 0.02833560973246335}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.030031147977641545}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2358974358974359, "acc_stderr,none": 0.021525965407408726}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863804}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21651376146788992, "acc_stderr,none": 0.017658710594443145}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.017740899509177795}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.027372942201788167}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.030360490154014652}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.2369172216936251, "acc_stderr,none": 0.007545839887592788, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.11851851851851852, "acc_stderr,none": 0.02792205025063904}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.0327900040631005}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.04440521906179325}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.18620689655172415, "acc_stderr,none": 0.032439461590046154}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400192}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.022037217340267846}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.19704433497536947, "acc_stderr,none": 0.027986724666736212}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02730914058823018}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.036313298039696545}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422273}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.844, "acc_stderr,none": 0.011480235006122363, "acc_norm,none": 0.796, "acc_norm_stderr,none": 0.012749374359024384}} |
|
{"created_at": "2025-08-15T13:07:21.329502", "global_step": 12000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1941031941031941, "acc_stderr,none": 0.011323381588920439}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3739294961163115, "acc_stderr,none": 0.004828564090620291, "acc_norm,none": 0.472814180442143, "acc_norm_stderr,none": 0.00498240036893967}, "mmlu": {"acc,none": 0.2588662583677539, "acc_stderr,none": 0.003680970237257283, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24187035069075452, "acc_stderr,none": 0.006229436951987352, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04285714285714281}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.18787878787878787, "acc_stderr,none": 0.030501934059429144}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.031822318676475544}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.026750826994676187}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1322314049586777, "acc_stderr,none": 0.030922788320445805}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.021855255263421802}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24115755627009647, "acc_stderr,none": 0.02429659403476343}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.20987654320987653, "acc_stderr,none": 0.022658344085981382}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2392438070404172, "acc_stderr,none": 0.010896123652676648}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.03301405946987249}, "mmlu_other": {"acc,none": 0.24267782426778242, "acc_stderr,none": 0.007682044423168464, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197772}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.0277242364927009}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321658}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2242152466367713, "acc_stderr,none": 0.02799153425851954}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.028911208802749486}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22094508301404853, "acc_stderr,none": 0.014836205167333574}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.025261691219729494}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22340425531914893, "acc_stderr,none": 0.024847921358063962}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.28308823529411764, "acc_stderr,none": 0.02736586113151381}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.033844291552331346}, "mmlu_social_sciences": {"acc,none": 0.2804679883002925, "acc_stderr,none": 0.008078307846050275, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481404}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3005181347150259, "acc_stderr,none": 0.03308818594415751}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.33589743589743587, "acc_stderr,none": 0.02394672474156397}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3025210084033613, "acc_stderr,none": 0.029837962388291932}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30275229357798167, "acc_stderr,none": 0.019698711434756364}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.016906615927288128}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3306122448979592, "acc_stderr,none": 0.030116426296540582}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.20398009950248755, "acc_stderr,none": 0.02849317624532608}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_stem": {"acc,none": 0.2790992705359975, "acc_stderr,none": 0.007929943931107256, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066655}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137282}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.38235294117647056, "acc_stderr,none": 0.04835503696107223}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.18723404255319148, "acc_stderr,none": 0.025501588341883607}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.21379310344827587, "acc_stderr,none": 0.03416520447747548}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633345}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3, "acc_stderr,none": 0.026069362295335134}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233484}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712156}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.44907407407407407, "acc_stderr,none": 0.03392238405321616}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.16964285714285715, "acc_stderr,none": 0.0356236785009539}, "sciq": {"alias": "sciq", "acc,none": 0.87, "acc_stderr,none": 0.010640169792499356, "acc_norm,none": 0.845, "acc_norm_stderr,none": 0.011450157470799473}} |
|
{"created_at": "2025-08-15T14:55:07.973477", "global_step": 14000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.011569834551534292}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38478390758812986, "acc_stderr,none": 0.004855498343308385, "acc_norm,none": 0.4887472615016929, "acc_norm_stderr,none": 0.00498851759799862}, "mmlu": {"acc,none": 0.2669847600056972, "acc_stderr,none": 0.0037013145375584026, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23868225292242295, "acc_stderr,none": 0.0062043134032450115, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.042163702135578345}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.029102254389674082}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.16877637130801687, "acc_stderr,none": 0.02438140683258624}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.14049586776859505, "acc_stderr,none": 0.0317223342600216}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094631}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.022698657167855713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2572347266881029, "acc_stderr,none": 0.024826171289250885}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.022409674547304154}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23402868318122555, "acc_stderr,none": 0.010813585552659691}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.2645638879948503, "acc_stderr,none": 0.007797385514602835, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30566037735849055, "acc_stderr,none": 0.028353298073322666}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3352601156069364, "acc_stderr,none": 0.03599586301247077}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.13004484304932734, "acc_stderr,none": 0.02257451942417488}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.0482572933735639}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.02812096650391441}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22094508301404853, "acc_stderr,none": 0.01483620516733357}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.31699346405228757, "acc_stderr,none": 0.02664327847450875}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.02551873104953776}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.44485294117647056, "acc_stderr,none": 0.030187532060329383}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.03141784291663926}, "mmlu_social_sciences": {"acc,none": 0.295417614559636, "acc_stderr,none": 0.008179088087496558, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813365}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3316062176165803, "acc_stderr,none": 0.03397636541089117}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.35128205128205126, "acc_stderr,none": 0.024203665177902803}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3445378151260504, "acc_stderr,none": 0.030868682604121626}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3211009174311927, "acc_stderr,none": 0.020018149772733744}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.01672993756553752}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.16363636363636364, "acc_stderr,none": 0.035434330542986774}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3551020408163265, "acc_stderr,none": 0.030635655150387634}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.03115715086935556}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.28385664446558834, "acc_stderr,none": 0.00795398990310705, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.037125378336148665}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351585}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3627450980392157, "acc_stderr,none": 0.04784060704105652}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.18723404255319148, "acc_stderr,none": 0.025501588341883607}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131183}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.02271746789770862}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3064516129032258, "acc_stderr,none": 0.026226485652553873}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.031785297106427496}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33774834437086093, "acc_stderr,none": 0.03861557546255169}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.46296296296296297, "acc_stderr,none": 0.03400603625538271}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.16964285714285715, "acc_stderr,none": 0.0356236785009539}, "sciq": {"alias": "sciq", "acc,none": 0.879, "acc_stderr,none": 0.010318210380946097, "acc_norm,none": 0.835, "acc_norm_stderr,none": 0.011743632866916176}} |
|
{"created_at": "2025-08-15T16:55:14.487399", "global_step": 16000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.011586881879177826}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3942441744672376, "acc_stderr,none": 0.004876889983110827, "acc_norm,none": 0.501593308105955, "acc_norm_stderr,none": 0.004989756076956358}, "mmlu": {"acc,none": 0.24070645207235436, "acc_stderr,none": 0.003600129754966004, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2357066950053135, "acc_stderr,none": 0.006186739435224213, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0404061017820884}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104428}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.022552447780478026}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.20987654320987653, "acc_stderr,none": 0.022658344085981354}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2242503259452412, "acc_stderr,none": 0.010652615824906172}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.03528211258245231}, "mmlu_other": {"acc,none": 0.24010299324106857, "acc_stderr,none": 0.007653357358812127, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2037735849056604, "acc_stderr,none": 0.024790784501775402}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.0332055644308557}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.2524271844660194, "acc_stderr,none": 0.04301250399690877}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.029058588303748845}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.20561941251596424, "acc_stderr,none": 0.014452500456785825}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307847}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22794117647058823, "acc_stderr,none": 0.025483081468029804}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.23074423139421515, "acc_stderr,none": 0.007591313671593955, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.026552207828215293}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681725}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.021444547301560486}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.02921354941437217}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21100917431192662, "acc_stderr,none": 0.017493922404112648}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.03547771004159463}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.017704531653250078}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.0258012834750905}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.25848398350777035, "acc_stderr,none": 0.007751253619530722, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.14814814814814814, "acc_stderr,none": 0.03068864761035266}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.03355045304882924}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.0358687928008034}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768081}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.39215686274509803, "acc_stderr,none": 0.04858083574266346}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2297872340425532, "acc_stderr,none": 0.02750175294441242}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727771}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2830687830687831, "acc_stderr,none": 0.023201392938194974}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.02203721734026784}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694436}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959316}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.38425925925925924, "acc_stderr,none": 0.03317354514310742}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "sciq": {"alias": "sciq", "acc,none": 0.888, "acc_stderr,none": 0.009977753031397233, "acc_norm,none": 0.864, "acc_norm_stderr,none": 0.010845350230472988}} |
|
{"created_at": "2025-08-15T18:38:15.278003", "global_step": 18000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883527}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4012148974307907, "acc_stderr,none": 0.004891426533390629, "acc_norm,none": 0.5134435371439953, "acc_norm_stderr,none": 0.004987977492042157}, "mmlu": {"acc,none": 0.2458339267910554, "acc_stderr,none": 0.0036295608299807703, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23868225292242295, "acc_stderr,none": 0.0062119998526491554, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924314}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350194}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.02712329820522997}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2066115702479339, "acc_stderr,none": 0.03695980128098824}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.04453197507374983}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.0332201579577674}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28034682080924855, "acc_stderr,none": 0.024182427496577615}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098414}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2347266881028939, "acc_stderr,none": 0.024071805887677048}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.18209876543209877, "acc_stderr,none": 0.021473491834808355}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2333767926988266, "acc_stderr,none": 0.010803108481179094}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.2529771483746379, "acc_stderr,none": 0.007787337740945078, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.1, "acc_stderr,none": 0.030151134457776348}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.0336876293225943}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2645739910313901, "acc_stderr,none": 0.02960510321703831}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.043546310772605935}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2515964240102171, "acc_stderr,none": 0.01551732236552963}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242557}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.02657786094330785}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.025767252010855973}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.034605799075530255}, "mmlu_social_sciences": {"acc,none": 0.23756906077348067, "acc_stderr,none": 0.007673562305723165, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518752}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.030532892233932032}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.030031147977641545}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24871794871794872, "acc_stderr,none": 0.021916957709213803}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.28991596638655465, "acc_stderr,none": 0.02947248583313608}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.018125669180861503}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.0364129708131373}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.017704531653250075}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884603}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22448979591836735, "acc_stderr,none": 0.02671143055553843}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.1791044776119403, "acc_stderr,none": 0.027113286753111837}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_stem": {"acc,none": 0.2575325087218522, "acc_stderr,none": 0.007771742580370727, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073465}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566016}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04690650298201942}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.02713634960242405}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727771}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.02326651221373057}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24838709677419354, "acc_stderr,none": 0.024580028921481}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489617}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.304635761589404, "acc_stderr,none": 0.03757949922943343}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.03191923445686185}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.16964285714285715, "acc_stderr,none": 0.03562367850095391}, "sciq": {"alias": "sciq", "acc,none": 0.892, "acc_stderr,none": 0.00982000165134568, "acc_norm,none": 0.869, "acc_norm_stderr,none": 0.010674874844837957}} |
|
{"created_at": "2025-08-15T20:02:07.793600", "global_step": 20000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4041027683728341, "acc_stderr,none": 0.004897146690596248, "acc_norm,none": 0.5188209520015933, "acc_norm_stderr,none": 0.004986245115428458}, "mmlu": {"acc,none": 0.2575843896880786, "acc_stderr,none": 0.0036754869742361552, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23889479277364506, "acc_stderr,none": 0.006205018197736236, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3492063492063492, "acc_stderr,none": 0.042639068927951315}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.19393939393939394, "acc_stderr,none": 0.030874145136562094}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693264}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.20675105485232068, "acc_stderr,none": 0.026361651668389104}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1652892561983471, "acc_stderr,none": 0.03390780612972776}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.044143436668549335}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2558659217877095, "acc_stderr,none": 0.014593620923210739}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21221864951768488, "acc_stderr,none": 0.023222756797435122}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.1728395061728395, "acc_stderr,none": 0.021038517770157385}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24185136897001303, "acc_stderr,none": 0.010936550813827071}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.25490827164467333, "acc_stderr,none": 0.007797674464768559, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.11, "acc_stderr,none": 0.03144660377352203}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.027008766090708094}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.035506839891655796}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2556053811659193, "acc_stderr,none": 0.029275891003969927}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.044532548363264673}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23504273504273504, "acc_stderr,none": 0.027778835904935434}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23243933588761176, "acc_stderr,none": 0.015104550008905704}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902013}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34191176470588236, "acc_stderr,none": 0.028814722422254177}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.22289156626506024, "acc_stderr,none": 0.03240004825594687}, "mmlu_social_sciences": {"acc,none": 0.2755931101722457, "acc_stderr,none": 0.008025892730230482, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436695}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3484848484848485, "acc_stderr,none": 0.033948539651564025}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32642487046632124, "acc_stderr,none": 0.033840286211432945}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3230769230769231, "acc_stderr,none": 0.02371088850197057}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3277310924369748, "acc_stderr,none": 0.030489911417673227}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29541284403669726, "acc_stderr,none": 0.019560619182976}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.017401816711427653}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.02737294220178816}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.20398009950248755, "acc_stderr,none": 0.028493176245326088}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_stem": {"acc,none": 0.2705359974627339, "acc_stderr,none": 0.007862889959778019, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.03514697467862388}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3627450980392157, "acc_stderr,none": 0.047840607041056527}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.026556982117838746}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633345}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.025189006660212385}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.02967833314144444}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712156}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4166666666666667, "acc_stderr,none": 0.03362277436608044}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.19642857142857142, "acc_stderr,none": 0.03770970049347019}, "sciq": {"alias": "sciq", "acc,none": 0.891, "acc_stderr,none": 0.009859828407037186, "acc_norm,none": 0.859, "acc_norm_stderr,none": 0.011010914595992441}} |
|
{"created_at": "2025-08-15T21:42:46.474328", "global_step": 22000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4047002589125672, "acc_stderr,none": 0.004898308167211837, "acc_norm,none": 0.5201155148376817, "acc_norm_stderr,none": 0.004985741706385721}, "mmlu": {"acc,none": 0.25601766130180886, "acc_stderr,none": 0.003670829987667761, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23910733262486716, "acc_stderr,none": 0.0062089743458879805, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.20606060606060606, "acc_stderr,none": 0.03158415324047709}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350195}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.20253164556962025, "acc_stderr,none": 0.02616056824660143}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.17355371900826447, "acc_stderr,none": 0.03457272836917671}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.045245960070300476}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2670391061452514, "acc_stderr,none": 0.014796502622562551}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21221864951768488, "acc_stderr,none": 0.023222756797435122}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.021185893615225153}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23598435462842243, "acc_stderr,none": 0.010844802669662687}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.25555197940135177, "acc_stderr,none": 0.007811371536867302, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.11, "acc_stderr,none": 0.03144660377352203}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.02648035717989569}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.035149425512674394}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2556053811659193, "acc_stderr,none": 0.029275891003969927}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.04541609446503948}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23243933588761176, "acc_stderr,none": 0.015104550008905707}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.025058503316958147}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.028418208619406794}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.033844291552331346}, "mmlu_social_sciences": {"acc,none": 0.27331816704582385, "acc_stderr,none": 0.00800943136476825, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481404}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35353535353535354, "acc_stderr,none": 0.03406086723547153}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3005181347150259, "acc_stderr,none": 0.03308818594415751}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.02366129639396428}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3319327731092437, "acc_stderr,none": 0.030588697013783663}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28440366972477066, "acc_stderr,none": 0.019342036587702602}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.027212835884073163}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.19900497512437812, "acc_stderr,none": 0.028231365092758406}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_stem": {"acc,none": 0.26482714874722485, "acc_stderr,none": 0.007815328685149893, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560824}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.03514697467862388}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.04810840148082636}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895525}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03010833071801162}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036844}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33112582781456956, "acc_stderr,none": 0.038425817186598696}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4027777777777778, "acc_stderr,none": 0.033448873829978666}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.1875, "acc_stderr,none": 0.0370468111477387}, "sciq": {"alias": "sciq", "acc,none": 0.891, "acc_stderr,none": 0.009859828407037183, "acc_norm,none": 0.853, "acc_norm_stderr,none": 0.011203415395160333}} |
|
|