{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.022589681033703804, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 229.33929443359375, "epoch": 2.2589681033703803e-05, "grad_norm": 3.321090749409433, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.0, "reward": 1.5635725259780884, "reward_std": 0.17227235436439514, "rewards/accuracy_reward": 0.43317365646362305, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1589704304933548, "step": 1 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 230.10714721679688, "epoch": 4.5179362067407606e-05, "grad_norm": 2.4720965528490386, "kl": 0.000537872314453125, "learning_rate": 9.999999987409007e-07, "loss": 0.0, "reward": 1.7868064641952515, "reward_std": 0.28859272599220276, "rewards/accuracy_reward": 0.6223132014274597, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.15020751953125, "step": 2 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 293.4464416503906, "epoch": 6.77690431011114e-05, "grad_norm": 3.235353072308969, "kl": 0.0005950927734375, "learning_rate": 9.99999994963603e-07, "loss": 0.0, "reward": 1.7658425569534302, "reward_std": 0.40306493639945984, "rewards/accuracy_reward": 0.5368803143501282, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1861049234867096, "step": 3 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 237.62501525878906, "epoch": 9.035872413481521e-05, "grad_norm": 5.911769962060281, "kl": 0.00189208984375, "learning_rate": 9.99999988668107e-07, "loss": 0.0001, "reward": 1.6295231580734253, "reward_std": 0.09346919506788254, "rewards/accuracy_reward": 0.3408738970756531, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2672206461429596, "step": 4 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 358.89288330078125, "epoch": 0.00011294840516851902, "grad_norm": 4.655337672019286, "kl": 0.000553131103515625, "learning_rate": 9.999999798544127e-07, "loss": 0.0, "reward": 1.7000911235809326, "reward_std": 0.3708450198173523, "rewards/accuracy_reward": 0.5323176383972168, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1820591539144516, "step": 5 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 256.375, "epoch": 0.0001355380862022228, "grad_norm": 2.931697091576695, "kl": 0.000606536865234375, "learning_rate": 9.9999996852252e-07, "loss": 0.0, "reward": 1.9866714477539062, "reward_std": 0.30217117071151733, "rewards/accuracy_reward": 0.739642858505249, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.239885613322258, "step": 6 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 210.9107208251953, "epoch": 0.00015812776723592663, "grad_norm": 6.583734488422231, "kl": 0.00074005126953125, "learning_rate": 9.999999546724292e-07, "loss": 0.0, "reward": 1.803772211074829, "reward_std": 0.11786586046218872, "rewards/accuracy_reward": 0.5032699704170227, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2826451063156128, "step": 7 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 244.58929443359375, "epoch": 0.00018071744826963042, "grad_norm": 8.873705455948262, "kl": 0.00086212158203125, "learning_rate": 9.9999993830414e-07, "loss": 0.0, "reward": 1.857612133026123, "reward_std": 0.19534830749034882, "rewards/accuracy_reward": 0.6322118639945984, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2004002183675766, "step": 8 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 216.60714721679688, "epoch": 0.00020330712930333424, "grad_norm": 2.524971574037387, "kl": 0.000873565673828125, "learning_rate": 9.999999194176527e-07, "loss": 0.0, "reward": 1.6315796375274658, "reward_std": 0.1613486111164093, "rewards/accuracy_reward": 0.43583112955093384, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1778913289308548, "step": 9 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 248.69644165039062, "epoch": 0.00022589681033703804, "grad_norm": 5.055162691900526, "kl": 0.0010223388671875, "learning_rate": 9.999998980129674e-07, "loss": 0.0, "reward": 1.636960506439209, "reward_std": 0.2200811356306076, "rewards/accuracy_reward": 0.40135425329208374, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2106061726808548, "step": 10 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 220.57144165039062, "epoch": 0.00024848649137074186, "grad_norm": 2.5202242060607793, "kl": 0.00083160400390625, "learning_rate": 9.999998740900842e-07, "loss": 0.0, "reward": 1.5240148305892944, "reward_std": 0.2758294939994812, "rewards/accuracy_reward": 0.4107142984867096, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1061575785279274, "step": 11 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 208.42857360839844, "epoch": 0.0002710761724044456, "grad_norm": 2.604550410266501, "kl": 0.0010986328125, "learning_rate": 9.999998476490033e-07, "loss": 0.0, "reward": 1.7536022663116455, "reward_std": 0.13335958123207092, "rewards/accuracy_reward": 0.5173158049583435, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.229143425822258, "step": 12 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 198.5357208251953, "epoch": 0.00029366585343814944, "grad_norm": 10.911184980883219, "kl": 0.001068115234375, "learning_rate": 9.999998186897247e-07, "loss": 0.0, "reward": 1.94025719165802, "reward_std": 0.05976281315088272, "rewards/accuracy_reward": 0.6504150629043579, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2755562961101532, "step": 13 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 268.3035888671875, "epoch": 0.00031625553447185326, "grad_norm": 2.5872475177055936, "kl": 0.000926971435546875, "learning_rate": 9.999997872122486e-07, "loss": 0.0, "reward": 1.6609598398208618, "reward_std": 0.20204812288284302, "rewards/accuracy_reward": 0.43552035093307495, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2004394680261612, "step": 14 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 401.9821472167969, "epoch": 0.0003388452155055571, "grad_norm": 2.4155133591787408, "kl": 0.001007080078125, "learning_rate": 9.999997532165751e-07, "loss": 0.0, "reward": 1.6944005489349365, "reward_std": 0.48361101746559143, "rewards/accuracy_reward": 0.4405902028083801, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.218096062541008, "step": 15 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 210.50001525878906, "epoch": 0.00036143489653926085, "grad_norm": 4.899976141169108, "kl": 0.00157928466796875, "learning_rate": 9.999997167027042e-07, "loss": 0.0001, "reward": 1.5183532238006592, "reward_std": 0.2995515763759613, "rewards/accuracy_reward": 0.3524578809738159, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.15875244140625, "step": 16 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 268.5, "epoch": 0.00038402457757296467, "grad_norm": 5.07535985545095, "kl": 0.00131988525390625, "learning_rate": 9.999996776706368e-07, "loss": 0.0001, "reward": 1.997049331665039, "reward_std": 0.1279676854610443, "rewards/accuracy_reward": 0.7532120943069458, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2009800523519516, "step": 17 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 224.2678680419922, "epoch": 0.0004066142586066685, "grad_norm": 3.409662133872403, "kl": 0.0016021728515625, "learning_rate": 9.999996361203724e-07, "loss": 0.0001, "reward": 1.907712697982788, "reward_std": 0.13319844007492065, "rewards/accuracy_reward": 0.6266319155693054, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2596522867679596, "step": 18 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 281.6785888671875, "epoch": 0.00042920393964037226, "grad_norm": 6.255857031945402, "kl": 0.00311279296875, "learning_rate": 9.999995920519112e-07, "loss": 0.0001, "reward": 1.6464269161224365, "reward_std": 0.23813490569591522, "rewards/accuracy_reward": 0.5178571939468384, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1142839789390564, "step": 19 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 253.37501525878906, "epoch": 0.0004517936206740761, "grad_norm": 3.774900930242461, "kl": 0.0021514892578125, "learning_rate": 9.99999545465254e-07, "loss": 0.0001, "reward": 1.787373661994934, "reward_std": 0.2550387978553772, "rewards/accuracy_reward": 0.5349216461181641, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2453090250492096, "step": 20 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 227.19644165039062, "epoch": 0.0004743833017077799, "grad_norm": 7.211622850671573, "kl": 0.00174713134765625, "learning_rate": 9.999994963604003e-07, "loss": 0.0001, "reward": 1.899523138999939, "reward_std": 0.2744324803352356, "rewards/accuracy_reward": 0.6485029458999634, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2331630289554596, "step": 21 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 314.625, "epoch": 0.0004969729827414837, "grad_norm": 2.4016121600461604, "kl": 0.0012969970703125, "learning_rate": 9.99999444737351e-07, "loss": 0.0001, "reward": 1.6262013912200928, "reward_std": 0.33216285705566406, "rewards/accuracy_reward": 0.4424751102924347, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1765834391117096, "step": 22 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 255.8928680419922, "epoch": 0.0005195626637751875, "grad_norm": 4.458534259688705, "kl": 0.00225830078125, "learning_rate": 9.999993905961059e-07, "loss": 0.0001, "reward": 1.8281751871109009, "reward_std": 0.11907106637954712, "rewards/accuracy_reward": 0.5204197764396667, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2684696912765503, "step": 23 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 247.46429443359375, "epoch": 0.0005421523448088912, "grad_norm": 4.030289298726358, "kl": 0.001983642578125, "learning_rate": 9.999993339366657e-07, "loss": 0.0001, "reward": 1.8060446977615356, "reward_std": 0.17014530301094055, "rewards/accuracy_reward": 0.5502821207046509, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2414768934249878, "step": 24 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 280.33929443359375, "epoch": 0.0005647420258425951, "grad_norm": 5.097080508136576, "kl": 0.002227783203125, "learning_rate": 9.999992747590303e-07, "loss": 0.0001, "reward": 1.824195146560669, "reward_std": 0.2512558400630951, "rewards/accuracy_reward": 0.5441448092460632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2479073703289032, "step": 25 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 236.2678680419922, "epoch": 0.0005873317068762989, "grad_norm": 3.023728556268416, "kl": 0.0025482177734375, "learning_rate": 9.999992130632e-07, "loss": 0.0001, "reward": 1.5532057285308838, "reward_std": 0.16572755575180054, "rewards/accuracy_reward": 0.36992061138153076, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1618565171957016, "step": 26 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 306.5535888671875, "epoch": 0.0006099213879100028, "grad_norm": 5.297310237658889, "kl": 0.0027618408203125, "learning_rate": 9.999991488491753e-07, "loss": 0.0001, "reward": 1.5387083292007446, "reward_std": 0.09268178790807724, "rewards/accuracy_reward": 0.475754976272583, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.1879534125328064, "step": 27 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 277.76788330078125, "epoch": 0.0006325110689437065, "grad_norm": 19.12617526523046, "kl": 0.002899169921875, "learning_rate": 9.999990821169566e-07, "loss": 0.0001, "reward": 1.7241307497024536, "reward_std": 0.2401685118675232, "rewards/accuracy_reward": 0.47141021490097046, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2170061469078064, "step": 28 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 298.89288330078125, "epoch": 0.0006551007499774103, "grad_norm": 3.2514291676610605, "kl": 0.002838134765625, "learning_rate": 9.99999012866544e-07, "loss": 0.0001, "reward": 1.775895357131958, "reward_std": 0.16082066297531128, "rewards/accuracy_reward": 0.544003427028656, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2068917453289032, "step": 29 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 320.76788330078125, "epoch": 0.0006776904310111142, "grad_norm": 2.672792000012074, "kl": 0.0027008056640625, "learning_rate": 9.99998941097938e-07, "loss": 0.0001, "reward": 1.5602918863296509, "reward_std": 0.4049931764602661, "rewards/accuracy_reward": 0.38195058703422546, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1569126695394516, "step": 30 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 303.875, "epoch": 0.0007002801120448179, "grad_norm": 2.040278310729214, "kl": 0.002685546875, "learning_rate": 9.999988668111387e-07, "loss": 0.0001, "reward": 1.8214373588562012, "reward_std": 0.224795863032341, "rewards/accuracy_reward": 0.5807216167449951, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1800014078617096, "step": 31 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 246.48214721679688, "epoch": 0.0007228697930785217, "grad_norm": 2.674414422914148, "kl": 0.004852294921875, "learning_rate": 9.99998790006147e-07, "loss": 0.0002, "reward": 1.337849736213684, "reward_std": 0.11113186925649643, "rewards/accuracy_reward": 0.2047371119260788, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1366838812828064, "step": 32 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 239.1428680419922, "epoch": 0.0007454594741122256, "grad_norm": 2.77583960257884, "kl": 0.005523681640625, "learning_rate": 9.999987106829628e-07, "loss": 0.0002, "reward": 1.7895009517669678, "reward_std": 0.2917839288711548, "rewards/accuracy_reward": 0.6130952835083008, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1549769937992096, "step": 33 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 345.7500305175781, "epoch": 0.0007680491551459293, "grad_norm": 4.561573282566516, "kl": 0.0038909912109375, "learning_rate": 9.999986288415868e-07, "loss": 0.0002, "reward": 1.4711700677871704, "reward_std": 0.42309805750846863, "rewards/accuracy_reward": 0.36662033200263977, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.190264031291008, "step": 34 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 319.4464416503906, "epoch": 0.0007906388361796331, "grad_norm": 2.300979788920904, "kl": 0.0032501220703125, "learning_rate": 9.99998544482019e-07, "loss": 0.0001, "reward": 1.765289545059204, "reward_std": 0.5035722851753235, "rewards/accuracy_reward": 0.6024302244186401, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.1985735297203064, "step": 35 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 249.23214721679688, "epoch": 0.000813228517213337, "grad_norm": 3.9704454647795093, "kl": 0.0052490234375, "learning_rate": 9.999984576042604e-07, "loss": 0.0002, "reward": 1.4928638935089111, "reward_std": 0.2906460464000702, "rewards/accuracy_reward": 0.33677423000335693, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.134661003947258, "step": 36 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 310.3571472167969, "epoch": 0.0008358181982470407, "grad_norm": 2.563729142984342, "kl": 0.00482177734375, "learning_rate": 9.99998368208311e-07, "loss": 0.0002, "reward": 1.2960306406021118, "reward_std": 0.11960454285144806, "rewards/accuracy_reward": 0.2328888475894928, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.130998894572258, "step": 37 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 262.51788330078125, "epoch": 0.0008584078792807445, "grad_norm": 4.69292908167976, "kl": 0.004791259765625, "learning_rate": 9.999982762941716e-07, "loss": 0.0002, "reward": 1.6216871738433838, "reward_std": 0.3001430928707123, "rewards/accuracy_reward": 0.46037983894348145, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1613072007894516, "step": 38 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 279.4821472167969, "epoch": 0.0008809975603144484, "grad_norm": 4.941491429530855, "kl": 0.00634765625, "learning_rate": 9.999981818618423e-07, "loss": 0.0003, "reward": 1.8578033447265625, "reward_std": 0.15855510532855988, "rewards/accuracy_reward": 0.6015140414237976, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2277178168296814, "step": 39 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 289.4464416503906, "epoch": 0.0009035872413481522, "grad_norm": 2.958397810143743, "kl": 0.0052490234375, "learning_rate": 9.999980849113238e-07, "loss": 0.0002, "reward": 1.8886305093765259, "reward_std": 0.3033541440963745, "rewards/accuracy_reward": 0.6334407329559326, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2123325914144516, "step": 40 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 243.23214721679688, "epoch": 0.0009261769223818559, "grad_norm": 2.459481004757318, "kl": 0.008056640625, "learning_rate": 9.999979854426164e-07, "loss": 0.0003, "reward": 1.6557544469833374, "reward_std": 0.09515701234340668, "rewards/accuracy_reward": 0.42377549409866333, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2069789469242096, "step": 41 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 279.0714416503906, "epoch": 0.0009487666034155598, "grad_norm": 13.454441494859143, "kl": 0.006683349609375, "learning_rate": 9.999978834557209e-07, "loss": 0.0003, "reward": 1.5759981870651245, "reward_std": 0.19754809141159058, "rewards/accuracy_reward": 0.3867858052253723, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1570696234703064, "step": 42 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 326.9464416503906, "epoch": 0.0009713562844492636, "grad_norm": 1.8037648348188018, "kl": 0.005523681640625, "learning_rate": 9.999977789506375e-07, "loss": 0.0002, "reward": 1.3815943002700806, "reward_std": 0.23407310247421265, "rewards/accuracy_reward": 0.2865467965602875, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1200474351644516, "step": 43 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 307.3214416503906, "epoch": 0.0009939459654829674, "grad_norm": 1.6797630841137026, "kl": 0.006439208984375, "learning_rate": 9.99997671927367e-07, "loss": 0.0003, "reward": 1.696540355682373, "reward_std": 0.33146044611930847, "rewards/accuracy_reward": 0.5714285969734192, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1679687649011612, "step": 44 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 298.1785888671875, "epoch": 0.0010165356465166712, "grad_norm": 2.239224896899741, "kl": 0.0081787109375, "learning_rate": 9.999975623859097e-07, "loss": 0.0003, "reward": 1.776603102684021, "reward_std": 0.26186269521713257, "rewards/accuracy_reward": 0.4863826036453247, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.243791863322258, "step": 45 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 241.9285888671875, "epoch": 0.001039125327550375, "grad_norm": 2.785160629513963, "kl": 0.00823974609375, "learning_rate": 9.999974503262663e-07, "loss": 0.0003, "reward": 1.8448606729507446, "reward_std": 0.26661762595176697, "rewards/accuracy_reward": 0.5421716570854187, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2741176187992096, "step": 46 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 282.375, "epoch": 0.0010617150085840787, "grad_norm": 4.372551450264199, "kl": 0.009033203125, "learning_rate": 9.999973357484372e-07, "loss": 0.0004, "reward": 1.6573244333267212, "reward_std": 0.3321269452571869, "rewards/accuracy_reward": 0.4084787666797638, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1917027086019516, "step": 47 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 231.62501525878906, "epoch": 0.0010843046896177825, "grad_norm": 2.994509490379109, "kl": 0.00958251953125, "learning_rate": 9.999972186524233e-07, "loss": 0.0004, "reward": 2.0747501850128174, "reward_std": 0.12610064446926117, "rewards/accuracy_reward": 0.7976190447807312, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2557024359703064, "step": 48 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 291.375, "epoch": 0.0011068943706514865, "grad_norm": 14.657528876563694, "kl": 0.010986328125, "learning_rate": 9.999970990382248e-07, "loss": 0.0004, "reward": 1.5944451093673706, "reward_std": 0.25094521045684814, "rewards/accuracy_reward": 0.3522506058216095, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1779087632894516, "step": 49 }, { "all_correct": 0.0, "all_wrong": 0.5714285714285714, "completion_length": 342.5714416503906, "epoch": 0.0011294840516851902, "grad_norm": 1.640449879383109, "kl": 0.00689697265625, "learning_rate": 9.999969769058427e-07, "loss": 0.0003, "reward": 1.0606597661972046, "reward_std": 0.25446024537086487, "rewards/accuracy_reward": 0.08865220844745636, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.0577218234539032, "step": 50 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 378.8035888671875, "epoch": 0.001152073732718894, "grad_norm": 3.5395955083614354, "kl": 0.008056640625, "learning_rate": 9.999968522552773e-07, "loss": 0.0003, "reward": 1.7214101552963257, "reward_std": 0.18476925790309906, "rewards/accuracy_reward": 0.48565399646759033, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1857561469078064, "step": 51 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 295.3571472167969, "epoch": 0.0011746634137525978, "grad_norm": 2.24314950573788, "kl": 0.00994873046875, "learning_rate": 9.999967250865291e-07, "loss": 0.0004, "reward": 1.7975130081176758, "reward_std": 0.18266119062900543, "rewards/accuracy_reward": 0.5216061472892761, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2151925265789032, "step": 52 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 278.9821472167969, "epoch": 0.0011972530947863015, "grad_norm": 2.629749869225687, "kl": 0.00909423828125, "learning_rate": 9.999965953995994e-07, "loss": 0.0004, "reward": 1.6177189350128174, "reward_std": 0.15284644067287445, "rewards/accuracy_reward": 0.4238364100456238, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1581682562828064, "step": 53 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 412.5000305175781, "epoch": 0.0012198427758200055, "grad_norm": 1.6919810151445733, "kl": 0.007171630859375, "learning_rate": 9.999964631944882e-07, "loss": 0.0003, "reward": 1.357873797416687, "reward_std": 0.2546359896659851, "rewards/accuracy_reward": 0.3472056984901428, "rewards/format_reward": 0.8392857313156128, "rewards/semantic_reward": 0.1535252183675766, "step": 54 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 261.4464416503906, "epoch": 0.0012424324568537093, "grad_norm": 6.109065562737483, "kl": 0.01373291015625, "learning_rate": 9.999963284711964e-07, "loss": 0.0005, "reward": 1.801571249961853, "reward_std": 0.1169729232788086, "rewards/accuracy_reward": 0.5396397113800049, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.211931511759758, "step": 55 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 342.96429443359375, "epoch": 0.001265022137887413, "grad_norm": 2.590922541104905, "kl": 0.01263427734375, "learning_rate": 9.999961912297247e-07, "loss": 0.0005, "reward": 1.749110221862793, "reward_std": 0.22542926669120789, "rewards/accuracy_reward": 0.6307750940322876, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.2219063937664032, "step": 56 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 281.1071472167969, "epoch": 0.0012876118189211168, "grad_norm": 2.673601188999968, "kl": 0.0120849609375, "learning_rate": 9.999960514700738e-07, "loss": 0.0005, "reward": 1.868161678314209, "reward_std": 0.30648961663246155, "rewards/accuracy_reward": 0.6328274011611938, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2353341281414032, "step": 57 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 337.6785888671875, "epoch": 0.0013102014999548206, "grad_norm": 2.662061898254339, "kl": 0.00885009765625, "learning_rate": 9.999959091922443e-07, "loss": 0.0004, "reward": 1.8885148763656616, "reward_std": 0.19556757807731628, "rewards/accuracy_reward": 0.6166502833366394, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2325788289308548, "step": 58 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 379.1071472167969, "epoch": 0.0013327911809885244, "grad_norm": 2.2471936952483476, "kl": 0.009765625, "learning_rate": 9.99995764396237e-07, "loss": 0.0004, "reward": 1.7162055969238281, "reward_std": 0.33362504839897156, "rewards/accuracy_reward": 0.5517209768295288, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1859131008386612, "step": 59 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 344.6964416503906, "epoch": 0.0013553808620222283, "grad_norm": 2.040102413503151, "kl": 0.008544921875, "learning_rate": 9.999956170820525e-07, "loss": 0.0003, "reward": 2.07378888130188, "reward_std": 0.13334918022155762, "rewards/accuracy_reward": 0.7188606858253479, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2656424641609192, "step": 60 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 351.21429443359375, "epoch": 0.001377970543055932, "grad_norm": 2.8027709504849248, "kl": 0.01287841796875, "learning_rate": 9.99995467249692e-07, "loss": 0.0005, "reward": 1.6859304904937744, "reward_std": 0.3101873993873596, "rewards/accuracy_reward": 0.4234234094619751, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1946498453617096, "step": 61 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 317.7321472167969, "epoch": 0.0014005602240896359, "grad_norm": 3.1683484841034617, "kl": 0.0162353515625, "learning_rate": 9.999953148991557e-07, "loss": 0.0006, "reward": 1.7245506048202515, "reward_std": 0.3598234951496124, "rewards/accuracy_reward": 0.4370887577533722, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2196045070886612, "step": 62 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 366.46429443359375, "epoch": 0.0014231499051233396, "grad_norm": 5.736105423586736, "kl": 0.01531982421875, "learning_rate": 9.999951600304445e-07, "loss": 0.0006, "reward": 1.7207865715026855, "reward_std": 0.3779502511024475, "rewards/accuracy_reward": 0.47296464443206787, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1871076375246048, "step": 63 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 329.7857360839844, "epoch": 0.0014457395861570434, "grad_norm": 2.193118090471648, "kl": 0.01361083984375, "learning_rate": 9.999950026435592e-07, "loss": 0.0005, "reward": 1.8050733804702759, "reward_std": 0.22645244002342224, "rewards/accuracy_reward": 0.6100292205810547, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2236153781414032, "step": 64 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 412.3214416503906, "epoch": 0.0014683292671907472, "grad_norm": 3.161197682876552, "kl": 0.01190185546875, "learning_rate": 9.999948427385008e-07, "loss": 0.0005, "reward": 1.6848863363265991, "reward_std": 0.32298505306243896, "rewards/accuracy_reward": 0.4733191430568695, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1687099039554596, "step": 65 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 399.71429443359375, "epoch": 0.0014909189482244511, "grad_norm": 3.537494493612248, "kl": 0.01434326171875, "learning_rate": 9.999946803152698e-07, "loss": 0.0006, "reward": 1.5438339710235596, "reward_std": 0.283683717250824, "rewards/accuracy_reward": 0.38342657685279846, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.2211216688156128, "step": 66 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 378.51788330078125, "epoch": 0.001513508629258155, "grad_norm": 3.1918199434166787, "kl": 0.018310546875, "learning_rate": 9.999945153738673e-07, "loss": 0.0007, "reward": 1.5825293064117432, "reward_std": 0.32329702377319336, "rewards/accuracy_reward": 0.34691956639289856, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1784668117761612, "step": 67 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 282.9821472167969, "epoch": 0.0015360983102918587, "grad_norm": 11.537254710559504, "kl": 0.00933837890625, "learning_rate": 9.99994347914294e-07, "loss": 0.0004, "reward": 1.5897372961044312, "reward_std": 0.23620979487895966, "rewards/accuracy_reward": 0.4393536150455475, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.125383660197258, "step": 68 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 499.4464416503906, "epoch": 0.0015586879913255624, "grad_norm": 2.4261032629256745, "kl": 0.0157470703125, "learning_rate": 9.999941779365509e-07, "loss": 0.0006, "reward": 1.2903941869735718, "reward_std": 0.23809722065925598, "rewards/accuracy_reward": 0.2536700665950775, "rewards/format_reward": 0.8035714626312256, "rewards/semantic_reward": 0.1688668429851532, "step": 69 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 350.4464416503906, "epoch": 0.0015812776723592662, "grad_norm": 2.5186620239950552, "kl": 0.0181884765625, "learning_rate": 9.999940054406383e-07, "loss": 0.0007, "reward": 1.755478024482727, "reward_std": 0.2475968897342682, "rewards/accuracy_reward": 0.4314859211444855, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2347063422203064, "step": 70 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 353.1607360839844, "epoch": 0.0016038673533929702, "grad_norm": 6.5653240201122, "kl": 0.017333984375, "learning_rate": 9.999938304265578e-07, "loss": 0.0007, "reward": 1.9708224534988403, "reward_std": 0.09640859067440033, "rewards/accuracy_reward": 0.5979115962982178, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2871965765953064, "step": 71 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 384.21429443359375, "epoch": 0.001626457034426674, "grad_norm": 2.343579044580637, "kl": 0.0206298828125, "learning_rate": 9.999936528943099e-07, "loss": 0.0008, "reward": 1.5104985237121582, "reward_std": 0.2494249790906906, "rewards/accuracy_reward": 0.26836681365966797, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1849888414144516, "step": 72 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 301.76788330078125, "epoch": 0.0016490467154603777, "grad_norm": 2.457284178024847, "kl": 0.01324462890625, "learning_rate": 9.999934728438955e-07, "loss": 0.0005, "reward": 1.6059505939483643, "reward_std": 0.23824600875377655, "rewards/accuracy_reward": 0.36575445532798767, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1830531656742096, "step": 73 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 342.1785888671875, "epoch": 0.0016716363964940815, "grad_norm": 3.5081333826759855, "kl": 0.0211181640625, "learning_rate": 9.999932902753157e-07, "loss": 0.0008, "reward": 1.9671738147735596, "reward_std": 0.2702382206916809, "rewards/accuracy_reward": 0.660714328289032, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2636021375656128, "step": 74 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 374.46429443359375, "epoch": 0.0016942260775277853, "grad_norm": 3.8611322416316702, "kl": 0.01611328125, "learning_rate": 9.99993105188571e-07, "loss": 0.0006, "reward": 2.1729390621185303, "reward_std": 0.17918628454208374, "rewards/accuracy_reward": 0.7839810252189636, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2711007297039032, "step": 75 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 339.3571472167969, "epoch": 0.001716815758561489, "grad_norm": 2.245187457490534, "kl": 0.0191650390625, "learning_rate": 9.999929175836629e-07, "loss": 0.0008, "reward": 1.722460150718689, "reward_std": 0.06098560988903046, "rewards/accuracy_reward": 0.4698512852191925, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2097516804933548, "step": 76 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 326.375, "epoch": 0.001739405439595193, "grad_norm": 2.563949359456531, "kl": 0.018310546875, "learning_rate": 9.999927274605919e-07, "loss": 0.0007, "reward": 1.3095741271972656, "reward_std": 0.19897213578224182, "rewards/accuracy_reward": 0.13275711238384247, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.130388543009758, "step": 77 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 372.14288330078125, "epoch": 0.0017619951206288968, "grad_norm": 2.120097201443929, "kl": 0.017822265625, "learning_rate": 9.999925348193589e-07, "loss": 0.0007, "reward": 2.1010749340057373, "reward_std": 0.3495636582374573, "rewards/accuracy_reward": 0.7501367926597595, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.229509636759758, "step": 78 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.5535888671875, "epoch": 0.0017845848016626005, "grad_norm": 2.1448818816439847, "kl": 0.018310546875, "learning_rate": 9.999923396599655e-07, "loss": 0.0007, "reward": 1.8175077438354492, "reward_std": 0.27822551131248474, "rewards/accuracy_reward": 0.4437735974788666, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2558768391609192, "step": 79 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 399.9107360839844, "epoch": 0.0018071744826963043, "grad_norm": 2.072361816402814, "kl": 0.02099609375, "learning_rate": 9.99992141982412e-07, "loss": 0.0008, "reward": 1.8503257036209106, "reward_std": 0.19703464210033417, "rewards/accuracy_reward": 0.48855796456336975, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2581961750984192, "step": 80 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 406.9821472167969, "epoch": 0.001829764163730008, "grad_norm": 6.12774399873514, "kl": 0.0186767578125, "learning_rate": 9.999919417866997e-07, "loss": 0.0007, "reward": 1.7187696695327759, "reward_std": 0.27814918756484985, "rewards/accuracy_reward": 0.5437383055686951, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.2107456773519516, "step": 81 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 346.5000305175781, "epoch": 0.0018523538447637118, "grad_norm": 36.302963638838996, "kl": 0.016357421875, "learning_rate": 9.999917390728295e-07, "loss": 0.0007, "reward": 1.7390848398208618, "reward_std": 0.4014543294906616, "rewards/accuracy_reward": 0.4777775704860687, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1970214992761612, "step": 82 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 437.2500305175781, "epoch": 0.0018749435257974158, "grad_norm": 2.9462808154311286, "kl": 0.0157470703125, "learning_rate": 9.999915338408026e-07, "loss": 0.0006, "reward": 1.8972747325897217, "reward_std": 0.20582540333271027, "rewards/accuracy_reward": 0.6095165014266968, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2377581000328064, "step": 83 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 433.71429443359375, "epoch": 0.0018975332068311196, "grad_norm": 1.7203753357871974, "kl": 0.010986328125, "learning_rate": 9.999913260906199e-07, "loss": 0.0004, "reward": 1.4302936792373657, "reward_std": 0.48546090722084045, "rewards/accuracy_reward": 0.31749364733695984, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.155657097697258, "step": 84 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 431.1250305175781, "epoch": 0.0019201228878648233, "grad_norm": 4.936488300748457, "kl": 0.0159912109375, "learning_rate": 9.999911158222825e-07, "loss": 0.0006, "reward": 1.7334420680999756, "reward_std": 0.1703929752111435, "rewards/accuracy_reward": 0.5496112108230591, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.2481166422367096, "step": 85 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 278.3571472167969, "epoch": 0.0019427125688985271, "grad_norm": 2.3380332270165622, "kl": 0.0194091796875, "learning_rate": 9.999909030357913e-07, "loss": 0.0008, "reward": 1.9962228536605835, "reward_std": 0.18772628903388977, "rewards/accuracy_reward": 0.7142857313156128, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2355085164308548, "step": 86 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 356.1785888671875, "epoch": 0.001965302249932231, "grad_norm": 1.9843251386617178, "kl": 0.01708984375, "learning_rate": 9.999906877311476e-07, "loss": 0.0007, "reward": 1.6117990016937256, "reward_std": 0.3437996804714203, "rewards/accuracy_reward": 0.3997470736503601, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1656232625246048, "step": 87 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 506.1250305175781, "epoch": 0.001987891930965935, "grad_norm": 1.2189191949546587, "kl": 0.01165771484375, "learning_rate": 9.999904699083524e-07, "loss": 0.0005, "reward": 1.3678876161575317, "reward_std": 0.5139690637588501, "rewards/accuracy_reward": 0.3125792443752289, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.1553083211183548, "step": 88 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 346.9464416503906, "epoch": 0.0020104816119996386, "grad_norm": 4.305714446756029, "kl": 0.0203857421875, "learning_rate": 9.999902495674067e-07, "loss": 0.0008, "reward": 1.7097487449645996, "reward_std": 0.3023189306259155, "rewards/accuracy_reward": 0.4294804334640503, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1766967922449112, "step": 89 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 388.5357360839844, "epoch": 0.0020330712930333424, "grad_norm": 4.710500100819524, "kl": 0.01275634765625, "learning_rate": 9.999900267083116e-07, "loss": 0.0005, "reward": 2.009028673171997, "reward_std": 0.384177565574646, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2602190375328064, "step": 90 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 338.1964416503906, "epoch": 0.002055660974067046, "grad_norm": 3.150263815602629, "kl": 0.0225830078125, "learning_rate": 9.999898013310685e-07, "loss": 0.0009, "reward": 2.1727542877197266, "reward_std": 0.15739862620830536, "rewards/accuracy_reward": 0.7570979595184326, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2977992594242096, "step": 91 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 344.26788330078125, "epoch": 0.00207825065510075, "grad_norm": 4.365416291770933, "kl": 0.0185546875, "learning_rate": 9.999895734356783e-07, "loss": 0.0007, "reward": 1.6886008977890015, "reward_std": 0.17356941103935242, "rewards/accuracy_reward": 0.4264250695705414, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1943185031414032, "step": 92 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 306.51788330078125, "epoch": 0.0021008403361344537, "grad_norm": 1.4925060064532536, "kl": 0.0162353515625, "learning_rate": 9.99989343022142e-07, "loss": 0.0007, "reward": 1.6857235431671143, "reward_std": 0.17175927758216858, "rewards/accuracy_reward": 0.4833343029022217, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1809605211019516, "step": 93 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 419.64288330078125, "epoch": 0.0021234300171681575, "grad_norm": 6.405741028770789, "kl": 0.016357421875, "learning_rate": 9.999891100904613e-07, "loss": 0.0007, "reward": 1.422725796699524, "reward_std": 0.33845674991607666, "rewards/accuracy_reward": 0.25856199860572815, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1427350789308548, "step": 94 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 356.1071472167969, "epoch": 0.0021460196982018612, "grad_norm": 2.240784908062342, "kl": 0.016845703125, "learning_rate": 9.999888746406367e-07, "loss": 0.0007, "reward": 2.0631675720214844, "reward_std": 0.4611093997955322, "rewards/accuracy_reward": 0.6872678995132446, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2973284125328064, "step": 95 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 381.08929443359375, "epoch": 0.002168609379235565, "grad_norm": 2.0841183762471105, "kl": 0.01544189453125, "learning_rate": 9.999886366726698e-07, "loss": 0.0006, "reward": 1.8259668350219727, "reward_std": 0.28112825751304626, "rewards/accuracy_reward": 0.5528953671455383, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2373570203781128, "step": 96 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 459.5714416503906, "epoch": 0.002191199060269269, "grad_norm": 1.3942103039957299, "kl": 0.01312255859375, "learning_rate": 9.999883961865618e-07, "loss": 0.0005, "reward": 1.3793435096740723, "reward_std": 0.5034505724906921, "rewards/accuracy_reward": 0.3050461709499359, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.1635829508304596, "step": 97 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 353.89288330078125, "epoch": 0.002213788741302973, "grad_norm": 1.7706643793695105, "kl": 0.023681640625, "learning_rate": 9.999881531823138e-07, "loss": 0.0009, "reward": 1.4240481853485107, "reward_std": 0.19863584637641907, "rewards/accuracy_reward": 0.23172813653945923, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1280343234539032, "step": 98 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 367.51788330078125, "epoch": 0.0022363784223366767, "grad_norm": 2.0783065153045195, "kl": 0.0186767578125, "learning_rate": 9.99987907659927e-07, "loss": 0.0007, "reward": 2.1288983821868896, "reward_std": 0.15431101620197296, "rewards/accuracy_reward": 0.7901461124420166, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2387520968914032, "step": 99 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 338.46429443359375, "epoch": 0.0022589681033703805, "grad_norm": 1.7508279727690879, "kl": 0.0194091796875, "learning_rate": 9.999876596194028e-07, "loss": 0.0008, "reward": 1.5328783988952637, "reward_std": 0.19361238181591034, "rewards/accuracy_reward": 0.2956293225288391, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1801060289144516, "step": 100 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 352.26788330078125, "epoch": 0.0022815577844040843, "grad_norm": 6.6134146269164, "kl": 0.023193359375, "learning_rate": 9.999874090607421e-07, "loss": 0.0009, "reward": 1.748439073562622, "reward_std": 0.2283543050289154, "rewards/accuracy_reward": 0.4493003487586975, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2062813937664032, "step": 101 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 358.4285888671875, "epoch": 0.002304147465437788, "grad_norm": 3.089086454935301, "kl": 0.0201416015625, "learning_rate": 9.999871559839466e-07, "loss": 0.0008, "reward": 1.9395480155944824, "reward_std": 0.19848059117794037, "rewards/accuracy_reward": 0.6442669034004211, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2524239718914032, "step": 102 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 378.6785888671875, "epoch": 0.0023267371464714918, "grad_norm": 49.60928241481453, "kl": 0.0203857421875, "learning_rate": 9.999869003890173e-07, "loss": 0.0008, "reward": 1.5367333889007568, "reward_std": 0.40028896927833557, "rewards/accuracy_reward": 0.3652106523513794, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1500941812992096, "step": 103 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 375.8571472167969, "epoch": 0.0023493268275051955, "grad_norm": 2.7966293473638535, "kl": 0.0198974609375, "learning_rate": 9.999866422759555e-07, "loss": 0.0008, "reward": 1.6062923669815063, "reward_std": 0.20040859282016754, "rewards/accuracy_reward": 0.42199406027793884, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1735839992761612, "step": 104 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 369.5535888671875, "epoch": 0.0023719165085388993, "grad_norm": 1.6270152223980043, "kl": 0.017822265625, "learning_rate": 9.999863816447627e-07, "loss": 0.0007, "reward": 1.5190186500549316, "reward_std": 0.20312291383743286, "rewards/accuracy_reward": 0.392857164144516, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1190185621380806, "step": 105 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 425.5714416503906, "epoch": 0.002394506189572603, "grad_norm": 1.5426341013655902, "kl": 0.01708984375, "learning_rate": 9.9998611849544e-07, "loss": 0.0007, "reward": 1.5620324611663818, "reward_std": 0.24406620860099792, "rewards/accuracy_reward": 0.36226609349250793, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.153337761759758, "step": 106 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 398.2321472167969, "epoch": 0.002417095870606307, "grad_norm": 2.170020240388684, "kl": 0.0208740234375, "learning_rate": 9.999858528279888e-07, "loss": 0.0008, "reward": 1.8127262592315674, "reward_std": 0.2272806614637375, "rewards/accuracy_reward": 0.5065407752990723, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.241899773478508, "step": 107 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 426.0000305175781, "epoch": 0.002439685551640011, "grad_norm": 4.027768823215243, "kl": 0.0185546875, "learning_rate": 9.999855846424103e-07, "loss": 0.0007, "reward": 1.6299974918365479, "reward_std": 0.33794936537742615, "rewards/accuracy_reward": 0.4788743257522583, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.1689801961183548, "step": 108 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 415.0714416503906, "epoch": 0.002462275232673715, "grad_norm": 3.2815180567406963, "kl": 0.018798828125, "learning_rate": 9.999853139387062e-07, "loss": 0.0008, "reward": 1.6855875253677368, "reward_std": 0.13668176531791687, "rewards/accuracy_reward": 0.36573532223701477, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2412807047367096, "step": 109 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.2857142857142857, "completion_length": 322.01788330078125, "epoch": 0.0024848649137074186, "grad_norm": 1.7231895267170834, "kl": 0.0198974609375, "learning_rate": 9.999850407168776e-07, "loss": 0.0008, "reward": 1.836098074913025, "reward_std": 0.15268215537071228, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1789550930261612, "step": 110 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 386.7500305175781, "epoch": 0.0025074545947411223, "grad_norm": 2.275400535625846, "kl": 0.0216064453125, "learning_rate": 9.999847649769258e-07, "loss": 0.0009, "reward": 1.638094425201416, "reward_std": 0.18520566821098328, "rewards/accuracy_reward": 0.3737911880016327, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2143031656742096, "step": 111 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 493.0357360839844, "epoch": 0.002530044275774826, "grad_norm": 1.9113830124847069, "kl": 0.01300048828125, "learning_rate": 9.999844867188523e-07, "loss": 0.0005, "reward": 1.5321661233901978, "reward_std": 0.24314944446086884, "rewards/accuracy_reward": 0.3985130488872528, "rewards/format_reward": 0.8392857313156128, "rewards/semantic_reward": 0.2086530476808548, "step": 112 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 409.7857360839844, "epoch": 0.00255263395680853, "grad_norm": 1.7413093086521638, "kl": 0.0137939453125, "learning_rate": 9.999842059426587e-07, "loss": 0.0006, "reward": 1.4934582710266113, "reward_std": 0.2087702453136444, "rewards/accuracy_reward": 0.41432178020477295, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.132707878947258, "step": 113 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 388.46429443359375, "epoch": 0.0025752236378422336, "grad_norm": 1.6867045541025323, "kl": 0.0194091796875, "learning_rate": 9.99983922648346e-07, "loss": 0.0008, "reward": 1.4838807582855225, "reward_std": 0.3665664792060852, "rewards/accuracy_reward": 0.34376487135887146, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.172258660197258, "step": 114 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 370.39288330078125, "epoch": 0.0025978133188759374, "grad_norm": 2.011041259433456, "kl": 0.01904296875, "learning_rate": 9.99983636835916e-07, "loss": 0.0008, "reward": 1.514014720916748, "reward_std": 0.3330693244934082, "rewards/accuracy_reward": 0.320519357919693, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1542096883058548, "step": 115 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 420.3750305175781, "epoch": 0.002620402999909641, "grad_norm": 2.4027540765094915, "kl": 0.017822265625, "learning_rate": 9.999833485053698e-07, "loss": 0.0007, "reward": 1.8531745672225952, "reward_std": 0.38550055027008057, "rewards/accuracy_reward": 0.6111615300178528, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.2420131266117096, "step": 116 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 342.8214416503906, "epoch": 0.002642992680943345, "grad_norm": 4.687314062965966, "kl": 0.024169921875, "learning_rate": 9.999830576567093e-07, "loss": 0.001, "reward": 1.7342420816421509, "reward_std": 0.28500092029571533, "rewards/accuracy_reward": 0.4455876350402832, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1922258734703064, "step": 117 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 382.1964416503906, "epoch": 0.0026655823619770487, "grad_norm": 3.123534791522741, "kl": 0.0250244140625, "learning_rate": 9.999827642899357e-07, "loss": 0.001, "reward": 1.6465975046157837, "reward_std": 0.2188887745141983, "rewards/accuracy_reward": 0.3696233630180359, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2019740641117096, "step": 118 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 374.58929443359375, "epoch": 0.002688172043010753, "grad_norm": 4.488855309918636, "kl": 0.022705078125, "learning_rate": 9.999824684050502e-07, "loss": 0.0009, "reward": 1.7431284189224243, "reward_std": 0.21455271542072296, "rewards/accuracy_reward": 0.4789334237575531, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2106236070394516, "step": 119 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 360.26788330078125, "epoch": 0.0027107617240444567, "grad_norm": 1.2481273882754127, "kl": 0.017578125, "learning_rate": 9.999821700020548e-07, "loss": 0.0007, "reward": 1.6655726432800293, "reward_std": 0.22231166064739227, "rewards/accuracy_reward": 0.4642857313156128, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1584298312664032, "step": 120 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 353.4107360839844, "epoch": 0.0027333514050781604, "grad_norm": 2.095514462297732, "kl": 0.0203857421875, "learning_rate": 9.999818690809507e-07, "loss": 0.0008, "reward": 1.8762714862823486, "reward_std": 0.22488775849342346, "rewards/accuracy_reward": 0.5324935913085938, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2652064859867096, "step": 121 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 368.5714416503906, "epoch": 0.002755941086111864, "grad_norm": 3.4422836798815744, "kl": 0.0262451171875, "learning_rate": 9.999815656417396e-07, "loss": 0.0011, "reward": 1.8636047840118408, "reward_std": 0.130012646317482, "rewards/accuracy_reward": 0.4698476195335388, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2794712781906128, "step": 122 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 377.6250305175781, "epoch": 0.002778530767145568, "grad_norm": 1.3053692170507731, "kl": 0.0179443359375, "learning_rate": 9.99981259684423e-07, "loss": 0.0007, "reward": 1.3255650997161865, "reward_std": 0.46355366706848145, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0612793006002903, "step": 123 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 392.8214416503906, "epoch": 0.0028011204481792717, "grad_norm": 3.1401401080537408, "kl": 0.0234375, "learning_rate": 9.99980951209002e-07, "loss": 0.0009, "reward": 1.5563733577728271, "reward_std": 0.3404160439968109, "rewards/accuracy_reward": 0.33171510696411133, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1853725016117096, "step": 124 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 418.0000305175781, "epoch": 0.0028237101292129755, "grad_norm": 7.319931063936742, "kl": 0.02294921875, "learning_rate": 9.999806402154789e-07, "loss": 0.0009, "reward": 1.5999630689620972, "reward_std": 0.12155549228191376, "rewards/accuracy_reward": 0.4473264813423157, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.2026367336511612, "step": 125 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 346.2500305175781, "epoch": 0.0028462998102466793, "grad_norm": 2.1499278938035227, "kl": 0.0228271484375, "learning_rate": 9.999803267038547e-07, "loss": 0.0009, "reward": 1.5019891262054443, "reward_std": 0.27324503660202026, "rewards/accuracy_reward": 0.27232247591018677, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1582380086183548, "step": 126 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 438.0714416503906, "epoch": 0.002868889491280383, "grad_norm": 3.0136355074220664, "kl": 0.01904296875, "learning_rate": 9.999800106741312e-07, "loss": 0.0008, "reward": 1.4904602766036987, "reward_std": 0.4437495470046997, "rewards/accuracy_reward": 0.374158650636673, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.1734444797039032, "step": 127 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 409.89288330078125, "epoch": 0.002891479172314087, "grad_norm": 3.0933623687568526, "kl": 0.017578125, "learning_rate": 9.9997969212631e-07, "loss": 0.0007, "reward": 1.5489513874053955, "reward_std": 0.41920265555381775, "rewards/accuracy_reward": 0.37250736355781555, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1407296359539032, "step": 128 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 503.2500305175781, "epoch": 0.0029140688533477906, "grad_norm": 2.2605712719708935, "kl": 0.01611328125, "learning_rate": 9.999793710603926e-07, "loss": 0.0006, "reward": 1.7169748544692993, "reward_std": 0.3974662125110626, "rewards/accuracy_reward": 0.5262347459793091, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.1871686726808548, "step": 129 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 356.3571472167969, "epoch": 0.0029366585343814943, "grad_norm": 1.9140473457484624, "kl": 0.0228271484375, "learning_rate": 9.999790474763807e-07, "loss": 0.0009, "reward": 1.6198030710220337, "reward_std": 0.31119823455810547, "rewards/accuracy_reward": 0.42793628573417664, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1454380601644516, "step": 130 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 363.21429443359375, "epoch": 0.0029592482154151985, "grad_norm": 2.0937978034459084, "kl": 0.0211181640625, "learning_rate": 9.999787213742759e-07, "loss": 0.0008, "reward": 1.8737727403640747, "reward_std": 0.09499320387840271, "rewards/accuracy_reward": 0.49844130873680115, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2503313422203064, "step": 131 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 354.1607360839844, "epoch": 0.0029818378964489023, "grad_norm": 2.037731406370498, "kl": 0.0274658203125, "learning_rate": 9.9997839275408e-07, "loss": 0.0011, "reward": 1.8720669746398926, "reward_std": 0.24881035089492798, "rewards/accuracy_reward": 0.5991212129592896, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1979457437992096, "step": 132 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 391.01788330078125, "epoch": 0.003004427577482606, "grad_norm": 3.342085052875403, "kl": 0.0267333984375, "learning_rate": 9.999780616157941e-07, "loss": 0.0011, "reward": 1.7691378593444824, "reward_std": 0.19508019089698792, "rewards/accuracy_reward": 0.4241183400154114, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2343052625656128, "step": 133 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 399.2321472167969, "epoch": 0.00302701725851631, "grad_norm": 2.2211651354529405, "kl": 0.0225830078125, "learning_rate": 9.999777279594206e-07, "loss": 0.0009, "reward": 1.7196989059448242, "reward_std": 0.26120254397392273, "rewards/accuracy_reward": 0.42592084407806396, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1937779039144516, "step": 134 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 354.0535888671875, "epoch": 0.0030496069395500136, "grad_norm": 1.9768897218183903, "kl": 0.0257568359375, "learning_rate": 9.999773917849607e-07, "loss": 0.001, "reward": 1.8416846990585327, "reward_std": 0.24926678836345673, "rewards/accuracy_reward": 0.5543625950813293, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2016078531742096, "step": 135 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 350.4107360839844, "epoch": 0.0030721966205837174, "grad_norm": 1.9321401203036743, "kl": 0.0279541015625, "learning_rate": 9.999770530924162e-07, "loss": 0.0011, "reward": 1.7570710182189941, "reward_std": 0.2582797706127167, "rewards/accuracy_reward": 0.48768284916877747, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2122454047203064, "step": 136 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 412.7500305175781, "epoch": 0.003094786301617421, "grad_norm": 3.6147672370569057, "kl": 0.022216796875, "learning_rate": 9.99976711881789e-07, "loss": 0.0009, "reward": 1.7630302906036377, "reward_std": 0.32883259654045105, "rewards/accuracy_reward": 0.4746582806110382, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.188371941447258, "step": 137 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 386.6964416503906, "epoch": 0.003117375982651125, "grad_norm": 3.7879983961413703, "kl": 0.0238037109375, "learning_rate": 9.999763681530807e-07, "loss": 0.001, "reward": 1.7791645526885986, "reward_std": 0.17284826934337616, "rewards/accuracy_reward": 0.4656425416469574, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2170933485031128, "step": 138 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 406.89288330078125, "epoch": 0.0031399656636848287, "grad_norm": 2.0186075254094478, "kl": 0.0233154296875, "learning_rate": 9.99976021906293e-07, "loss": 0.0009, "reward": 1.9628583192825317, "reward_std": 0.2916502356529236, "rewards/accuracy_reward": 0.5476412177085876, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2759312391281128, "step": 139 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 414.5714416503906, "epoch": 0.0031625553447185324, "grad_norm": 1.9606048071396551, "kl": 0.0224609375, "learning_rate": 9.999756731414275e-07, "loss": 0.0009, "reward": 2.0476818084716797, "reward_std": 0.3111831545829773, "rewards/accuracy_reward": 0.6538689732551575, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2652413547039032, "step": 140 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 468.857177734375, "epoch": 0.003185145025752236, "grad_norm": 5.176242293352078, "kl": 0.019287109375, "learning_rate": 9.99975321858486e-07, "loss": 0.0008, "reward": 1.7432301044464111, "reward_std": 0.24894970655441284, "rewards/accuracy_reward": 0.4553811550140381, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2235630750656128, "step": 141 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 375.83929443359375, "epoch": 0.0032077347067859404, "grad_norm": 1.8372643085528668, "kl": 0.02685546875, "learning_rate": 9.999749680574708e-07, "loss": 0.0011, "reward": 1.6276352405548096, "reward_std": 0.22988098859786987, "rewards/accuracy_reward": 0.38103562593460083, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1751709133386612, "step": 142 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 425.3035888671875, "epoch": 0.003230324387819644, "grad_norm": 2.366796462319782, "kl": 0.0238037109375, "learning_rate": 9.999746117383828e-07, "loss": 0.001, "reward": 1.8319493532180786, "reward_std": 0.16736756265163422, "rewards/accuracy_reward": 0.5515817403793335, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1982247531414032, "step": 143 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 476.1785888671875, "epoch": 0.003252914068853348, "grad_norm": 2.1803179947257108, "kl": 0.01806640625, "learning_rate": 9.999742529012246e-07, "loss": 0.0007, "reward": 1.28599214553833, "reward_std": 0.32084840536117554, "rewards/accuracy_reward": 0.27173078060150146, "rewards/format_reward": 0.8392857313156128, "rewards/semantic_reward": 0.1178327351808548, "step": 144 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 381.1071472167969, "epoch": 0.0032755037498870517, "grad_norm": 1.1593158548995632, "kl": 0.0203857421875, "learning_rate": 9.999738915459974e-07, "loss": 0.0008, "reward": 1.3375916481018066, "reward_std": 0.03852906450629234, "rewards/accuracy_reward": 0.19681313633918762, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1014927476644516, "step": 145 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 390.58929443359375, "epoch": 0.0032980934309207554, "grad_norm": 5.09918383366431, "kl": 0.0286865234375, "learning_rate": 9.999735276727034e-07, "loss": 0.0011, "reward": 2.0727930068969727, "reward_std": 0.20468464493751526, "rewards/accuracy_reward": 0.6188901662826538, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3039027750492096, "step": 146 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 396.26788330078125, "epoch": 0.003320683111954459, "grad_norm": 3.6494181528943073, "kl": 0.02734375, "learning_rate": 9.999731612813442e-07, "loss": 0.0011, "reward": 1.8537925481796265, "reward_std": 0.07046861946582794, "rewards/accuracy_reward": 0.48045435547828674, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2340523898601532, "step": 147 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 444.33929443359375, "epoch": 0.003343272792988163, "grad_norm": 4.089741071487933, "kl": 0.0201416015625, "learning_rate": 9.99972792371922e-07, "loss": 0.0008, "reward": 1.6263591051101685, "reward_std": 0.43292203545570374, "rewards/accuracy_reward": 0.4400378167629242, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.211321160197258, "step": 148 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 400.9285888671875, "epoch": 0.0033658624740218667, "grad_norm": 7.938151516884764, "kl": 0.02783203125, "learning_rate": 9.999724209444382e-07, "loss": 0.0011, "reward": 1.7731953859329224, "reward_std": 0.14121007919311523, "rewards/accuracy_reward": 0.4354563355445862, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2270246297121048, "step": 149 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 462.6785888671875, "epoch": 0.0033884521550555705, "grad_norm": 8.785235380684513, "kl": 0.0244140625, "learning_rate": 9.999720469988947e-07, "loss": 0.001, "reward": 1.6047743558883667, "reward_std": 0.3982895612716675, "rewards/accuracy_reward": 0.4060019254684448, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.2416294813156128, "step": 150 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 417.01788330078125, "epoch": 0.0034110418360892743, "grad_norm": 2.3204900400794717, "kl": 0.01416015625, "learning_rate": 9.999716705352938e-07, "loss": 0.0006, "reward": 1.942742109298706, "reward_std": 0.31884098052978516, "rewards/accuracy_reward": 0.6230748295783997, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2660958468914032, "step": 151 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 405.08929443359375, "epoch": 0.003433631517122978, "grad_norm": 4.889068661025964, "kl": 0.0233154296875, "learning_rate": 9.999712915536371e-07, "loss": 0.0009, "reward": 1.5757335424423218, "reward_std": 0.2517612874507904, "rewards/accuracy_reward": 0.3336332142353058, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1528145968914032, "step": 152 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 377.9107360839844, "epoch": 0.0034562211981566822, "grad_norm": 3.1660118555476284, "kl": 0.0255126953125, "learning_rate": 9.999709100539267e-07, "loss": 0.001, "reward": 1.8141776323318481, "reward_std": 0.14290374517440796, "rewards/accuracy_reward": 0.5103864073753357, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2073625922203064, "step": 153 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 446.58929443359375, "epoch": 0.003478810879190386, "grad_norm": 10.154974907500957, "kl": 0.023681640625, "learning_rate": 9.999705260361645e-07, "loss": 0.0009, "reward": 1.8778598308563232, "reward_std": 0.25315749645233154, "rewards/accuracy_reward": 0.6199218034744263, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2257952094078064, "step": 154 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 378.51788330078125, "epoch": 0.0035014005602240898, "grad_norm": 3.3992501916776017, "kl": 0.0233154296875, "learning_rate": 9.999701395003518e-07, "loss": 0.0009, "reward": 1.8045752048492432, "reward_std": 0.36261504888534546, "rewards/accuracy_reward": 0.5094229578971863, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2094377875328064, "step": 155 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 417.21429443359375, "epoch": 0.0035239902412577935, "grad_norm": 10.803361913009315, "kl": 0.029296875, "learning_rate": 9.999697504464915e-07, "loss": 0.0012, "reward": 1.3872826099395752, "reward_std": 0.3104549050331116, "rewards/accuracy_reward": 0.25430092215538025, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.1722673773765564, "step": 156 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 359.58929443359375, "epoch": 0.0035465799222914973, "grad_norm": 1.9757196119997777, "kl": 0.020263671875, "learning_rate": 9.99969358874585e-07, "loss": 0.0008, "reward": 1.6730841398239136, "reward_std": 0.36305803060531616, "rewards/accuracy_reward": 0.4523809552192688, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.167131707072258, "step": 157 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 360.2857360839844, "epoch": 0.003569169603325201, "grad_norm": 2.2021484743874487, "kl": 0.0260009765625, "learning_rate": 9.999689647846344e-07, "loss": 0.001, "reward": 1.923832893371582, "reward_std": 0.26496636867523193, "rewards/accuracy_reward": 0.5360047817230225, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2913993000984192, "step": 158 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 394.1071472167969, "epoch": 0.003591759284358905, "grad_norm": 2.5088572923653514, "kl": 0.02099609375, "learning_rate": 9.999685681766418e-07, "loss": 0.0008, "reward": 1.8282283544540405, "reward_std": 0.2286648452281952, "rewards/accuracy_reward": 0.5346910953521729, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2149658352136612, "step": 159 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 354.7500305175781, "epoch": 0.0036143489653926086, "grad_norm": 1.8544088371280547, "kl": 0.0269775390625, "learning_rate": 9.99968169050609e-07, "loss": 0.0011, "reward": 1.8628973960876465, "reward_std": 0.06557013094425201, "rewards/accuracy_reward": 0.5822159051895142, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.180681511759758, "step": 160 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 397.39288330078125, "epoch": 0.0036369386464263124, "grad_norm": 15.150845593760243, "kl": 0.0283203125, "learning_rate": 9.999677674065381e-07, "loss": 0.0011, "reward": 1.7887355089187622, "reward_std": 0.196210578083992, "rewards/accuracy_reward": 0.4657164216041565, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2194475531578064, "step": 161 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 460.58929443359375, "epoch": 0.003659528327460016, "grad_norm": 1.3814219438290327, "kl": 0.0177001953125, "learning_rate": 9.999673632444309e-07, "loss": 0.0007, "reward": 1.3710510730743408, "reward_std": 0.21893078088760376, "rewards/accuracy_reward": 0.24575094878673553, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1110142320394516, "step": 162 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 384.8214416503906, "epoch": 0.00368211800849372, "grad_norm": 2.072594005659272, "kl": 0.0267333984375, "learning_rate": 9.9996695656429e-07, "loss": 0.0011, "reward": 1.817350149154663, "reward_std": 0.25549912452697754, "rewards/accuracy_reward": 0.5029736757278442, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2358049750328064, "step": 163 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 362.7500305175781, "epoch": 0.0037047076895274237, "grad_norm": 2.52114277002413, "kl": 0.0244140625, "learning_rate": 9.99966547366117e-07, "loss": 0.001, "reward": 2.024670124053955, "reward_std": 0.3001940846443176, "rewards/accuracy_reward": 0.6664702892303467, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.243913933634758, "step": 164 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 383.4285888671875, "epoch": 0.003727297370561128, "grad_norm": 5.043704293088645, "kl": 0.0255126953125, "learning_rate": 9.999661356499138e-07, "loss": 0.001, "reward": 1.9222890138626099, "reward_std": 0.17921097576618195, "rewards/accuracy_reward": 0.6488096117973328, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2270507961511612, "step": 165 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 421.5357360839844, "epoch": 0.0037498870515948316, "grad_norm": 1.9522686608169713, "kl": 0.02294921875, "learning_rate": 9.999657214156831e-07, "loss": 0.0009, "reward": 1.5736730098724365, "reward_std": 0.1805059313774109, "rewards/accuracy_reward": 0.48274973034858704, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.1516374945640564, "step": 166 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 423.76788330078125, "epoch": 0.0037724767326285354, "grad_norm": 2.676711705548722, "kl": 0.0272216796875, "learning_rate": 9.999653046634264e-07, "loss": 0.0011, "reward": 1.7832709550857544, "reward_std": 0.10621102154254913, "rewards/accuracy_reward": 0.5806463360786438, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.2347673773765564, "step": 167 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 362.33929443359375, "epoch": 0.003795066413662239, "grad_norm": 1.342436061183843, "kl": 0.02294921875, "learning_rate": 9.999648853931463e-07, "loss": 0.0009, "reward": 1.596840739250183, "reward_std": 0.2909846901893616, "rewards/accuracy_reward": 0.4061146378517151, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1550118625164032, "step": 168 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 405.0000305175781, "epoch": 0.003817656094695943, "grad_norm": 1.7913000649160955, "kl": 0.0247802734375, "learning_rate": 9.999644636048443e-07, "loss": 0.001, "reward": 1.6472485065460205, "reward_std": 0.4147769510746002, "rewards/accuracy_reward": 0.4294261932373047, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1606794148683548, "step": 169 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 432.3750305175781, "epoch": 0.0038402457757296467, "grad_norm": 1.6479546944329353, "kl": 0.0224609375, "learning_rate": 9.999640392985228e-07, "loss": 0.0009, "reward": 1.8749128580093384, "reward_std": 0.23895509541034698, "rewards/accuracy_reward": 0.5528199076652527, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.225664421916008, "step": 170 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 365.1071472167969, "epoch": 0.0038628354567633505, "grad_norm": 2.6522350714662037, "kl": 0.030517578125, "learning_rate": 9.999636124741842e-07, "loss": 0.0012, "reward": 2.119114637374878, "reward_std": 0.09549988061189651, "rewards/accuracy_reward": 0.7311227321624756, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2629917860031128, "step": 171 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 402.6785888671875, "epoch": 0.0038854251377970542, "grad_norm": 2.6888890498072717, "kl": 0.0220947265625, "learning_rate": 9.999631831318304e-07, "loss": 0.0009, "reward": 1.77900230884552, "reward_std": 0.32349154353141785, "rewards/accuracy_reward": 0.5238646864891052, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1765659898519516, "step": 172 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.5714285714285714, "completion_length": 512.6428833007812, "epoch": 0.003908014818830758, "grad_norm": 8.321728398939015, "kl": 0.016357421875, "learning_rate": 9.999627512714637e-07, "loss": 0.0007, "reward": 1.3909040689468384, "reward_std": 0.17334648966789246, "rewards/accuracy_reward": 0.4285714626312256, "rewards/format_reward": 0.785714328289032, "rewards/semantic_reward": 0.1230468824505806, "step": 173 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 353.1071472167969, "epoch": 0.003930604499864462, "grad_norm": 5.036270978856433, "kl": 0.03173828125, "learning_rate": 9.999623168930862e-07, "loss": 0.0013, "reward": 1.7252103090286255, "reward_std": 0.26137059926986694, "rewards/accuracy_reward": 0.42446741461753845, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2078857570886612, "step": 174 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 421.7500305175781, "epoch": 0.0039531941808981655, "grad_norm": 2.241089433352706, "kl": 0.0233154296875, "learning_rate": 9.999618799966997e-07, "loss": 0.0009, "reward": 1.7218332290649414, "reward_std": 0.17630676925182343, "rewards/accuracy_reward": 0.42490747570991516, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1969255805015564, "step": 175 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 391.9285888671875, "epoch": 0.00397578386193187, "grad_norm": 6.788559889656986, "kl": 0.02685546875, "learning_rate": 9.99961440582307e-07, "loss": 0.0011, "reward": 1.8318644762039185, "reward_std": 0.25318095088005066, "rewards/accuracy_reward": 0.5578845739364624, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1775512844324112, "step": 176 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 375.26788330078125, "epoch": 0.003998373542965573, "grad_norm": 11.797488010084596, "kl": 0.033203125, "learning_rate": 9.999609986499103e-07, "loss": 0.0013, "reward": 1.3787106275558472, "reward_std": 0.04590386897325516, "rewards/accuracy_reward": 0.20761150121688843, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0996704176068306, "step": 177 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 396.9821472167969, "epoch": 0.004020963223999277, "grad_norm": 2.677675878116268, "kl": 0.02734375, "learning_rate": 9.999605541995114e-07, "loss": 0.0011, "reward": 1.894924283027649, "reward_std": 0.196466863155365, "rewards/accuracy_reward": 0.5127743482589722, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2571498453617096, "step": 178 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 379.76788330078125, "epoch": 0.004043552905032981, "grad_norm": 2.938470918462845, "kl": 0.03173828125, "learning_rate": 9.999601072311127e-07, "loss": 0.0013, "reward": 1.7304006814956665, "reward_std": 0.15876591205596924, "rewards/accuracy_reward": 0.35153618454933167, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.246721550822258, "step": 179 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 401.6250305175781, "epoch": 0.004066142586066685, "grad_norm": 1.9137013349836818, "kl": 0.0201416015625, "learning_rate": 9.999596577447167e-07, "loss": 0.0008, "reward": 1.8506263494491577, "reward_std": 0.3450123965740204, "rewards/accuracy_reward": 0.6507936716079712, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1855468899011612, "step": 180 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 391.51788330078125, "epoch": 0.004088732267100388, "grad_norm": 3.1619861713908954, "kl": 0.0233154296875, "learning_rate": 9.999592057403253e-07, "loss": 0.0009, "reward": 1.5909790992736816, "reward_std": 0.14332807064056396, "rewards/accuracy_reward": 0.4464285969734192, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1552647203207016, "step": 181 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 450.58929443359375, "epoch": 0.004111321948134092, "grad_norm": 15.664445896709728, "kl": 0.0223388671875, "learning_rate": 9.999587512179409e-07, "loss": 0.0009, "reward": 1.5162965059280396, "reward_std": 0.21309083700180054, "rewards/accuracy_reward": 0.3594883680343628, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.1746651828289032, "step": 182 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 449.7500305175781, "epoch": 0.0041339116291677965, "grad_norm": 2.0520663763823137, "kl": 0.0244140625, "learning_rate": 9.99958294177566e-07, "loss": 0.001, "reward": 1.719246506690979, "reward_std": 0.40284326672554016, "rewards/accuracy_reward": 0.4110572636127472, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2153320461511612, "step": 183 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 394.1250305175781, "epoch": 0.0041565013102015, "grad_norm": 2.636850051164526, "kl": 0.0252685546875, "learning_rate": 9.999578346192026e-07, "loss": 0.001, "reward": 1.7550297975540161, "reward_std": 0.059810493141412735, "rewards/accuracy_reward": 0.42674773931503296, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2032819539308548, "step": 184 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 402.46429443359375, "epoch": 0.004179090991235204, "grad_norm": 10.539802443612478, "kl": 0.03369140625, "learning_rate": 9.999573725428532e-07, "loss": 0.0014, "reward": 1.8873332738876343, "reward_std": 0.09555447846651077, "rewards/accuracy_reward": 0.43268755078315735, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2796456515789032, "step": 185 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 389.7321472167969, "epoch": 0.004201680672268907, "grad_norm": 1.6308403173950567, "kl": 0.028564453125, "learning_rate": 9.9995690794852e-07, "loss": 0.0011, "reward": 1.7918527126312256, "reward_std": 0.18909212946891785, "rewards/accuracy_reward": 0.4614047408103943, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2090192586183548, "step": 186 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 423.6607360839844, "epoch": 0.004224270353302612, "grad_norm": 3.124337423294188, "kl": 0.0230712890625, "learning_rate": 9.999564408362052e-07, "loss": 0.0009, "reward": 1.3802591562271118, "reward_std": 0.2432202249765396, "rewards/accuracy_reward": 0.2782258987426758, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.1198904886841774, "step": 187 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 414.1964416503906, "epoch": 0.004246860034336315, "grad_norm": 2.4359194277882006, "kl": 0.0260009765625, "learning_rate": 9.999559712059116e-07, "loss": 0.001, "reward": 1.7367103099822998, "reward_std": 0.25271159410476685, "rewards/accuracy_reward": 0.45514291524887085, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1958530992269516, "step": 188 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 468.8035888671875, "epoch": 0.004269449715370019, "grad_norm": 61.86583733573737, "kl": 0.0218505859375, "learning_rate": 9.999554990576414e-07, "loss": 0.0009, "reward": 1.8021421432495117, "reward_std": 0.5498673915863037, "rewards/accuracy_reward": 0.5464324951171875, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.191423699259758, "step": 189 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 461.482177734375, "epoch": 0.0042920393964037224, "grad_norm": 3.5432981453567582, "kl": 0.031494140625, "learning_rate": 9.999550243913968e-07, "loss": 0.0013, "reward": 1.5942728519439697, "reward_std": 0.22309795022010803, "rewards/accuracy_reward": 0.3020154535770416, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.2244001179933548, "step": 190 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 422.2500305175781, "epoch": 0.004314629077437427, "grad_norm": 2.958137568693394, "kl": 0.0262451171875, "learning_rate": 9.999545472071804e-07, "loss": 0.0011, "reward": 1.9051036834716797, "reward_std": 0.22149135172367096, "rewards/accuracy_reward": 0.5915327072143555, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2314278781414032, "step": 191 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 392.9464416503906, "epoch": 0.00433721875847113, "grad_norm": 2.103162476653039, "kl": 0.0245361328125, "learning_rate": 9.999540675049944e-07, "loss": 0.001, "reward": 1.947791576385498, "reward_std": 0.17951925098896027, "rewards/accuracy_reward": 0.6539368033409119, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2009975016117096, "step": 192 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 392.5535888671875, "epoch": 0.004359808439504834, "grad_norm": 2.685644442726693, "kl": 0.029052734375, "learning_rate": 9.999535852848413e-07, "loss": 0.0012, "reward": 2.1274869441986084, "reward_std": 0.08318132162094116, "rewards/accuracy_reward": 0.7328320145606995, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2446550726890564, "step": 193 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 445.1607360839844, "epoch": 0.004382398120538538, "grad_norm": 1.8177199402776885, "kl": 0.0203857421875, "learning_rate": 9.999531005467236e-07, "loss": 0.0008, "reward": 1.6099436283111572, "reward_std": 0.22704407572746277, "rewards/accuracy_reward": 0.3984961211681366, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.15430450439453125, "step": 194 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 397.5357360839844, "epoch": 0.004404987801572242, "grad_norm": 6.07617185577974, "kl": 0.0281982421875, "learning_rate": 9.999526132906438e-07, "loss": 0.0011, "reward": 2.0237274169921875, "reward_std": 0.1713549792766571, "rewards/accuracy_reward": 0.6382433176040649, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2461983859539032, "step": 195 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 424.46429443359375, "epoch": 0.004427577482605946, "grad_norm": 2.7508246566082577, "kl": 0.0284423828125, "learning_rate": 9.99952123516604e-07, "loss": 0.0011, "reward": 1.6200897693634033, "reward_std": 0.06626692414283752, "rewards/accuracy_reward": 0.43732255697250366, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.2291957437992096, "step": 196 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 366.2321472167969, "epoch": 0.004450167163639649, "grad_norm": 2.2215798199302492, "kl": 0.02880859375, "learning_rate": 9.99951631224607e-07, "loss": 0.0012, "reward": 1.8157014846801758, "reward_std": 0.13068412244319916, "rewards/accuracy_reward": 0.5400196313858032, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1899675726890564, "step": 197 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 447.5714416503906, "epoch": 0.0044727568446733534, "grad_norm": 3.1359196439582853, "kl": 0.01806640625, "learning_rate": 9.999511364146552e-07, "loss": 0.0007, "reward": 1.922991156578064, "reward_std": 0.23198950290679932, "rewards/accuracy_reward": 0.7321428656578064, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.2265625149011612, "step": 198 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 416.89288330078125, "epoch": 0.004495346525707057, "grad_norm": 2.553186269852962, "kl": 0.0252685546875, "learning_rate": 9.99950639086751e-07, "loss": 0.001, "reward": 2.211777687072754, "reward_std": 0.17212709784507751, "rewards/accuracy_reward": 0.8005926012992859, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2504708468914032, "step": 199 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 407.4821472167969, "epoch": 0.004517936206740761, "grad_norm": 1.838445252049524, "kl": 0.0230712890625, "learning_rate": 9.99950139240897e-07, "loss": 0.0009, "reward": 1.9424043893814087, "reward_std": 0.17941159009933472, "rewards/accuracy_reward": 0.6604148745536804, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2177037000656128, "step": 200 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 383.8214416503906, "epoch": 0.004540525887774464, "grad_norm": 3.8453183946402953, "kl": 0.0301513671875, "learning_rate": 9.999496368770958e-07, "loss": 0.0012, "reward": 1.80759596824646, "reward_std": 0.08364541083574295, "rewards/accuracy_reward": 0.44407743215560913, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2492327094078064, "step": 201 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 353.26788330078125, "epoch": 0.0045631155688081685, "grad_norm": 1.8063557707288864, "kl": 0.0277099609375, "learning_rate": 9.999491319953499e-07, "loss": 0.0011, "reward": 1.9216557741165161, "reward_std": 0.08350037038326263, "rewards/accuracy_reward": 0.5548675060272217, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2346453070640564, "step": 202 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 469.3750305175781, "epoch": 0.004585705249841872, "grad_norm": 1.018816505746038, "kl": 0.019287109375, "learning_rate": 9.999486245956617e-07, "loss": 0.0008, "reward": 1.5306740999221802, "reward_std": 0.47083038091659546, "rewards/accuracy_reward": 0.3890690207481384, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.1808907687664032, "step": 203 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 399.5535888671875, "epoch": 0.004608294930875576, "grad_norm": 2.965324629636563, "kl": 0.027587890625, "learning_rate": 9.999481146780336e-07, "loss": 0.0011, "reward": 1.8993874788284302, "reward_std": 0.13718266785144806, "rewards/accuracy_reward": 0.5284785628318787, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2351946234703064, "step": 204 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 390.26788330078125, "epoch": 0.00463088461190928, "grad_norm": 5.026221572074899, "kl": 0.026611328125, "learning_rate": 9.999476022424687e-07, "loss": 0.0011, "reward": 1.5783181190490723, "reward_std": 0.0997685045003891, "rewards/accuracy_reward": 0.3122744560241699, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1946149617433548, "step": 205 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 366.1071472167969, "epoch": 0.0046534742929429836, "grad_norm": 4.2621044422023315, "kl": 0.030517578125, "learning_rate": 9.99947087288969e-07, "loss": 0.0012, "reward": 1.9522714614868164, "reward_std": 0.24301478266716003, "rewards/accuracy_reward": 0.6119915246963501, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2331368625164032, "step": 206 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 406.0000305175781, "epoch": 0.004676063973976688, "grad_norm": 4.444114609847484, "kl": 0.0238037109375, "learning_rate": 9.999465698175376e-07, "loss": 0.001, "reward": 1.6237565279006958, "reward_std": 0.16261546313762665, "rewards/accuracy_reward": 0.3348788022994995, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2067348062992096, "step": 207 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 391.3035888671875, "epoch": 0.004698653655010391, "grad_norm": 1.7246455217423065, "kl": 0.0245361328125, "learning_rate": 9.99946049828177e-07, "loss": 0.001, "reward": 1.292259693145752, "reward_std": 0.20633742213249207, "rewards/accuracy_reward": 0.1516764760017395, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.097726009786129, "step": 208 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 409.6964416503906, "epoch": 0.004721243336044095, "grad_norm": 2.637025810998613, "kl": 0.0267333984375, "learning_rate": 9.999455273208891e-07, "loss": 0.0011, "reward": 1.96980619430542, "reward_std": 0.16196146607398987, "rewards/accuracy_reward": 0.582020103931427, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2485002875328064, "step": 209 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 392.9107360839844, "epoch": 0.004743833017077799, "grad_norm": 3.436330944907639, "kl": 0.0255126953125, "learning_rate": 9.999450022956777e-07, "loss": 0.001, "reward": 2.0143370628356934, "reward_std": 0.2244153618812561, "rewards/accuracy_reward": 0.7202381491661072, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.2726702094078064, "step": 210 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 385.4821472167969, "epoch": 0.004766422698111503, "grad_norm": 2.6463976708099253, "kl": 0.0252685546875, "learning_rate": 9.999444747525446e-07, "loss": 0.001, "reward": 1.6008833646774292, "reward_std": 0.407810240983963, "rewards/accuracy_reward": 0.3664630651473999, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1558489203453064, "step": 211 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 404.5000305175781, "epoch": 0.004789012379145206, "grad_norm": 2.8810465048739173, "kl": 0.0255126953125, "learning_rate": 9.999439446914928e-07, "loss": 0.001, "reward": 2.021334648132324, "reward_std": 0.16667775809764862, "rewards/accuracy_reward": 0.6632430553436279, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2402343899011612, "step": 212 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 357.1250305175781, "epoch": 0.00481160206017891, "grad_norm": 3.753623282806152, "kl": 0.0272216796875, "learning_rate": 9.99943412112525e-07, "loss": 0.0011, "reward": 1.7276506423950195, "reward_std": 0.39889347553253174, "rewards/accuracy_reward": 0.43470966815948486, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2107979953289032, "step": 213 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 419.46429443359375, "epoch": 0.004834191741212614, "grad_norm": 2.47500707267951, "kl": 0.0224609375, "learning_rate": 9.999428770156438e-07, "loss": 0.0009, "reward": 1.5211817026138306, "reward_std": 0.33218875527381897, "rewards/accuracy_reward": 0.42072829604148865, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.143310546875, "step": 214 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 401.6607360839844, "epoch": 0.004856781422246318, "grad_norm": 4.740644048610411, "kl": 0.0224609375, "learning_rate": 9.999423394008517e-07, "loss": 0.0009, "reward": 1.5843161344528198, "reward_std": 0.32276710867881775, "rewards/accuracy_reward": 0.3466661274433136, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1805071234703064, "step": 215 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 341.1964416503906, "epoch": 0.004879371103280022, "grad_norm": 2.0156236457307877, "kl": 0.0264892578125, "learning_rate": 9.999417992681516e-07, "loss": 0.0011, "reward": 1.745589256286621, "reward_std": 0.23836743831634521, "rewards/accuracy_reward": 0.4510508179664612, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2231096625328064, "step": 216 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 396.1250305175781, "epoch": 0.004901960784313725, "grad_norm": 2.40273251498487, "kl": 0.0213623046875, "learning_rate": 9.999412566175462e-07, "loss": 0.0009, "reward": 1.8775578737258911, "reward_std": 0.3670785129070282, "rewards/accuracy_reward": 0.5658495426177979, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2152797281742096, "step": 217 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 344.9464416503906, "epoch": 0.00492455046534743, "grad_norm": 2.0546949145362765, "kl": 0.02783203125, "learning_rate": 9.999407114490383e-07, "loss": 0.0011, "reward": 1.8266847133636475, "reward_std": 0.21063531935214996, "rewards/accuracy_reward": 0.5680001378059387, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.19082751870155334, "step": 218 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 369.26788330078125, "epoch": 0.004947140146381133, "grad_norm": 1.6018185589049836, "kl": 0.022705078125, "learning_rate": 9.999401637626306e-07, "loss": 0.0009, "reward": 1.6815526485443115, "reward_std": 0.24351546168327332, "rewards/accuracy_reward": 0.4490714371204376, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1681954562664032, "step": 219 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 409.0357360839844, "epoch": 0.004969729827414837, "grad_norm": 2.145025202935218, "kl": 0.025146484375, "learning_rate": 9.99939613558326e-07, "loss": 0.001, "reward": 1.9914954900741577, "reward_std": 0.27916979789733887, "rewards/accuracy_reward": 0.6220093965530396, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2587716281414032, "step": 220 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 354.08929443359375, "epoch": 0.0049923195084485405, "grad_norm": 2.9775638107130584, "kl": 0.03173828125, "learning_rate": 9.99939060836127e-07, "loss": 0.0013, "reward": 2.0453720092773438, "reward_std": 0.10164360702037811, "rewards/accuracy_reward": 0.6563233137130737, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2747628390789032, "step": 221 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 422.1964416503906, "epoch": 0.005014909189482245, "grad_norm": 1.6876147710152427, "kl": 0.026611328125, "learning_rate": 9.999385055960363e-07, "loss": 0.0011, "reward": 1.5248587131500244, "reward_std": 0.15655794739723206, "rewards/accuracy_reward": 0.4304807782173157, "rewards/format_reward": 0.8571429252624512, "rewards/semantic_reward": 0.1658063679933548, "step": 222 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 403.6250305175781, "epoch": 0.005037498870515948, "grad_norm": 2.0653933266615114, "kl": 0.023681640625, "learning_rate": 9.99937947838057e-07, "loss": 0.0009, "reward": 1.822373867034912, "reward_std": 0.45485734939575195, "rewards/accuracy_reward": 0.5211042165756226, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1941266804933548, "step": 223 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 360.7321472167969, "epoch": 0.005060088551549652, "grad_norm": 1.9123372817234912, "kl": 0.0257568359375, "learning_rate": 9.999373875621918e-07, "loss": 0.001, "reward": 1.6899139881134033, "reward_std": 0.1830849051475525, "rewards/accuracy_reward": 0.40560680627822876, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1914498507976532, "step": 224 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 349.89288330078125, "epoch": 0.0050826782325833555, "grad_norm": 2.505034488727106, "kl": 0.032958984375, "learning_rate": 9.999368247684434e-07, "loss": 0.0013, "reward": 2.1888389587402344, "reward_std": 0.09752067923545837, "rewards/accuracy_reward": 0.7712275981903076, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2926112711429596, "step": 225 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 440.1785888671875, "epoch": 0.00510526791361706, "grad_norm": 6.1771173095236565, "kl": 0.0272216796875, "learning_rate": 9.999362594568151e-07, "loss": 0.0011, "reward": 1.859405279159546, "reward_std": 0.2744136452674866, "rewards/accuracy_reward": 0.512331485748291, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2685023844242096, "step": 226 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 371.58929443359375, "epoch": 0.005127857594650764, "grad_norm": 8.701265213440777, "kl": 0.035400390625, "learning_rate": 9.999356916273091e-07, "loss": 0.0014, "reward": 1.9724925756454468, "reward_std": 0.11214444041252136, "rewards/accuracy_reward": 0.5670342445373535, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2733154296875, "step": 227 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 369.1250305175781, "epoch": 0.005150447275684467, "grad_norm": 2.9813062830741326, "kl": 0.031005859375, "learning_rate": 9.999351212799287e-07, "loss": 0.0012, "reward": 2.104713201522827, "reward_std": 0.06794153898954391, "rewards/accuracy_reward": 0.7309094071388245, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2380894422531128, "step": 228 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 391.51788330078125, "epoch": 0.0051730369567181715, "grad_norm": 2.19459074010848, "kl": 0.0322265625, "learning_rate": 9.999345484146765e-07, "loss": 0.0013, "reward": 2.155425548553467, "reward_std": 0.15073934197425842, "rewards/accuracy_reward": 0.753203809261322, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2593645453453064, "step": 229 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 428.2321472167969, "epoch": 0.005195626637751875, "grad_norm": 1.720445436085165, "kl": 0.0284423828125, "learning_rate": 9.999339730315556e-07, "loss": 0.0011, "reward": 1.949944257736206, "reward_std": 0.08930954337120056, "rewards/accuracy_reward": 0.5573451519012451, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2711704969406128, "step": 230 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 404.8571472167969, "epoch": 0.005218216318785579, "grad_norm": 2.7664252122970314, "kl": 0.03173828125, "learning_rate": 9.99933395130569e-07, "loss": 0.0013, "reward": 1.5151699781417847, "reward_std": 0.12450746446847916, "rewards/accuracy_reward": 0.27363115549087524, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1665387898683548, "step": 231 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 426.01788330078125, "epoch": 0.005240805999819282, "grad_norm": 3.718211279609596, "kl": 0.03759765625, "learning_rate": 9.99932814711719e-07, "loss": 0.0015, "reward": 1.819545030593872, "reward_std": 0.18829753994941711, "rewards/accuracy_reward": 0.45458951592445374, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2399553656578064, "step": 232 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 449.46429443359375, "epoch": 0.0052633956808529865, "grad_norm": 47.27885860532444, "kl": 0.037109375, "learning_rate": 9.999322317750094e-07, "loss": 0.0015, "reward": 1.8710787296295166, "reward_std": 0.34857505559921265, "rewards/accuracy_reward": 0.5585297346115112, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2018345445394516, "step": 233 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 411.5535888671875, "epoch": 0.00528598536188669, "grad_norm": 3.3343996240074203, "kl": 0.031494140625, "learning_rate": 9.999316463204423e-07, "loss": 0.0013, "reward": 1.663825511932373, "reward_std": 0.2731500566005707, "rewards/accuracy_reward": 0.3536936044692993, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2244175672531128, "step": 234 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 451.0535888671875, "epoch": 0.005308575042920394, "grad_norm": 1.9485168967179347, "kl": 0.033935546875, "learning_rate": 9.999310583480214e-07, "loss": 0.0014, "reward": 1.4532703161239624, "reward_std": 0.10746242851018906, "rewards/accuracy_reward": 0.25823676586151123, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.123604916036129, "step": 235 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 422.51788330078125, "epoch": 0.005331164723954097, "grad_norm": 3.9508538325432636, "kl": 0.037841796875, "learning_rate": 9.999304678577492e-07, "loss": 0.0015, "reward": 1.9456682205200195, "reward_std": 0.2721923291683197, "rewards/accuracy_reward": 0.5946201086044312, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2260480672121048, "step": 236 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 456.76788330078125, "epoch": 0.005353754404987802, "grad_norm": 1.7055133385488679, "kl": 0.0296630859375, "learning_rate": 9.999298748496286e-07, "loss": 0.0012, "reward": 1.6125752925872803, "reward_std": 0.13833265006542206, "rewards/accuracy_reward": 0.3988790214061737, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1458391547203064, "step": 237 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 396.3214416503906, "epoch": 0.005376344086021506, "grad_norm": 1.8652003698166988, "kl": 0.03662109375, "learning_rate": 9.99929279323663e-07, "loss": 0.0015, "reward": 1.5565539598464966, "reward_std": 0.061157483607530594, "rewards/accuracy_reward": 0.34329351782798767, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.145403191447258, "step": 238 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 425.9107360839844, "epoch": 0.005398933767055209, "grad_norm": 2.1898820034656477, "kl": 0.03271484375, "learning_rate": 9.99928681279855e-07, "loss": 0.0013, "reward": 1.6990907192230225, "reward_std": 0.2606106400489807, "rewards/accuracy_reward": 0.4349794387817383, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.2319684773683548, "step": 239 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 449.83929443359375, "epoch": 0.005421523448088913, "grad_norm": 2.5273094718217326, "kl": 0.0341796875, "learning_rate": 9.999280807182077e-07, "loss": 0.0014, "reward": 2.017282247543335, "reward_std": 0.22083045542240143, "rewards/accuracy_reward": 0.6092427372932434, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2758963704109192, "step": 240 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 392.0000305175781, "epoch": 0.005444113129122617, "grad_norm": 2.6429447708249767, "kl": 0.03515625, "learning_rate": 9.999274776387242e-07, "loss": 0.0014, "reward": 1.9179500341415405, "reward_std": 0.2852023243904114, "rewards/accuracy_reward": 0.5880774855613708, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2084437906742096, "step": 241 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 411.5535888671875, "epoch": 0.005466702810156321, "grad_norm": 13.212837648121836, "kl": 0.032958984375, "learning_rate": 9.999268720414076e-07, "loss": 0.0013, "reward": 1.9410629272460938, "reward_std": 0.2526363730430603, "rewards/accuracy_reward": 0.582763671875, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2297276258468628, "step": 242 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 392.51788330078125, "epoch": 0.005489292491190024, "grad_norm": 3.0587727229393966, "kl": 0.036376953125, "learning_rate": 9.999262639262608e-07, "loss": 0.0015, "reward": 1.7454216480255127, "reward_std": 0.20508673787117004, "rewards/accuracy_reward": 0.4468899071216583, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1913888156414032, "step": 243 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 383.4464416503906, "epoch": 0.005511882172223728, "grad_norm": 1.8171883110478035, "kl": 0.027099609375, "learning_rate": 9.999256532932872e-07, "loss": 0.0011, "reward": 1.6595901250839233, "reward_std": 0.39663761854171753, "rewards/accuracy_reward": 0.4523809552192688, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1572091281414032, "step": 244 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 395.71429443359375, "epoch": 0.005534471853257432, "grad_norm": 6.421781166768523, "kl": 0.032470703125, "learning_rate": 9.999250401424895e-07, "loss": 0.0013, "reward": 1.7797155380249023, "reward_std": 0.16708479821681976, "rewards/accuracy_reward": 0.4299349784851074, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2390660047531128, "step": 245 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 399.7857360839844, "epoch": 0.005557061534291136, "grad_norm": 3.457053670564217, "kl": 0.0269775390625, "learning_rate": 9.99924424473871e-07, "loss": 0.0011, "reward": 1.8581868410110474, "reward_std": 0.05721600353717804, "rewards/accuracy_reward": 0.5243871808052063, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2159423977136612, "step": 246 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 386.0714416503906, "epoch": 0.005579651215324839, "grad_norm": 5.349603415183048, "kl": 0.0311279296875, "learning_rate": 9.999238062874344e-07, "loss": 0.0012, "reward": 1.5186572074890137, "reward_std": 0.05413474515080452, "rewards/accuracy_reward": 0.26166772842407227, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1641322672367096, "step": 247 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 377.9285888671875, "epoch": 0.0056022408963585435, "grad_norm": 3.2291873708508536, "kl": 0.03369140625, "learning_rate": 9.999231855831833e-07, "loss": 0.0013, "reward": 1.7102599143981934, "reward_std": 0.23085792362689972, "rewards/accuracy_reward": 0.43138155341148376, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1717354953289032, "step": 248 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 434.3571472167969, "epoch": 0.005624830577392247, "grad_norm": 2.0014570087066623, "kl": 0.0260009765625, "learning_rate": 9.999225623611207e-07, "loss": 0.001, "reward": 1.60336172580719, "reward_std": 0.24247021973133087, "rewards/accuracy_reward": 0.35073885321617126, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.2240513563156128, "step": 249 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 375.7321472167969, "epoch": 0.005647420258425951, "grad_norm": 3.7121975968249363, "kl": 0.03466796875, "learning_rate": 9.999219366212496e-07, "loss": 0.0014, "reward": 1.6248927116394043, "reward_std": 0.1636936068534851, "rewards/accuracy_reward": 0.3499937951564789, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1820417195558548, "step": 250 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 350.64288330078125, "epoch": 0.005670009939459655, "grad_norm": 2.5160091448956283, "kl": 0.0299072265625, "learning_rate": 9.999213083635734e-07, "loss": 0.0012, "reward": 1.826963186264038, "reward_std": 0.08665375411510468, "rewards/accuracy_reward": 0.523206889629364, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2251848578453064, "step": 251 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 375.7857360839844, "epoch": 0.0056925996204933585, "grad_norm": 2.816820024688894, "kl": 0.0286865234375, "learning_rate": 9.999206775880949e-07, "loss": 0.0011, "reward": 1.6486443281173706, "reward_std": 0.18218988180160522, "rewards/accuracy_reward": 0.37280383706092834, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1829834133386612, "step": 252 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 389.01788330078125, "epoch": 0.005715189301527063, "grad_norm": 3.0556773683177965, "kl": 0.03662109375, "learning_rate": 9.999200442948177e-07, "loss": 0.0015, "reward": 2.0582785606384277, "reward_std": 0.11428092420101166, "rewards/accuracy_reward": 0.585412859916687, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3121512532234192, "step": 253 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 395.1964416503906, "epoch": 0.005737778982560766, "grad_norm": 2.9556031687486475, "kl": 0.0311279296875, "learning_rate": 9.999194084837446e-07, "loss": 0.0012, "reward": 2.199312925338745, "reward_std": 0.16072094440460205, "rewards/accuracy_reward": 0.7898731827735901, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2915824055671692, "step": 254 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 386.2500305175781, "epoch": 0.00576036866359447, "grad_norm": 5.045578624196195, "kl": 0.03125, "learning_rate": 9.99918770154879e-07, "loss": 0.0013, "reward": 1.6287434101104736, "reward_std": 0.061499837785959244, "rewards/accuracy_reward": 0.2972699701786041, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2064732164144516, "step": 255 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 353.3035888671875, "epoch": 0.005782958344628174, "grad_norm": 2.620259372890814, "kl": 0.0291748046875, "learning_rate": 9.99918129308224e-07, "loss": 0.0012, "reward": 1.8754082918167114, "reward_std": 0.1229163259267807, "rewards/accuracy_reward": 0.5642160773277283, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2219063937664032, "step": 256 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 347.51788330078125, "epoch": 0.005805548025661878, "grad_norm": 15.171605510913206, "kl": 0.03759765625, "learning_rate": 9.99917485943783e-07, "loss": 0.0015, "reward": 1.7916791439056396, "reward_std": 0.09696292132139206, "rewards/accuracy_reward": 0.49081069231033325, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2115827351808548, "step": 257 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 362.6607360839844, "epoch": 0.005828137706695581, "grad_norm": 6.0662418090856844, "kl": 0.033203125, "learning_rate": 9.999168400615592e-07, "loss": 0.0013, "reward": 2.102046012878418, "reward_std": 0.15649496018886566, "rewards/accuracy_reward": 0.7470828890800476, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2299630343914032, "step": 258 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 387.4464416503906, "epoch": 0.005850727387729285, "grad_norm": 2.557006379333921, "kl": 0.034912109375, "learning_rate": 9.999161916615557e-07, "loss": 0.0014, "reward": 1.9920021295547485, "reward_std": 0.10146117955446243, "rewards/accuracy_reward": 0.6154534816741943, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2479771375656128, "step": 259 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 380.5000305175781, "epoch": 0.005873317068762989, "grad_norm": 2.018132351779264, "kl": 0.0361328125, "learning_rate": 9.99915540743776e-07, "loss": 0.0014, "reward": 1.4542620182037354, "reward_std": 0.11191042512655258, "rewards/accuracy_reward": 0.20848210155963898, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.156494140625, "step": 260 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 405.3750305175781, "epoch": 0.005895906749796693, "grad_norm": 1.3328946776166972, "kl": 0.0281982421875, "learning_rate": 9.99914887308223e-07, "loss": 0.0011, "reward": 1.5806498527526855, "reward_std": 0.15196795761585236, "rewards/accuracy_reward": 0.33678990602493286, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1617170125246048, "step": 261 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 373.08929443359375, "epoch": 0.005918496430830397, "grad_norm": 5.362006232494452, "kl": 0.031982421875, "learning_rate": 9.999142313549005e-07, "loss": 0.0013, "reward": 1.892488718032837, "reward_std": 0.17046763002872467, "rewards/accuracy_reward": 0.582632303237915, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1919991672039032, "step": 262 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 397.5714416503906, "epoch": 0.0059410861118641, "grad_norm": 4.085165820165471, "kl": 0.03076171875, "learning_rate": 9.999135728838116e-07, "loss": 0.0012, "reward": 1.905714511871338, "reward_std": 0.2455075979232788, "rewards/accuracy_reward": 0.5813944935798645, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.231462761759758, "step": 263 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 408.2500305175781, "epoch": 0.005963675792897805, "grad_norm": 1.9014502436675684, "kl": 0.0311279296875, "learning_rate": 9.999129118949594e-07, "loss": 0.0012, "reward": 1.7165656089782715, "reward_std": 0.2121201902627945, "rewards/accuracy_reward": 0.504400372505188, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1835937649011612, "step": 264 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 381.6071472167969, "epoch": 0.005986265473931508, "grad_norm": 1.6285737424032354, "kl": 0.03271484375, "learning_rate": 9.999122483883474e-07, "loss": 0.0013, "reward": 1.4295364618301392, "reward_std": 0.26547834277153015, "rewards/accuracy_reward": 0.19050176441669464, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1533203125, "step": 265 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 363.4821472167969, "epoch": 0.006008855154965212, "grad_norm": 2.6342508747354896, "kl": 0.0311279296875, "learning_rate": 9.99911582363979e-07, "loss": 0.0012, "reward": 1.9863611459732056, "reward_std": 0.1269199252128601, "rewards/accuracy_reward": 0.6202653050422668, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2482387125492096, "step": 266 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 418.2321472167969, "epoch": 0.0060314448359989154, "grad_norm": 4.764861709896876, "kl": 0.0299072265625, "learning_rate": 9.999109138218575e-07, "loss": 0.0012, "reward": 1.8572595119476318, "reward_std": 0.15508025884628296, "rewards/accuracy_reward": 0.5241803526878357, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2187936007976532, "step": 267 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 433.5714416503906, "epoch": 0.00605403451703262, "grad_norm": 1.8835652288930955, "kl": 0.024658203125, "learning_rate": 9.99910242761986e-07, "loss": 0.001, "reward": 1.8498722314834595, "reward_std": 0.34379851818084717, "rewards/accuracy_reward": 0.5833333134651184, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1843959391117096, "step": 268 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 409.5535888671875, "epoch": 0.006076624198066323, "grad_norm": 2.1742717804934615, "kl": 0.03369140625, "learning_rate": 9.999095691843684e-07, "loss": 0.0014, "reward": 1.8469884395599365, "reward_std": 0.1071857437491417, "rewards/accuracy_reward": 0.5316179394721985, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2189418375492096, "step": 269 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 387.58929443359375, "epoch": 0.006099213879100027, "grad_norm": 9.907418112281837, "kl": 0.031494140625, "learning_rate": 9.999088930890078e-07, "loss": 0.0013, "reward": 1.8878490924835205, "reward_std": 0.2905329465866089, "rewards/accuracy_reward": 0.594158411026001, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2115478664636612, "step": 270 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 408.8750305175781, "epoch": 0.0061218035601337305, "grad_norm": 1.8807237243786985, "kl": 0.0322265625, "learning_rate": 9.999082144759077e-07, "loss": 0.0013, "reward": 1.6310527324676514, "reward_std": 0.21561121940612793, "rewards/accuracy_reward": 0.44474542140960693, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.1970214992761612, "step": 271 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 377.1964416503906, "epoch": 0.006144393241167435, "grad_norm": 2.337235674247711, "kl": 0.033203125, "learning_rate": 9.999075333450713e-07, "loss": 0.0013, "reward": 1.9440317153930664, "reward_std": 0.1516009271144867, "rewards/accuracy_reward": 0.5444779992103577, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2566964328289032, "step": 272 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 362.3750305175781, "epoch": 0.006166982922201139, "grad_norm": 1.5322095192513743, "kl": 0.0263671875, "learning_rate": 9.999068496965022e-07, "loss": 0.0011, "reward": 1.8244839906692505, "reward_std": 0.13418594002723694, "rewards/accuracy_reward": 0.5535714626312256, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.185198113322258, "step": 273 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 365.5000305175781, "epoch": 0.006189572603234842, "grad_norm": 2.4822094406436306, "kl": 0.041748046875, "learning_rate": 9.999061635302038e-07, "loss": 0.0017, "reward": 1.7896696329116821, "reward_std": 0.16278040409088135, "rewards/accuracy_reward": 0.45336589217185974, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2113037258386612, "step": 274 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 398.9107360839844, "epoch": 0.0062121622842685464, "grad_norm": 8.236610307001529, "kl": 0.0380859375, "learning_rate": 9.999054748461797e-07, "loss": 0.0015, "reward": 1.6779491901397705, "reward_std": 0.1792031079530716, "rewards/accuracy_reward": 0.32040515542030334, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2325439602136612, "step": 275 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 459.4107360839844, "epoch": 0.00623475196530225, "grad_norm": 1.224182068155212, "kl": 0.0260009765625, "learning_rate": 9.999047836444333e-07, "loss": 0.001, "reward": 1.5847527980804443, "reward_std": 0.2751067876815796, "rewards/accuracy_reward": 0.41954976320266724, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.1544887125492096, "step": 276 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 378.2857360839844, "epoch": 0.006257341646335954, "grad_norm": 4.463684410562413, "kl": 0.0260009765625, "learning_rate": 9.999040899249679e-07, "loss": 0.001, "reward": 1.9494839906692505, "reward_std": 0.1655396819114685, "rewards/accuracy_reward": 0.5892857313156128, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2209123969078064, "step": 277 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 382.1250305175781, "epoch": 0.006279931327369657, "grad_norm": 11.738968747643169, "kl": 0.037841796875, "learning_rate": 9.99903393687787e-07, "loss": 0.0015, "reward": 1.8913804292678833, "reward_std": 0.16852597892284393, "rewards/accuracy_reward": 0.4826364815235138, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2480294406414032, "step": 278 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 375.5714416503906, "epoch": 0.0063025210084033615, "grad_norm": 4.703129804771499, "kl": 0.041015625, "learning_rate": 9.999026949328943e-07, "loss": 0.0016, "reward": 1.9075950384140015, "reward_std": 0.0953553095459938, "rewards/accuracy_reward": 0.5368673205375671, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2457275539636612, "step": 279 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 377.8571472167969, "epoch": 0.006325110689437065, "grad_norm": 1.97260761825242, "kl": 0.032958984375, "learning_rate": 9.999019936602932e-07, "loss": 0.0013, "reward": 1.9549247026443481, "reward_std": 0.28502359986305237, "rewards/accuracy_reward": 0.6785714626312256, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1906389594078064, "step": 280 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 381.1964416503906, "epoch": 0.006347700370470769, "grad_norm": 3.1208585016527497, "kl": 0.037353515625, "learning_rate": 9.999012898699874e-07, "loss": 0.0015, "reward": 1.5005427598953247, "reward_std": 0.0793982744216919, "rewards/accuracy_reward": 0.20825044810771942, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1887207180261612, "step": 281 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 450.6607360839844, "epoch": 0.006370290051504472, "grad_norm": 1.7283956010260122, "kl": 0.0245361328125, "learning_rate": 9.999005835619804e-07, "loss": 0.001, "reward": 1.7873258590698242, "reward_std": 0.1956460177898407, "rewards/accuracy_reward": 0.5389056205749512, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2019914984703064, "step": 282 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 403.9821472167969, "epoch": 0.006392879732538177, "grad_norm": 1.6343671112673293, "kl": 0.02978515625, "learning_rate": 9.998998747362755e-07, "loss": 0.0012, "reward": 1.7448315620422363, "reward_std": 0.2600570321083069, "rewards/accuracy_reward": 0.5605506300926208, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.191423699259758, "step": 283 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 441.96429443359375, "epoch": 0.006415469413571881, "grad_norm": 2.047398410194079, "kl": 0.0234375, "learning_rate": 9.998991633928765e-07, "loss": 0.0009, "reward": 1.7441917657852173, "reward_std": 0.3493563234806061, "rewards/accuracy_reward": 0.547619104385376, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.12871551513671875, "step": 284 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 374.0714416503906, "epoch": 0.006438059094605584, "grad_norm": 2.358723974953758, "kl": 0.03515625, "learning_rate": 9.998984495317871e-07, "loss": 0.0014, "reward": 1.9037432670593262, "reward_std": 0.2625734806060791, "rewards/accuracy_reward": 0.5102251172065735, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2756609320640564, "step": 285 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 412.14288330078125, "epoch": 0.006460648775639288, "grad_norm": 3.2458466588814177, "kl": 0.031982421875, "learning_rate": 9.998977331530104e-07, "loss": 0.0013, "reward": 1.7489650249481201, "reward_std": 0.3283669948577881, "rewards/accuracy_reward": 0.44564467668533325, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2068917453289032, "step": 286 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 370.76788330078125, "epoch": 0.006483238456672992, "grad_norm": 2.1955655359706925, "kl": 0.0302734375, "learning_rate": 9.998970142565505e-07, "loss": 0.0012, "reward": 2.0961525440216064, "reward_std": 0.08303436636924744, "rewards/accuracy_reward": 0.7129738330841064, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2581787109375, "step": 287 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 365.3214416503906, "epoch": 0.006505828137706696, "grad_norm": 2.184845759478264, "kl": 0.0419921875, "learning_rate": 9.998962928424112e-07, "loss": 0.0017, "reward": 2.0247528553009033, "reward_std": 0.2356462925672531, "rewards/accuracy_reward": 0.6005408763885498, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2992118000984192, "step": 288 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 390.6071472167969, "epoch": 0.006528417818740399, "grad_norm": 9.029359996911621, "kl": 0.039794921875, "learning_rate": 9.998955689105953e-07, "loss": 0.0016, "reward": 1.939788579940796, "reward_std": 0.15690064430236816, "rewards/accuracy_reward": 0.5660720467567444, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2380022406578064, "step": 289 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.2857142857142857, "completion_length": 383.2500305175781, "epoch": 0.006551007499774103, "grad_norm": 2.7565800289517615, "kl": 0.03515625, "learning_rate": 9.99894842461107e-07, "loss": 0.0014, "reward": 1.9010820388793945, "reward_std": 0.15238624811172485, "rewards/accuracy_reward": 0.5934995412826538, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2004394680261612, "step": 290 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 359.9107360839844, "epoch": 0.006573597180807807, "grad_norm": 3.9273256289385925, "kl": 0.043212890625, "learning_rate": 9.9989411349395e-07, "loss": 0.0017, "reward": 2.2684714794158936, "reward_std": 0.13048318028450012, "rewards/accuracy_reward": 0.8442944884300232, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2813197672367096, "step": 291 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 428.89288330078125, "epoch": 0.006596186861841511, "grad_norm": 1.338160593792757, "kl": 0.0269775390625, "learning_rate": 9.99893382009128e-07, "loss": 0.0011, "reward": 1.5854934453964233, "reward_std": 0.2778022885322571, "rewards/accuracy_reward": 0.3988095223903656, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1723981648683548, "step": 292 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 428.4464416503906, "epoch": 0.006618776542875214, "grad_norm": 1.876497401898846, "kl": 0.03271484375, "learning_rate": 9.998926480066445e-07, "loss": 0.0013, "reward": 1.7693349123001099, "reward_std": 0.2752881944179535, "rewards/accuracy_reward": 0.49869102239608765, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1777866929769516, "step": 293 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 422.6964416503906, "epoch": 0.006641366223908918, "grad_norm": 3.2063406513373414, "kl": 0.039306640625, "learning_rate": 9.998919114865033e-07, "loss": 0.0016, "reward": 2.0500621795654297, "reward_std": 0.27145877480506897, "rewards/accuracy_reward": 0.6038705706596375, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.3104771375656128, "step": 294 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 424.58929443359375, "epoch": 0.006663955904942623, "grad_norm": 2.287959606868257, "kl": 0.0322265625, "learning_rate": 9.998911724487078e-07, "loss": 0.0013, "reward": 2.02573561668396, "reward_std": 0.17988865077495575, "rewards/accuracy_reward": 0.6256241202354431, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2572544813156128, "step": 295 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 404.5535888671875, "epoch": 0.006686545585976326, "grad_norm": 3.3217871795265848, "kl": 0.033935546875, "learning_rate": 9.99890430893262e-07, "loss": 0.0014, "reward": 1.9530912637710571, "reward_std": 0.2593131959438324, "rewards/accuracy_reward": 0.5361582040786743, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2847900390625, "step": 296 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 421.76788330078125, "epoch": 0.00670913526701003, "grad_norm": 1.7472947383987663, "kl": 0.040771484375, "learning_rate": 9.998896868201697e-07, "loss": 0.0016, "reward": 1.563525915145874, "reward_std": 0.13448213040828705, "rewards/accuracy_reward": 0.37208133935928345, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1235874742269516, "step": 297 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 412.64288330078125, "epoch": 0.0067317249480437335, "grad_norm": 5.178286760583718, "kl": 0.0380859375, "learning_rate": 9.998889402294346e-07, "loss": 0.0015, "reward": 1.934621810913086, "reward_std": 0.2516809403896332, "rewards/accuracy_reward": 0.5464276075363159, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2560512125492096, "step": 298 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 483.5714416503906, "epoch": 0.006754314629077438, "grad_norm": 1.4251057963170988, "kl": 0.03173828125, "learning_rate": 9.998881911210607e-07, "loss": 0.0013, "reward": 1.6160709857940674, "reward_std": 0.4311463236808777, "rewards/accuracy_reward": 0.511904776096344, "rewards/format_reward": 0.8392857313156128, "rewards/semantic_reward": 0.1720232367515564, "step": 299 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 532.5178833007812, "epoch": 0.006776904310111141, "grad_norm": 2.183386452947999, "kl": 0.02587890625, "learning_rate": 9.99887439495051e-07, "loss": 0.001, "reward": 1.4112321138381958, "reward_std": 0.37957867980003357, "rewards/accuracy_reward": 0.2914111316204071, "rewards/format_reward": 0.8392857313156128, "rewards/semantic_reward": 0.1912493109703064, "step": 300 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 468.8214416503906, "epoch": 0.006799493991144845, "grad_norm": 5.698083433126071, "kl": 0.0311279296875, "learning_rate": 9.9988668535141e-07, "loss": 0.0012, "reward": 1.6893982887268066, "reward_std": 0.2649637758731842, "rewards/accuracy_reward": 0.4004962146282196, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1960449367761612, "step": 301 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 397.6607360839844, "epoch": 0.0068220836721785486, "grad_norm": 2.3983414852489497, "kl": 0.03662109375, "learning_rate": 9.998859286901414e-07, "loss": 0.0015, "reward": 1.9558531045913696, "reward_std": 0.17496441304683685, "rewards/accuracy_reward": 0.5921952724456787, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2493722140789032, "step": 302 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 493.96429443359375, "epoch": 0.006844673353212253, "grad_norm": 2.2805981704713387, "kl": 0.0294189453125, "learning_rate": 9.998851695112486e-07, "loss": 0.0012, "reward": 1.5787774324417114, "reward_std": 0.3522557318210602, "rewards/accuracy_reward": 0.31329184770584106, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2297712117433548, "step": 303 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 428.3571472167969, "epoch": 0.006867263034245956, "grad_norm": 2.633924396560968, "kl": 0.044189453125, "learning_rate": 9.99884407814736e-07, "loss": 0.0018, "reward": 1.9901622533798218, "reward_std": 0.24868054687976837, "rewards/accuracy_reward": 0.6163269281387329, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.216692253947258, "step": 304 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.8750305175781, "epoch": 0.00688985271527966, "grad_norm": 2.306236268607625, "kl": 0.037109375, "learning_rate": 9.99883643600607e-07, "loss": 0.0015, "reward": 1.794360876083374, "reward_std": 0.3213340938091278, "rewards/accuracy_reward": 0.4181646406650543, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2440534383058548, "step": 305 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 435.0000305175781, "epoch": 0.0069124423963133645, "grad_norm": 9.703866322297065, "kl": 0.04150390625, "learning_rate": 9.998828768688657e-07, "loss": 0.0017, "reward": 2.036858081817627, "reward_std": 0.21515145897865295, "rewards/accuracy_reward": 0.6230046153068542, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.288853257894516, "step": 306 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 429.01788330078125, "epoch": 0.006935032077347068, "grad_norm": 1.6707126726299286, "kl": 0.0311279296875, "learning_rate": 9.998821076195156e-07, "loss": 0.0012, "reward": 1.8693695068359375, "reward_std": 0.27125391364097595, "rewards/accuracy_reward": 0.5220304131507874, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2366245985031128, "step": 307 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 463.2857360839844, "epoch": 0.006957621758380772, "grad_norm": 2.1943065459758846, "kl": 0.0341796875, "learning_rate": 9.99881335852561e-07, "loss": 0.0014, "reward": 1.7360156774520874, "reward_std": 0.28460654616355896, "rewards/accuracy_reward": 0.44511163234710693, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2123325914144516, "step": 308 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 376.6071472167969, "epoch": 0.006980211439414475, "grad_norm": 2.4352774259516803, "kl": 0.03662109375, "learning_rate": 9.998805615680057e-07, "loss": 0.0015, "reward": 1.784929633140564, "reward_std": 0.17021919786930084, "rewards/accuracy_reward": 0.4810965955257416, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1966901570558548, "step": 309 }, { "all_correct": 0.0, "all_wrong": 0.5714285714285714, "completion_length": 411.1071472167969, "epoch": 0.0070028011204481795, "grad_norm": 0.7767920041770567, "kl": 0.0284423828125, "learning_rate": 9.998797847658534e-07, "loss": 0.0011, "reward": 1.1996198892593384, "reward_std": 0.28164488077163696, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.0567626990377903, "step": 310 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 366.4821472167969, "epoch": 0.007025390801481883, "grad_norm": 1.4876424468815261, "kl": 0.038818359375, "learning_rate": 9.998790054461082e-07, "loss": 0.0016, "reward": 1.834550142288208, "reward_std": 0.18738766014575958, "rewards/accuracy_reward": 0.5592901706695557, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1859741359949112, "step": 311 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 421.0000305175781, "epoch": 0.007047980482515587, "grad_norm": 1.7713068445762912, "kl": 0.0341796875, "learning_rate": 9.998782236087738e-07, "loss": 0.0014, "reward": 2.0045981407165527, "reward_std": 0.2822612524032593, "rewards/accuracy_reward": 0.5784924626350403, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2939627766609192, "step": 312 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 382.39288330078125, "epoch": 0.00707057016354929, "grad_norm": 2.075576485387812, "kl": 0.037841796875, "learning_rate": 9.998774392538545e-07, "loss": 0.0015, "reward": 1.929435133934021, "reward_std": 0.1263498216867447, "rewards/accuracy_reward": 0.5698821544647217, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2381243109703064, "step": 313 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 372.6250305175781, "epoch": 0.007093159844582995, "grad_norm": 3.5368404314961723, "kl": 0.04443359375, "learning_rate": 9.99876652381354e-07, "loss": 0.0018, "reward": 1.823461890220642, "reward_std": 0.24591940641403198, "rewards/accuracy_reward": 0.47404423356056213, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2244175672531128, "step": 314 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 369.1964416503906, "epoch": 0.007115749525616698, "grad_norm": 3.1876448533743598, "kl": 0.03662109375, "learning_rate": 9.998758629912761e-07, "loss": 0.0015, "reward": 1.6396740674972534, "reward_std": 0.1476828157901764, "rewards/accuracy_reward": 0.4047619104385376, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1420549750328064, "step": 315 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 372.58929443359375, "epoch": 0.007138339206650402, "grad_norm": 4.182389231549157, "kl": 0.03955078125, "learning_rate": 9.998750710836255e-07, "loss": 0.0016, "reward": 2.1294972896575928, "reward_std": 0.06633459031581879, "rewards/accuracy_reward": 0.6541240215301514, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2932303547859192, "step": 316 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 378.1071472167969, "epoch": 0.0071609288876841055, "grad_norm": 3.9348260916828792, "kl": 0.03173828125, "learning_rate": 9.99874276658405e-07, "loss": 0.0013, "reward": 1.544913411140442, "reward_std": 0.2293895035982132, "rewards/accuracy_reward": 0.2906129062175751, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1578717976808548, "step": 317 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 393.7857360839844, "epoch": 0.00718351856871781, "grad_norm": 2.1865038977484277, "kl": 0.032470703125, "learning_rate": 9.998734797156198e-07, "loss": 0.0013, "reward": 2.2162139415740967, "reward_std": 0.32262536883354187, "rewards/accuracy_reward": 0.779596209526062, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2830461859703064, "step": 318 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 387.5357360839844, "epoch": 0.007206108249751514, "grad_norm": 2.2055794382707474, "kl": 0.029052734375, "learning_rate": 9.998726802552732e-07, "loss": 0.0012, "reward": 1.8024026155471802, "reward_std": 0.10305539518594742, "rewards/accuracy_reward": 0.47789067029953003, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2137974351644516, "step": 319 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 412.6964416503906, "epoch": 0.007228697930785217, "grad_norm": 2.480314260320467, "kl": 0.03515625, "learning_rate": 9.998718782773694e-07, "loss": 0.0014, "reward": 1.5359152555465698, "reward_std": 0.2556445300579071, "rewards/accuracy_reward": 0.4250161051750183, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.1537562906742096, "step": 320 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 371.9285888671875, "epoch": 0.007251287611818921, "grad_norm": 3.199366213775915, "kl": 0.0361328125, "learning_rate": 9.998710737819126e-07, "loss": 0.0014, "reward": 1.6321319341659546, "reward_std": 0.05170086771249771, "rewards/accuracy_reward": 0.3125150203704834, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2053309977054596, "step": 321 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 390.9821472167969, "epoch": 0.007273877292852625, "grad_norm": 2.6349043278224435, "kl": 0.03955078125, "learning_rate": 9.998702667689065e-07, "loss": 0.0016, "reward": 1.8492047786712646, "reward_std": 0.1066059097647667, "rewards/accuracy_reward": 0.4598110020160675, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2536795735359192, "step": 322 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 429.9107360839844, "epoch": 0.007296466973886329, "grad_norm": 2.931062556766078, "kl": 0.03173828125, "learning_rate": 9.998694572383555e-07, "loss": 0.0013, "reward": 1.8692848682403564, "reward_std": 0.2728475332260132, "rewards/accuracy_reward": 0.5833333134651184, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1895228922367096, "step": 323 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 427.4821472167969, "epoch": 0.007319056654920032, "grad_norm": 1.9117064823145673, "kl": 0.03564453125, "learning_rate": 9.998686451902633e-07, "loss": 0.0014, "reward": 1.5600475072860718, "reward_std": 0.2905277609825134, "rewards/accuracy_reward": 0.371722549200058, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1490391343832016, "step": 324 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 384.1964416503906, "epoch": 0.0073416463359537365, "grad_norm": 2.17418582841531, "kl": 0.038818359375, "learning_rate": 9.998678306246346e-07, "loss": 0.0016, "reward": 1.9025037288665771, "reward_std": 0.2592600882053375, "rewards/accuracy_reward": 0.5591896772384644, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2361711859703064, "step": 325 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 439.58929443359375, "epoch": 0.00736423601698744, "grad_norm": 2.571126580590726, "kl": 0.0380859375, "learning_rate": 9.998670135414728e-07, "loss": 0.0015, "reward": 1.8325713872909546, "reward_std": 0.13965967297554016, "rewards/accuracy_reward": 0.43750298023223877, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2486397922039032, "step": 326 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 378.1071472167969, "epoch": 0.007386825698021144, "grad_norm": 2.6974977433589693, "kl": 0.034423828125, "learning_rate": 9.998661939407826e-07, "loss": 0.0014, "reward": 1.7790248394012451, "reward_std": 0.25841158628463745, "rewards/accuracy_reward": 0.5000000596046448, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1968819797039032, "step": 327 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 400.3035888671875, "epoch": 0.007409415379054847, "grad_norm": 1.9988745968884971, "kl": 0.03173828125, "learning_rate": 9.998653718225679e-07, "loss": 0.0013, "reward": 1.8029439449310303, "reward_std": 0.09877941757440567, "rewards/accuracy_reward": 0.5105960965156555, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1744907945394516, "step": 328 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 401.5535888671875, "epoch": 0.0074320050600885515, "grad_norm": 17.863638135787493, "kl": 0.0322265625, "learning_rate": 9.998645471868326e-07, "loss": 0.0013, "reward": 1.6356216669082642, "reward_std": 0.1523858904838562, "rewards/accuracy_reward": 0.35986825823783875, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1828962117433548, "step": 329 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 350.96429443359375, "epoch": 0.007454594741122256, "grad_norm": 2.3324062315071012, "kl": 0.03955078125, "learning_rate": 9.998637200335813e-07, "loss": 0.0016, "reward": 1.838255524635315, "reward_std": 0.10554096102714539, "rewards/accuracy_reward": 0.5224351286888123, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2158203274011612, "step": 330 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 404.1785888671875, "epoch": 0.007477184422155959, "grad_norm": 2.4030733375169775, "kl": 0.036865234375, "learning_rate": 9.998628903628177e-07, "loss": 0.0015, "reward": 1.7892835140228271, "reward_std": 0.24735037982463837, "rewards/accuracy_reward": 0.42690908908843994, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2373744547367096, "step": 331 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 386.0357360839844, "epoch": 0.007499774103189663, "grad_norm": 4.013310891917419, "kl": 0.032470703125, "learning_rate": 9.998620581745463e-07, "loss": 0.0013, "reward": 1.8032081127166748, "reward_std": 0.1242733895778656, "rewards/accuracy_reward": 0.4907996654510498, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2052655965089798, "step": 332 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 405.9107360839844, "epoch": 0.007522363784223367, "grad_norm": 1.8476775370232004, "kl": 0.031982421875, "learning_rate": 9.998612234687713e-07, "loss": 0.0013, "reward": 2.020258665084839, "reward_std": 0.28160402178764343, "rewards/accuracy_reward": 0.6515573859214783, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2579869031906128, "step": 333 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 389.83929443359375, "epoch": 0.007544953465257071, "grad_norm": 3.110236791077695, "kl": 0.034912109375, "learning_rate": 9.998603862454969e-07, "loss": 0.0014, "reward": 1.7893526554107666, "reward_std": 0.22658483684062958, "rewards/accuracy_reward": 0.4702049791812897, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2155761867761612, "step": 334 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 384.76788330078125, "epoch": 0.007567543146290774, "grad_norm": 3.9741393530784763, "kl": 0.0419921875, "learning_rate": 9.99859546504727e-07, "loss": 0.0017, "reward": 1.8564965724945068, "reward_std": 0.17727962136268616, "rewards/accuracy_reward": 0.416297048330307, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2937709391117096, "step": 335 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 377.64288330078125, "epoch": 0.007590132827324478, "grad_norm": 1.7190631353811263, "kl": 0.040283203125, "learning_rate": 9.998587042464663e-07, "loss": 0.0016, "reward": 2.1397604942321777, "reward_std": 0.15798740088939667, "rewards/accuracy_reward": 0.7164170145988464, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2840576171875, "step": 336 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 438.6250305175781, "epoch": 0.007612722508358182, "grad_norm": 1.5047604078139445, "kl": 0.0322265625, "learning_rate": 9.998578594707185e-07, "loss": 0.0013, "reward": 1.8521054983139038, "reward_std": 0.2710760831832886, "rewards/accuracy_reward": 0.5823614001274109, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2018868625164032, "step": 337 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 418.58929443359375, "epoch": 0.007635312189391886, "grad_norm": 11.618368194532321, "kl": 0.037841796875, "learning_rate": 9.998570121774884e-07, "loss": 0.0015, "reward": 1.8666402101516724, "reward_std": 0.07445579022169113, "rewards/accuracy_reward": 0.4815431237220764, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2422398328781128, "step": 338 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 382.8035888671875, "epoch": 0.007657901870425589, "grad_norm": 4.414629665376302, "kl": 0.036865234375, "learning_rate": 9.9985616236678e-07, "loss": 0.0015, "reward": 1.561800479888916, "reward_std": 0.1683075726032257, "rewards/accuracy_reward": 0.26951533555984497, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1815708726644516, "step": 339 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 385.08929443359375, "epoch": 0.007680491551459293, "grad_norm": 3.2934427848076693, "kl": 0.036865234375, "learning_rate": 9.998553100385976e-07, "loss": 0.0015, "reward": 1.8097567558288574, "reward_std": 0.3126884400844574, "rewards/accuracy_reward": 0.4410119950771332, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2401733547449112, "step": 340 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 388.64288330078125, "epoch": 0.007703081232492998, "grad_norm": 2.8168703522144694, "kl": 0.0341796875, "learning_rate": 9.998544551929452e-07, "loss": 0.0014, "reward": 2.0518875122070312, "reward_std": 0.22017614543437958, "rewards/accuracy_reward": 0.6015769243240356, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3003104329109192, "step": 341 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 380.6607360839844, "epoch": 0.007725670913526701, "grad_norm": 3.7020425102777152, "kl": 0.041748046875, "learning_rate": 9.998535978298279e-07, "loss": 0.0017, "reward": 1.8871361017227173, "reward_std": 0.2257334440946579, "rewards/accuracy_reward": 0.5321834683418274, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2192382961511612, "step": 342 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 381.1250305175781, "epoch": 0.007748260594560405, "grad_norm": 3.8388941188536974, "kl": 0.032958984375, "learning_rate": 9.998527379492492e-07, "loss": 0.0013, "reward": 1.8000633716583252, "reward_std": 0.12223243713378906, "rewards/accuracy_reward": 0.4533173143863678, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2181745320558548, "step": 343 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 383.9464416503906, "epoch": 0.0077708502755941085, "grad_norm": 3.681829142177806, "kl": 0.0419921875, "learning_rate": 9.998518755512136e-07, "loss": 0.0017, "reward": 2.020153760910034, "reward_std": 0.08506789058446884, "rewards/accuracy_reward": 0.5379444360733032, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3000662922859192, "step": 344 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 402.3035888671875, "epoch": 0.007793439956627813, "grad_norm": 55.65036951763811, "kl": 0.042236328125, "learning_rate": 9.99851010635726e-07, "loss": 0.0017, "reward": 2.1180837154388428, "reward_std": 0.1849765181541443, "rewards/accuracy_reward": 0.7009937763214111, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.284947007894516, "step": 345 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 367.4285888671875, "epoch": 0.007816029637661516, "grad_norm": 3.038121750496329, "kl": 0.039794921875, "learning_rate": 9.9985014320279e-07, "loss": 0.0016, "reward": 2.14558482170105, "reward_std": 0.054539699107408524, "rewards/accuracy_reward": 0.7425889372825623, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2494245320558548, "step": 346 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 405.6250305175781, "epoch": 0.00783861931869522, "grad_norm": 5.587047765236707, "kl": 0.036376953125, "learning_rate": 9.998492732524106e-07, "loss": 0.0015, "reward": 2.0013720989227295, "reward_std": 0.14648419618606567, "rewards/accuracy_reward": 0.5582603216171265, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2788260579109192, "step": 347 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 366.4285888671875, "epoch": 0.007861208999728924, "grad_norm": 10.163161248995552, "kl": 0.045166015625, "learning_rate": 9.99848400784592e-07, "loss": 0.0018, "reward": 1.810409665107727, "reward_std": 0.050742439925670624, "rewards/accuracy_reward": 0.4699658155441284, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2225865125656128, "step": 348 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 419.3750305175781, "epoch": 0.007883798680762628, "grad_norm": 4.746484386287993, "kl": 0.0361328125, "learning_rate": 9.99847525799338e-07, "loss": 0.0014, "reward": 1.6301913261413574, "reward_std": 0.3551667332649231, "rewards/accuracy_reward": 0.38291171193122864, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1758510172367096, "step": 349 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 385.8571472167969, "epoch": 0.007906388361796331, "grad_norm": 2.435313191089197, "kl": 0.044921875, "learning_rate": 9.99846648296654e-07, "loss": 0.0018, "reward": 2.081721782684326, "reward_std": 0.28603726625442505, "rewards/accuracy_reward": 0.6460632681846619, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2820870578289032, "step": 350 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 410.2500305175781, "epoch": 0.007928978042830034, "grad_norm": 4.072286801031304, "kl": 0.04541015625, "learning_rate": 9.998457682765435e-07, "loss": 0.0018, "reward": 2.0178749561309814, "reward_std": 0.07036341726779938, "rewards/accuracy_reward": 0.6061315536499023, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2510288953781128, "step": 351 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 404.5535888671875, "epoch": 0.00795156772386374, "grad_norm": 3.364597040264925, "kl": 0.044921875, "learning_rate": 9.998448857390117e-07, "loss": 0.0018, "reward": 1.9735572338104248, "reward_std": 0.051671020686626434, "rewards/accuracy_reward": 0.52464359998703, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2524850070476532, "step": 352 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 388.83929443359375, "epoch": 0.007974157404897443, "grad_norm": 3.1631996106811577, "kl": 0.03564453125, "learning_rate": 9.998440006840624e-07, "loss": 0.0014, "reward": 1.5173161029815674, "reward_std": 0.27739548683166504, "rewards/accuracy_reward": 0.2596047520637512, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1719970852136612, "step": 353 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 402.0357360839844, "epoch": 0.007996747085931146, "grad_norm": 1.8329185850778753, "kl": 0.0419921875, "learning_rate": 9.998431131117007e-07, "loss": 0.0017, "reward": 2.118600845336914, "reward_std": 0.14393125474452972, "rewards/accuracy_reward": 0.7193052172660828, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.260009765625, "step": 354 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 435.01788330078125, "epoch": 0.008019336766964851, "grad_norm": 2.7880550069253003, "kl": 0.041259765625, "learning_rate": 9.998422230219306e-07, "loss": 0.0017, "reward": 1.6418813467025757, "reward_std": 0.20139877498149872, "rewards/accuracy_reward": 0.3818994164466858, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.195696160197258, "step": 355 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 452.1250305175781, "epoch": 0.008041926447998555, "grad_norm": 3.342930122522253, "kl": 0.032958984375, "learning_rate": 9.998413304147564e-07, "loss": 0.0013, "reward": 1.8321945667266846, "reward_std": 0.3876144289970398, "rewards/accuracy_reward": 0.5288689136505127, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2390398383140564, "step": 356 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 401.64288330078125, "epoch": 0.008064516129032258, "grad_norm": 1.8947258998098218, "kl": 0.038330078125, "learning_rate": 9.998404352901833e-07, "loss": 0.0015, "reward": 2.001668691635132, "reward_std": 0.24861319363117218, "rewards/accuracy_reward": 0.6267001628875732, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2356829047203064, "step": 357 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 391.4285888671875, "epoch": 0.008087105810065961, "grad_norm": 6.014555669321055, "kl": 0.03125, "learning_rate": 9.998395376482152e-07, "loss": 0.0013, "reward": 1.8771814107894897, "reward_std": 0.2544044554233551, "rewards/accuracy_reward": 0.5232157707214355, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2253941297531128, "step": 358 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 383.21429443359375, "epoch": 0.008109695491099666, "grad_norm": 4.417393561157188, "kl": 0.047607421875, "learning_rate": 9.998386374888568e-07, "loss": 0.0019, "reward": 1.5485426187515259, "reward_std": 0.05239222198724747, "rewards/accuracy_reward": 0.28372329473495483, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1541050523519516, "step": 359 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 411.0535888671875, "epoch": 0.00813228517213337, "grad_norm": 4.442374054691859, "kl": 0.04052734375, "learning_rate": 9.998377348121128e-07, "loss": 0.0016, "reward": 1.8049968481063843, "reward_std": 0.3332853317260742, "rewards/accuracy_reward": 0.5435047149658203, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1722063422203064, "step": 360 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 364.4107360839844, "epoch": 0.008154874853167073, "grad_norm": 6.2427906858382665, "kl": 0.0439453125, "learning_rate": 9.998368296179876e-07, "loss": 0.0018, "reward": 1.9210519790649414, "reward_std": 0.1775970757007599, "rewards/accuracy_reward": 0.6044363379478455, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1916155219078064, "step": 361 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 367.1071472167969, "epoch": 0.008177464534200776, "grad_norm": 3.2255106515456227, "kl": 0.04638671875, "learning_rate": 9.998359219064859e-07, "loss": 0.0019, "reward": 1.8181759119033813, "reward_std": 0.1560308337211609, "rewards/accuracy_reward": 0.5040645003318787, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2141113430261612, "step": 362 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 349.26788330078125, "epoch": 0.008200054215234481, "grad_norm": 1.8945894802033385, "kl": 0.0478515625, "learning_rate": 9.99835011677612e-07, "loss": 0.0019, "reward": 2.067822217941284, "reward_std": 0.08710461109876633, "rewards/accuracy_reward": 0.6762728095054626, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2665492594242096, "step": 363 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 438.6250305175781, "epoch": 0.008222643896268185, "grad_norm": 1.8696944566316263, "kl": 0.0303955078125, "learning_rate": 9.998340989313705e-07, "loss": 0.0012, "reward": 1.9326345920562744, "reward_std": 0.28548362851142883, "rewards/accuracy_reward": 0.6355780363082886, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2149135172367096, "step": 364 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 445.1964416503906, "epoch": 0.008245233577301888, "grad_norm": 2.884437742010669, "kl": 0.033935546875, "learning_rate": 9.998331836677662e-07, "loss": 0.0014, "reward": 1.6302653551101685, "reward_std": 0.17346414923667908, "rewards/accuracy_reward": 0.44070395827293396, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.1752755343914032, "step": 365 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 411.5535888671875, "epoch": 0.008267823258335593, "grad_norm": 3.9968624742704324, "kl": 0.03466796875, "learning_rate": 9.998322658868037e-07, "loss": 0.0014, "reward": 1.8994675874710083, "reward_std": 0.2243237942457199, "rewards/accuracy_reward": 0.5730828642845154, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2049560695886612, "step": 366 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 413.5714416503906, "epoch": 0.008290412939369296, "grad_norm": 1.5395084052115322, "kl": 0.0283203125, "learning_rate": 9.998313455884874e-07, "loss": 0.0011, "reward": 1.6522977352142334, "reward_std": 0.32405540347099304, "rewards/accuracy_reward": 0.42155349254608154, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1557442843914032, "step": 367 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 384.96429443359375, "epoch": 0.008313002620403, "grad_norm": 3.4470889987385696, "kl": 0.0390625, "learning_rate": 9.998304227728222e-07, "loss": 0.0016, "reward": 2.056460380554199, "reward_std": 0.27920714020729065, "rewards/accuracy_reward": 0.644884467124939, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2937186360359192, "step": 368 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 388.58929443359375, "epoch": 0.008335592301436703, "grad_norm": 3.342596293658965, "kl": 0.03955078125, "learning_rate": 9.998294974398126e-07, "loss": 0.0016, "reward": 1.591866374015808, "reward_std": 0.08889474719762802, "rewards/accuracy_reward": 0.32813531160354614, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1815883219242096, "step": 369 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 377.1071472167969, "epoch": 0.008358181982470408, "grad_norm": 2.382838358917144, "kl": 0.045654296875, "learning_rate": 9.998285695894635e-07, "loss": 0.0018, "reward": 2.0662600994110107, "reward_std": 0.14085139334201813, "rewards/accuracy_reward": 0.6624495983123779, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2538103461265564, "step": 370 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 381.0000305175781, "epoch": 0.008380771663504111, "grad_norm": 2.8648729109164313, "kl": 0.04345703125, "learning_rate": 9.998276392217792e-07, "loss": 0.0017, "reward": 1.9632720947265625, "reward_std": 0.09597443044185638, "rewards/accuracy_reward": 0.6157325506210327, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2011108547449112, "step": 371 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 387.1071472167969, "epoch": 0.008403361344537815, "grad_norm": 3.5915952543396297, "kl": 0.04345703125, "learning_rate": 9.998267063367647e-07, "loss": 0.0017, "reward": 2.2661774158477783, "reward_std": 0.17516566812992096, "rewards/accuracy_reward": 0.8150298595428467, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2832903265953064, "step": 372 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 371.7500305175781, "epoch": 0.008425951025571518, "grad_norm": 2.0099741084739384, "kl": 0.044189453125, "learning_rate": 9.998257709344243e-07, "loss": 0.0018, "reward": 1.7357620000839233, "reward_std": 0.028665119782090187, "rewards/accuracy_reward": 0.49389174580574036, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1490129828453064, "step": 373 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 397.8571472167969, "epoch": 0.008448540706605223, "grad_norm": 2.2169867056095263, "kl": 0.046142578125, "learning_rate": 9.99824833014763e-07, "loss": 0.0018, "reward": 2.03769850730896, "reward_std": 0.38317492604255676, "rewards/accuracy_reward": 0.6406962871551514, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.27557373046875, "step": 374 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 366.6250305175781, "epoch": 0.008471130387638926, "grad_norm": 1.870483360248504, "kl": 0.048095703125, "learning_rate": 9.998238925777857e-07, "loss": 0.0019, "reward": 2.1103310585021973, "reward_std": 0.09718696027994156, "rewards/accuracy_reward": 0.7016185522079468, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2694266438484192, "step": 375 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 430.8035888671875, "epoch": 0.00849372006867263, "grad_norm": 2.0102186943757894, "kl": 0.039794921875, "learning_rate": 9.998229496234968e-07, "loss": 0.0016, "reward": 1.7661737203598022, "reward_std": 0.10161762684583664, "rewards/accuracy_reward": 0.4240523874759674, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.209978386759758, "step": 376 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 432.3571472167969, "epoch": 0.008516309749706335, "grad_norm": 4.57700074764244, "kl": 0.0546875, "learning_rate": 9.998220041519013e-07, "loss": 0.0022, "reward": 2.0864853858947754, "reward_std": 0.0741170197725296, "rewards/accuracy_reward": 0.6728412508964539, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2529296875, "step": 377 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 417.01788330078125, "epoch": 0.008538899430740038, "grad_norm": 2.412672969191097, "kl": 0.044677734375, "learning_rate": 9.998210561630037e-07, "loss": 0.0018, "reward": 1.9799001216888428, "reward_std": 0.1903795599937439, "rewards/accuracy_reward": 0.6152724027633667, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2253418117761612, "step": 378 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 449.01788330078125, "epoch": 0.008561489111773742, "grad_norm": 7.951581965090125, "kl": 0.044921875, "learning_rate": 9.99820105656809e-07, "loss": 0.0018, "reward": 2.055248260498047, "reward_std": 0.18710957467556, "rewards/accuracy_reward": 0.6898744106292725, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2582310438156128, "step": 379 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 428.3571472167969, "epoch": 0.008584078792807445, "grad_norm": 1.724274735923552, "kl": 0.045654296875, "learning_rate": 9.998191526333218e-07, "loss": 0.0018, "reward": 1.6962324380874634, "reward_std": 0.26645633578300476, "rewards/accuracy_reward": 0.4197709262371063, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1943185031414032, "step": 380 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 431.71429443359375, "epoch": 0.00860666847384115, "grad_norm": 1.5531412291948699, "kl": 0.041259765625, "learning_rate": 9.99818197092547e-07, "loss": 0.0016, "reward": 1.5744372606277466, "reward_std": 0.13288220763206482, "rewards/accuracy_reward": 0.27898502349853516, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1775948703289032, "step": 381 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 396.1785888671875, "epoch": 0.008629258154874853, "grad_norm": 1.7064774730360452, "kl": 0.04150390625, "learning_rate": 9.998172390344896e-07, "loss": 0.0017, "reward": 1.7499686479568481, "reward_std": 0.3445099890232086, "rewards/accuracy_reward": 0.5178571939468384, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1463971883058548, "step": 382 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 364.0000305175781, "epoch": 0.008651847835908557, "grad_norm": 2.1664380282561453, "kl": 0.057373046875, "learning_rate": 9.99816278459154e-07, "loss": 0.0023, "reward": 1.3358556032180786, "reward_std": 0.08457227051258087, "rewards/accuracy_reward": 0.20592550933361053, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0799299031496048, "step": 383 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 388.4464416503906, "epoch": 0.00867443751694226, "grad_norm": 4.272615410046249, "kl": 0.05126953125, "learning_rate": 9.998153153665454e-07, "loss": 0.0021, "reward": 1.7158334255218506, "reward_std": 0.35866767168045044, "rewards/accuracy_reward": 0.47863855957984924, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.133623406291008, "step": 384 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 380.39288330078125, "epoch": 0.008697027197975965, "grad_norm": 2.309051797037609, "kl": 0.04248046875, "learning_rate": 9.998143497566685e-07, "loss": 0.0017, "reward": 2.1418967247009277, "reward_std": 0.17455719411373138, "rewards/accuracy_reward": 0.7216153740882874, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2702811360359192, "step": 385 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 388.3214416503906, "epoch": 0.008719616879009668, "grad_norm": 13.103117402616778, "kl": 0.05126953125, "learning_rate": 9.998133816295283e-07, "loss": 0.002, "reward": 1.9120635986328125, "reward_std": 0.23828883469104767, "rewards/accuracy_reward": 0.4628516137599945, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2813546359539032, "step": 386 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 422.8035888671875, "epoch": 0.008742206560043372, "grad_norm": 7.716710222174443, "kl": 0.04541015625, "learning_rate": 9.998124109851295e-07, "loss": 0.0018, "reward": 1.880523443222046, "reward_std": 0.20103470981121063, "rewards/accuracy_reward": 0.5230039358139038, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2610909640789032, "step": 387 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 374.14288330078125, "epoch": 0.008764796241077077, "grad_norm": 3.782401960688634, "kl": 0.0615234375, "learning_rate": 9.99811437823477e-07, "loss": 0.0025, "reward": 2.0936684608459473, "reward_std": 0.16234861314296722, "rewards/accuracy_reward": 0.6428661346435547, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2865164875984192, "step": 388 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 374.58929443359375, "epoch": 0.00878738592211078, "grad_norm": 2.3679765336379104, "kl": 0.056884765625, "learning_rate": 9.998104621445758e-07, "loss": 0.0023, "reward": 2.215667486190796, "reward_std": 0.2677229344844818, "rewards/accuracy_reward": 0.7915254831314087, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.281284898519516, "step": 389 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 414.8035888671875, "epoch": 0.008809975603144483, "grad_norm": 3.0282614166034074, "kl": 0.0703125, "learning_rate": 9.998094839484306e-07, "loss": 0.0028, "reward": 1.8717529773712158, "reward_std": 0.16758543252944946, "rewards/accuracy_reward": 0.5558524131774902, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1909005343914032, "step": 390 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 371.8214416503906, "epoch": 0.008832565284178187, "grad_norm": 5.248216096921401, "kl": 0.06396484375, "learning_rate": 9.998085032350467e-07, "loss": 0.0026, "reward": 1.7502175569534302, "reward_std": 0.06086720898747444, "rewards/accuracy_reward": 0.44428136944770813, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1916504055261612, "step": 391 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 399.01788330078125, "epoch": 0.008855154965211892, "grad_norm": 3.7700885293911917, "kl": 0.0537109375, "learning_rate": 9.998075200044286e-07, "loss": 0.0021, "reward": 1.7122958898544312, "reward_std": 0.10438226163387299, "rewards/accuracy_reward": 0.3885478675365448, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1808907687664032, "step": 392 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 369.9464416503906, "epoch": 0.008877744646245595, "grad_norm": 3.6745113636536835, "kl": 0.060546875, "learning_rate": 9.998065342565819e-07, "loss": 0.0024, "reward": 1.9073222875595093, "reward_std": 0.18451401591300964, "rewards/accuracy_reward": 0.5531334280967712, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2220459133386612, "step": 393 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 434.58929443359375, "epoch": 0.008900334327279298, "grad_norm": 3.867355352756361, "kl": 0.0478515625, "learning_rate": 9.998055459915108e-07, "loss": 0.0019, "reward": 1.7219464778900146, "reward_std": 0.302684485912323, "rewards/accuracy_reward": 0.5112355947494507, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1571393758058548, "step": 394 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 395.5714416503906, "epoch": 0.008922924008313002, "grad_norm": 5.358953104871426, "kl": 0.046875, "learning_rate": 9.998045552092208e-07, "loss": 0.0019, "reward": 1.6033467054367065, "reward_std": 0.15334564447402954, "rewards/accuracy_reward": 0.30301186442375183, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1824776828289032, "step": 395 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 409.21429443359375, "epoch": 0.008945513689346707, "grad_norm": 4.000350796659094, "kl": 0.037109375, "learning_rate": 9.998035619097164e-07, "loss": 0.0015, "reward": 2.0250978469848633, "reward_std": 0.18860764801502228, "rewards/accuracy_reward": 0.6036378741264343, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2821742594242096, "step": 396 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 438.3571472167969, "epoch": 0.00896810337038041, "grad_norm": 3.9580756555584085, "kl": 0.041259765625, "learning_rate": 9.998025660930033e-07, "loss": 0.0017, "reward": 1.5805302858352661, "reward_std": 0.29100853204727173, "rewards/accuracy_reward": 0.3526962697505951, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.14569091796875, "step": 397 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.2857142857142857, "completion_length": 383.51788330078125, "epoch": 0.008990693051414114, "grad_norm": 6.236269258988159, "kl": 0.046630859375, "learning_rate": 9.998015677590862e-07, "loss": 0.0019, "reward": 1.9776618480682373, "reward_std": 0.04715551808476448, "rewards/accuracy_reward": 0.6630952954292297, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1895664781332016, "step": 398 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.2857142857142857, "completion_length": 356.8571472167969, "epoch": 0.009013282732447819, "grad_norm": 2.0212867702310375, "kl": 0.054443359375, "learning_rate": 9.998005669079696e-07, "loss": 0.0022, "reward": 2.038114070892334, "reward_std": 0.05509406700730324, "rewards/accuracy_reward": 0.7142857313156128, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2095424234867096, "step": 399 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 385.51788330078125, "epoch": 0.009035872413481522, "grad_norm": 1.582414907299189, "kl": 0.05029296875, "learning_rate": 9.997995635396595e-07, "loss": 0.002, "reward": 1.606191873550415, "reward_std": 0.14432214200496674, "rewards/accuracy_reward": 0.3743034899234772, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1318882554769516, "step": 400 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 393.8035888671875, "epoch": 0.009058462094515225, "grad_norm": 4.3080760056163205, "kl": 0.04150390625, "learning_rate": 9.9979855765416e-07, "loss": 0.0017, "reward": 1.7308679819107056, "reward_std": 0.36425265669822693, "rewards/accuracy_reward": 0.37828710675239563, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2168666422367096, "step": 401 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 389.76788330078125, "epoch": 0.009081051775548929, "grad_norm": 2.8713644794125672, "kl": 0.050048828125, "learning_rate": 9.99797549251477e-07, "loss": 0.002, "reward": 2.079434871673584, "reward_std": 0.18583329021930695, "rewards/accuracy_reward": 0.6346873044967651, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2947475016117096, "step": 402 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 387.1964416503906, "epoch": 0.009103641456582634, "grad_norm": 13.650369116722478, "kl": 0.04443359375, "learning_rate": 9.99796538331615e-07, "loss": 0.0018, "reward": 2.058262586593628, "reward_std": 0.1161104217171669, "rewards/accuracy_reward": 0.6393696665763855, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2581787109375, "step": 403 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 383.0357360839844, "epoch": 0.009126231137616337, "grad_norm": 7.608183476272721, "kl": 0.047119140625, "learning_rate": 9.997955248945792e-07, "loss": 0.0019, "reward": 1.9923145771026611, "reward_std": 0.1913708746433258, "rewards/accuracy_reward": 0.6093519926071167, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2686767578125, "step": 404 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 373.1964416503906, "epoch": 0.00914882081865004, "grad_norm": 2.629765464100262, "kl": 0.05517578125, "learning_rate": 9.99794508940375e-07, "loss": 0.0022, "reward": 2.118009328842163, "reward_std": 0.1937105804681778, "rewards/accuracy_reward": 0.6886497139930725, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.272216796875, "step": 405 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 379.9821472167969, "epoch": 0.009171410499683744, "grad_norm": 2.8468143639674413, "kl": 0.044189453125, "learning_rate": 9.997934904690075e-07, "loss": 0.0018, "reward": 1.764988660812378, "reward_std": 0.302298903465271, "rewards/accuracy_reward": 0.4488648772239685, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2054094672203064, "step": 406 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.46429443359375, "epoch": 0.009194000180717449, "grad_norm": 7.738318591683144, "kl": 0.049072265625, "learning_rate": 9.997924694804814e-07, "loss": 0.002, "reward": 1.986049771308899, "reward_std": 0.2690337300300598, "rewards/accuracy_reward": 0.5267340540885925, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2807442843914032, "step": 407 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 424.39288330078125, "epoch": 0.009216589861751152, "grad_norm": 1.272679087509261, "kl": 0.0380859375, "learning_rate": 9.99791445974802e-07, "loss": 0.0015, "reward": 1.7512418031692505, "reward_std": 0.15740473568439484, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.16552734375, "step": 408 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 414.3750305175781, "epoch": 0.009239179542784855, "grad_norm": 1.5996661881139613, "kl": 0.046142578125, "learning_rate": 9.997904199519746e-07, "loss": 0.0018, "reward": 1.9598842859268188, "reward_std": 0.20504829287528992, "rewards/accuracy_reward": 0.6428571939468384, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2027413547039032, "step": 409 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 443.26788330078125, "epoch": 0.00926176922381856, "grad_norm": 2.7855456622237127, "kl": 0.03759765625, "learning_rate": 9.997893914120043e-07, "loss": 0.0015, "reward": 1.6187412738800049, "reward_std": 0.13515536487102509, "rewards/accuracy_reward": 0.38010066747665405, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.1957833468914032, "step": 410 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 436.3214416503906, "epoch": 0.009284358904852264, "grad_norm": 2.4857469577987823, "kl": 0.04150390625, "learning_rate": 9.997883603548965e-07, "loss": 0.0017, "reward": 1.6926316022872925, "reward_std": 0.1848250776529312, "rewards/accuracy_reward": 0.39694592356681824, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2028285562992096, "step": 411 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 449.357177734375, "epoch": 0.009306948585885967, "grad_norm": 3.188422994857852, "kl": 0.034912109375, "learning_rate": 9.99787326780656e-07, "loss": 0.0014, "reward": 2.0740108489990234, "reward_std": 0.14789381623268127, "rewards/accuracy_reward": 0.6426665782928467, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.3242013156414032, "step": 412 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 434.2321472167969, "epoch": 0.00932953826691967, "grad_norm": 2.3965137352073143, "kl": 0.043212890625, "learning_rate": 9.997862906892883e-07, "loss": 0.0017, "reward": 1.776749849319458, "reward_std": 0.12012588977813721, "rewards/accuracy_reward": 0.45576417446136475, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2066999226808548, "step": 413 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 393.7857360839844, "epoch": 0.009352127947953376, "grad_norm": 1.9160529190173872, "kl": 0.04150390625, "learning_rate": 9.997852520807983e-07, "loss": 0.0017, "reward": 1.544538974761963, "reward_std": 0.0341479629278183, "rewards/accuracy_reward": 0.2742263078689575, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1595982164144516, "step": 414 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 407.9464416503906, "epoch": 0.009374717628987079, "grad_norm": 3.311281564438098, "kl": 0.038330078125, "learning_rate": 9.997842109551915e-07, "loss": 0.0015, "reward": 1.7990875244140625, "reward_std": 0.18184107542037964, "rewards/accuracy_reward": 0.46265462040901184, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2185756266117096, "step": 415 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 378.83929443359375, "epoch": 0.009397307310020782, "grad_norm": 2.0858649458110095, "kl": 0.047607421875, "learning_rate": 9.997831673124732e-07, "loss": 0.0019, "reward": 2.3149728775024414, "reward_std": 0.06956717371940613, "rewards/accuracy_reward": 0.8292690515518188, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.310703843832016, "step": 416 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 393.1785888671875, "epoch": 0.009419896991054486, "grad_norm": 7.413036307814448, "kl": 0.039794921875, "learning_rate": 9.997821211526485e-07, "loss": 0.0016, "reward": 1.9092589616775513, "reward_std": 0.23293033242225647, "rewards/accuracy_reward": 0.529473602771759, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2440708875656128, "step": 417 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 365.9464416503906, "epoch": 0.00944248667208819, "grad_norm": 2.5605243800992423, "kl": 0.043212890625, "learning_rate": 9.997810724757226e-07, "loss": 0.0017, "reward": 1.718626618385315, "reward_std": 0.22682899236679077, "rewards/accuracy_reward": 0.42958492040634155, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.196184441447258, "step": 418 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 354.89288330078125, "epoch": 0.009465076353121894, "grad_norm": 2.8747798613225153, "kl": 0.043701171875, "learning_rate": 9.99780021281701e-07, "loss": 0.0018, "reward": 1.9727232456207275, "reward_std": 0.05873589962720871, "rewards/accuracy_reward": 0.5358021855354309, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2904924750328064, "step": 419 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 347.3035888671875, "epoch": 0.009487666034155597, "grad_norm": 12.081815218089627, "kl": 0.044189453125, "learning_rate": 9.997789675705887e-07, "loss": 0.0018, "reward": 1.7444372177124023, "reward_std": 0.06265303492546082, "rewards/accuracy_reward": 0.3838273584842682, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2320382297039032, "step": 420 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 382.3214416503906, "epoch": 0.009510255715189302, "grad_norm": 1.7566334447618293, "kl": 0.03857421875, "learning_rate": 9.997779113423914e-07, "loss": 0.0015, "reward": 1.9446849822998047, "reward_std": 0.28817859292030334, "rewards/accuracy_reward": 0.6069582104682922, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2234410047531128, "step": 421 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 370.4821472167969, "epoch": 0.009532845396223006, "grad_norm": 2.2325006260488873, "kl": 0.040283203125, "learning_rate": 9.997768525971142e-07, "loss": 0.0016, "reward": 1.6806167364120483, "reward_std": 0.15663491189479828, "rewards/accuracy_reward": 0.3688769042491913, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1938825398683548, "step": 422 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 376.7857360839844, "epoch": 0.009555435077256709, "grad_norm": 2.1108848378627165, "kl": 0.05029296875, "learning_rate": 9.997757913347624e-07, "loss": 0.002, "reward": 1.7646428346633911, "reward_std": 0.2395399510860443, "rewards/accuracy_reward": 0.3916376233100891, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.240862175822258, "step": 423 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 367.01788330078125, "epoch": 0.009578024758290412, "grad_norm": 2.883857462428841, "kl": 0.04345703125, "learning_rate": 9.997747275553411e-07, "loss": 0.0017, "reward": 2.0646347999572754, "reward_std": 0.07066681981086731, "rewards/accuracy_reward": 0.6877406239509583, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2268938422203064, "step": 424 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 391.4821472167969, "epoch": 0.009600614439324117, "grad_norm": 2.680181553865584, "kl": 0.033203125, "learning_rate": 9.997736612588562e-07, "loss": 0.0013, "reward": 2.0799381732940674, "reward_std": 0.24066145718097687, "rewards/accuracy_reward": 0.6812530755996704, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2593994140625, "step": 425 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 395.8214416503906, "epoch": 0.00962320412035782, "grad_norm": 11.06452891144969, "kl": 0.049072265625, "learning_rate": 9.99772592445313e-07, "loss": 0.002, "reward": 2.0418777465820312, "reward_std": 0.09047583490610123, "rewards/accuracy_reward": 0.5895688533782959, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.295166015625, "step": 426 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 375.1071472167969, "epoch": 0.009645793801391524, "grad_norm": 3.3675297201304497, "kl": 0.048583984375, "learning_rate": 9.997715211147165e-07, "loss": 0.002, "reward": 2.0075182914733887, "reward_std": 0.12673886120319366, "rewards/accuracy_reward": 0.6015819907188416, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2809361219406128, "step": 427 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 395.14288330078125, "epoch": 0.009668383482425227, "grad_norm": 2.181397097445849, "kl": 0.048583984375, "learning_rate": 9.997704472670722e-07, "loss": 0.0019, "reward": 1.6215420961380005, "reward_std": 0.15886443853378296, "rewards/accuracy_reward": 0.33963635563850403, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.167619988322258, "step": 428 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 425.2321472167969, "epoch": 0.009690973163458932, "grad_norm": 14.415607226771561, "kl": 0.049072265625, "learning_rate": 9.997693709023857e-07, "loss": 0.002, "reward": 1.934741735458374, "reward_std": 0.07087776064872742, "rewards/accuracy_reward": 0.4867541193962097, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2765590250492096, "step": 429 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 415.71429443359375, "epoch": 0.009713562844492636, "grad_norm": 1.898411330382349, "kl": 0.031005859375, "learning_rate": 9.997682920206624e-07, "loss": 0.0012, "reward": 1.8991107940673828, "reward_std": 0.20826785266399384, "rewards/accuracy_reward": 0.5645769834518433, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2273908406496048, "step": 430 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 387.8750305175781, "epoch": 0.009736152525526339, "grad_norm": 2.2876360714969586, "kl": 0.04541015625, "learning_rate": 9.997672106219075e-07, "loss": 0.0018, "reward": 1.6900640726089478, "reward_std": 0.204123392701149, "rewards/accuracy_reward": 0.39924368262290955, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1979631781578064, "step": 431 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 360.8214416503906, "epoch": 0.009758742206560044, "grad_norm": 2.2394428683540535, "kl": 0.05126953125, "learning_rate": 9.997661267061268e-07, "loss": 0.0021, "reward": 2.1064605712890625, "reward_std": 0.05878223851323128, "rewards/accuracy_reward": 0.7129300236701965, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2435302883386612, "step": 432 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 371.0357360839844, "epoch": 0.009781331887593748, "grad_norm": 3.104803226332901, "kl": 0.0498046875, "learning_rate": 9.997650402733252e-07, "loss": 0.002, "reward": 1.904751181602478, "reward_std": 0.06376233696937561, "rewards/accuracy_reward": 0.5027456283569336, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.234148308634758, "step": 433 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 429.4821472167969, "epoch": 0.00980392156862745, "grad_norm": 1.9705830106893691, "kl": 0.03466796875, "learning_rate": 9.99763951323509e-07, "loss": 0.0014, "reward": 1.8338836431503296, "reward_std": 0.3345312476158142, "rewards/accuracy_reward": 0.511684238910675, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2186279445886612, "step": 434 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 345.6964416503906, "epoch": 0.009826511249661154, "grad_norm": 3.4657368905071912, "kl": 0.04638671875, "learning_rate": 9.99762859856683e-07, "loss": 0.0019, "reward": 1.8772588968276978, "reward_std": 0.2316921502351761, "rewards/accuracy_reward": 0.586309552192688, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2052350789308548, "step": 435 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 409.7321472167969, "epoch": 0.00984910093069486, "grad_norm": 4.231327142353432, "kl": 0.043701171875, "learning_rate": 9.997617658728527e-07, "loss": 0.0018, "reward": 1.875851035118103, "reward_std": 0.19517652690410614, "rewards/accuracy_reward": 0.5434814691543579, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2145124226808548, "step": 436 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 419.6250305175781, "epoch": 0.009871690611728563, "grad_norm": 1.5693783551408238, "kl": 0.048095703125, "learning_rate": 9.997606693720242e-07, "loss": 0.0019, "reward": 1.4551035165786743, "reward_std": 0.15716880559921265, "rewards/accuracy_reward": 0.25659969449043274, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1449323445558548, "step": 437 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 393.6607360839844, "epoch": 0.009894280292762266, "grad_norm": 2.9847776643400463, "kl": 0.04931640625, "learning_rate": 9.997595703542023e-07, "loss": 0.002, "reward": 1.6804968118667603, "reward_std": 0.2673247158527374, "rewards/accuracy_reward": 0.3100900650024414, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2382638156414032, "step": 438 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 389.39288330078125, "epoch": 0.00991686997379597, "grad_norm": 5.754960167895071, "kl": 0.049072265625, "learning_rate": 9.99758468819393e-07, "loss": 0.002, "reward": 1.6472755670547485, "reward_std": 0.09376844018697739, "rewards/accuracy_reward": 0.36260753870010376, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1775251179933548, "step": 439 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 390.9107360839844, "epoch": 0.009939459654829674, "grad_norm": 2.053632827313075, "kl": 0.053955078125, "learning_rate": 9.997573647676019e-07, "loss": 0.0022, "reward": 1.8259949684143066, "reward_std": 0.1322939246892929, "rewards/accuracy_reward": 0.5370789170265198, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1746303141117096, "step": 440 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 371.1785888671875, "epoch": 0.009962049335863378, "grad_norm": 1.8837268537552008, "kl": 0.057861328125, "learning_rate": 9.99756258198834e-07, "loss": 0.0023, "reward": 1.7408876419067383, "reward_std": 0.0477709136903286, "rewards/accuracy_reward": 0.41187673807144165, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2004394680261612, "step": 441 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 378.51788330078125, "epoch": 0.009984639016897081, "grad_norm": 2.5897412657661834, "kl": 0.045166015625, "learning_rate": 9.997551491130958e-07, "loss": 0.0018, "reward": 2.1974408626556396, "reward_std": 0.163031205534935, "rewards/accuracy_reward": 0.778460681438446, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2582659125328064, "step": 442 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 431.51788330078125, "epoch": 0.010007228697930786, "grad_norm": 2.403041693062637, "kl": 0.0380859375, "learning_rate": 9.99754037510392e-07, "loss": 0.0015, "reward": 1.826485276222229, "reward_std": 0.177900493144989, "rewards/accuracy_reward": 0.5156924724578857, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1965070515871048, "step": 443 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 387.5714416503906, "epoch": 0.01002981837896449, "grad_norm": 4.89313416503314, "kl": 0.05322265625, "learning_rate": 9.997529233907286e-07, "loss": 0.0021, "reward": 1.8913795948028564, "reward_std": 0.208868607878685, "rewards/accuracy_reward": 0.47358158230781555, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2785121500492096, "step": 444 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 409.0000305175781, "epoch": 0.010052408059998193, "grad_norm": 4.919231561334678, "kl": 0.04150390625, "learning_rate": 9.997518067541111e-07, "loss": 0.0017, "reward": 1.9203178882598877, "reward_std": 0.08214937895536423, "rewards/accuracy_reward": 0.5290005207061768, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2341744601726532, "step": 445 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 380.4285888671875, "epoch": 0.010074997741031896, "grad_norm": 2.4814661137711247, "kl": 0.054443359375, "learning_rate": 9.997506876005452e-07, "loss": 0.0022, "reward": 1.9264237880706787, "reward_std": 0.06552348285913467, "rewards/accuracy_reward": 0.5324400663375854, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2439836859703064, "step": 446 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 398.2857360839844, "epoch": 0.010097587422065601, "grad_norm": 2.4049651574102904, "kl": 0.041015625, "learning_rate": 9.997495659300365e-07, "loss": 0.0016, "reward": 2.2803828716278076, "reward_std": 0.20022277534008026, "rewards/accuracy_reward": 0.8210809230804443, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3200160562992096, "step": 447 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 385.8035888671875, "epoch": 0.010120177103099304, "grad_norm": 2.163981177001273, "kl": 0.05126953125, "learning_rate": 9.997484417425908e-07, "loss": 0.0021, "reward": 1.8174968957901, "reward_std": 0.3987562358379364, "rewards/accuracy_reward": 0.4885904788970947, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2181919813156128, "step": 448 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 363.2500305175781, "epoch": 0.010142766784133008, "grad_norm": 2.8791526013441815, "kl": 0.058837890625, "learning_rate": 9.997473150382136e-07, "loss": 0.0024, "reward": 1.9211512804031372, "reward_std": 0.0689319372177124, "rewards/accuracy_reward": 0.5471433997154236, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2240077555179596, "step": 449 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 353.9107360839844, "epoch": 0.010165356465166711, "grad_norm": 2.6599247663627303, "kl": 0.057861328125, "learning_rate": 9.997461858169104e-07, "loss": 0.0023, "reward": 1.9897487163543701, "reward_std": 0.09796372801065445, "rewards/accuracy_reward": 0.6213264465332031, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2577078938484192, "step": 450 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 353.5357360839844, "epoch": 0.010187946146200416, "grad_norm": 1.727957951979571, "kl": 0.050537109375, "learning_rate": 9.997450540786873e-07, "loss": 0.002, "reward": 1.914562463760376, "reward_std": 0.24219217896461487, "rewards/accuracy_reward": 0.5664597153663635, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2338169813156128, "step": 451 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 366.1250305175781, "epoch": 0.01021053582723412, "grad_norm": 2.4299994005044665, "kl": 0.05517578125, "learning_rate": 9.997439198235497e-07, "loss": 0.0022, "reward": 1.9556424617767334, "reward_std": 0.10090465843677521, "rewards/accuracy_reward": 0.6109054088592529, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2304513156414032, "step": 452 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 372.1250305175781, "epoch": 0.010233125508267823, "grad_norm": 2.9689919707028203, "kl": 0.051513671875, "learning_rate": 9.997427830515036e-07, "loss": 0.0021, "reward": 2.09559965133667, "reward_std": 0.13747623562812805, "rewards/accuracy_reward": 0.6726190447807312, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2694091796875, "step": 453 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 378.1785888671875, "epoch": 0.010255715189301528, "grad_norm": 2.3943132892971186, "kl": 0.057373046875, "learning_rate": 9.997416437625542e-07, "loss": 0.0023, "reward": 1.6050193309783936, "reward_std": 0.211176335811615, "rewards/accuracy_reward": 0.32472145557403564, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1802978664636612, "step": 454 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 391.8750305175781, "epoch": 0.010278304870335231, "grad_norm": 2.643524168284553, "kl": 0.047607421875, "learning_rate": 9.997405019567078e-07, "loss": 0.0019, "reward": 2.0572123527526855, "reward_std": 0.06110613793134689, "rewards/accuracy_reward": 0.6034629344940186, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.289463609457016, "step": 455 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 380.0535888671875, "epoch": 0.010300894551368935, "grad_norm": 2.9032465773155396, "kl": 0.06298828125, "learning_rate": 9.9973935763397e-07, "loss": 0.0025, "reward": 2.2152249813079834, "reward_std": 0.13063356280326843, "rewards/accuracy_reward": 0.7320044040679932, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3010777235031128, "step": 456 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 403.2857360839844, "epoch": 0.010323484232402638, "grad_norm": 2.5659874060470695, "kl": 0.040771484375, "learning_rate": 9.997382107943463e-07, "loss": 0.0016, "reward": 1.794928789138794, "reward_std": 0.21958117187023163, "rewards/accuracy_reward": 0.469834566116333, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2036655992269516, "step": 457 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 358.4107360839844, "epoch": 0.010346073913436343, "grad_norm": 1.954986487098903, "kl": 0.0634765625, "learning_rate": 9.997370614378426e-07, "loss": 0.0025, "reward": 1.668771505355835, "reward_std": 0.15842504799365997, "rewards/accuracy_reward": 0.3977612853050232, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1781529039144516, "step": 458 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 397.6071472167969, "epoch": 0.010368663594470046, "grad_norm": 1.8886950851067892, "kl": 0.049560546875, "learning_rate": 9.99735909564465e-07, "loss": 0.002, "reward": 1.7959160804748535, "reward_std": 0.18703694641590118, "rewards/accuracy_reward": 0.4759451448917389, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1842564195394516, "step": 459 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 376.76788330078125, "epoch": 0.01039125327550375, "grad_norm": 2.5214452261281397, "kl": 0.051025390625, "learning_rate": 9.997347551742187e-07, "loss": 0.002, "reward": 1.6489598751068115, "reward_std": 0.18962426483631134, "rewards/accuracy_reward": 0.35730937123298645, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1916504055261612, "step": 460 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 387.6964416503906, "epoch": 0.010413842956537453, "grad_norm": 12.173110959456286, "kl": 0.048583984375, "learning_rate": 9.9973359826711e-07, "loss": 0.0019, "reward": 2.185236930847168, "reward_std": 0.17428439855575562, "rewards/accuracy_reward": 0.7553573846817017, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2691650390625, "step": 461 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 363.2857360839844, "epoch": 0.010436432637571158, "grad_norm": 3.9449312914857146, "kl": 0.054931640625, "learning_rate": 9.997324388431447e-07, "loss": 0.0022, "reward": 2.230416774749756, "reward_std": 0.11386154592037201, "rewards/accuracy_reward": 0.7470563650131226, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3190743625164032, "step": 462 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 419.5000305175781, "epoch": 0.010459022318604861, "grad_norm": 18.767965422775067, "kl": 0.047119140625, "learning_rate": 9.997312769023284e-07, "loss": 0.0019, "reward": 1.9933644533157349, "reward_std": 0.16877704858779907, "rewards/accuracy_reward": 0.6392766833305359, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2433733344078064, "step": 463 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 379.51788330078125, "epoch": 0.010481611999638565, "grad_norm": 1.7451928863890693, "kl": 0.043701171875, "learning_rate": 9.99730112444667e-07, "loss": 0.0017, "reward": 1.996673822402954, "reward_std": 0.21464431285858154, "rewards/accuracy_reward": 0.5776187181472778, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2726266086101532, "step": 464 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 404.58929443359375, "epoch": 0.01050420168067227, "grad_norm": 1.7377563888905079, "kl": 0.046630859375, "learning_rate": 9.997289454701668e-07, "loss": 0.0019, "reward": 2.0394067764282227, "reward_std": 0.12390829622745514, "rewards/accuracy_reward": 0.68452388048172, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2370256781578064, "step": 465 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 371.96429443359375, "epoch": 0.010526791361705973, "grad_norm": 4.838737841891456, "kl": 0.047607421875, "learning_rate": 9.99727775978833e-07, "loss": 0.0019, "reward": 1.9935518503189087, "reward_std": 0.24593360722064972, "rewards/accuracy_reward": 0.65028315782547, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2432686984539032, "step": 466 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 397.4107360839844, "epoch": 0.010549381042739676, "grad_norm": 4.282814884953462, "kl": 0.052490234375, "learning_rate": 9.997266039706718e-07, "loss": 0.0021, "reward": 2.12546443939209, "reward_std": 0.12309129536151886, "rewards/accuracy_reward": 0.7457713484764099, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.229692742228508, "step": 467 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 390.4821472167969, "epoch": 0.01057197072377338, "grad_norm": 2.7978721441460737, "kl": 0.052734375, "learning_rate": 9.997254294456892e-07, "loss": 0.0021, "reward": 1.6362932920455933, "reward_std": 0.3594817519187927, "rewards/accuracy_reward": 0.38796737790107727, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1590401828289032, "step": 468 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 391.8035888671875, "epoch": 0.010594560404807085, "grad_norm": 4.514376390129834, "kl": 0.058349609375, "learning_rate": 9.99724252403891e-07, "loss": 0.0023, "reward": 1.9486037492752075, "reward_std": 0.16011843085289001, "rewards/accuracy_reward": 0.5428664088249207, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2557373046875, "step": 469 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 386.6071472167969, "epoch": 0.010617150085840788, "grad_norm": 2.9498336060831054, "kl": 0.059814453125, "learning_rate": 9.997230728452832e-07, "loss": 0.0024, "reward": 1.939788579940796, "reward_std": 0.14989230036735535, "rewards/accuracy_reward": 0.5240973234176636, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.262119859457016, "step": 470 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 409.46429443359375, "epoch": 0.010639739766874491, "grad_norm": 2.128148781140512, "kl": 0.052490234375, "learning_rate": 9.997218907698718e-07, "loss": 0.0021, "reward": 1.926666021347046, "reward_std": 0.21869094669818878, "rewards/accuracy_reward": 0.4958082437515259, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.26300048828125, "step": 471 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 378.6964416503906, "epoch": 0.010662329447908195, "grad_norm": 1.8692685047270003, "kl": 0.06298828125, "learning_rate": 9.997207061776625e-07, "loss": 0.0025, "reward": 2.074786424636841, "reward_std": 0.22936013340950012, "rewards/accuracy_reward": 0.7089694738388062, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2301025539636612, "step": 472 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 397.71429443359375, "epoch": 0.0106849191289419, "grad_norm": 4.3128582785930885, "kl": 0.058837890625, "learning_rate": 9.997195190686615e-07, "loss": 0.0024, "reward": 2.2796363830566406, "reward_std": 0.0714561939239502, "rewards/accuracy_reward": 0.7854538559913635, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3048967719078064, "step": 473 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 405.3214416503906, "epoch": 0.010707508809975603, "grad_norm": 4.863979620860495, "kl": 0.0615234375, "learning_rate": 9.997183294428744e-07, "loss": 0.0025, "reward": 2.0231661796569824, "reward_std": 0.15422241389751434, "rewards/accuracy_reward": 0.6015769839286804, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2715890109539032, "step": 474 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 412.6785888671875, "epoch": 0.010730098491009307, "grad_norm": 2.0191988899906135, "kl": 0.056396484375, "learning_rate": 9.997171373003076e-07, "loss": 0.0023, "reward": 2.0831298828125, "reward_std": 0.07933492958545685, "rewards/accuracy_reward": 0.6769005060195923, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.259800523519516, "step": 475 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 410.2857360839844, "epoch": 0.010752688172043012, "grad_norm": 2.14076652530316, "kl": 0.05029296875, "learning_rate": 9.997159426409674e-07, "loss": 0.002, "reward": 1.9360425472259521, "reward_std": 0.27402767539024353, "rewards/accuracy_reward": 0.5032858848571777, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2898995578289032, "step": 476 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 444.607177734375, "epoch": 0.010775277853076715, "grad_norm": 2.4664816790622783, "kl": 0.0537109375, "learning_rate": 9.997147454648588e-07, "loss": 0.0022, "reward": 1.7078007459640503, "reward_std": 0.19277305901050568, "rewards/accuracy_reward": 0.46721675992012024, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1834411770105362, "step": 477 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 428.1785888671875, "epoch": 0.010797867534110418, "grad_norm": 1.8621219518724312, "kl": 0.048583984375, "learning_rate": 9.99713545771989e-07, "loss": 0.0019, "reward": 1.5817593336105347, "reward_std": 0.1452091485261917, "rewards/accuracy_reward": 0.40697208046913147, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.1747872531414032, "step": 478 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 389.39288330078125, "epoch": 0.010820457215144122, "grad_norm": 1.7674708491410362, "kl": 0.05810546875, "learning_rate": 9.99712343562363e-07, "loss": 0.0023, "reward": 1.643746018409729, "reward_std": 0.12120793759822845, "rewards/accuracy_reward": 0.32399484515190125, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2090366929769516, "step": 479 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 382.89288330078125, "epoch": 0.010843046896177827, "grad_norm": 3.737799476583445, "kl": 0.06298828125, "learning_rate": 9.997111388359877e-07, "loss": 0.0025, "reward": 1.8111934661865234, "reward_std": 0.03175181522965431, "rewards/accuracy_reward": 0.46482399106025696, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2070835679769516, "step": 480 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 380.2500305175781, "epoch": 0.01086563657721153, "grad_norm": 2.300440436946896, "kl": 0.07177734375, "learning_rate": 9.997099315928687e-07, "loss": 0.0029, "reward": 1.8798134326934814, "reward_std": 0.21333296597003937, "rewards/accuracy_reward": 0.5594868659973145, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1917550265789032, "step": 481 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 374.9285888671875, "epoch": 0.010888226258245233, "grad_norm": 2.0281031676064822, "kl": 0.053466796875, "learning_rate": 9.99708721833012e-07, "loss": 0.0021, "reward": 1.423134207725525, "reward_std": 0.2324414700269699, "rewards/accuracy_reward": 0.3035714328289032, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0695626437664032, "step": 482 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 364.9107360839844, "epoch": 0.010910815939278937, "grad_norm": 1.582767570127536, "kl": 0.046142578125, "learning_rate": 9.99707509556424e-07, "loss": 0.0018, "reward": 1.9347820281982422, "reward_std": 0.29487964510917664, "rewards/accuracy_reward": 0.5952381491661072, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2466866672039032, "step": 483 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 364.8214416503906, "epoch": 0.010933405620312642, "grad_norm": 2.6244278310763436, "kl": 0.064453125, "learning_rate": 9.997062947631107e-07, "loss": 0.0026, "reward": 1.6703726053237915, "reward_std": 0.04038035869598389, "rewards/accuracy_reward": 0.39224761724472046, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1852678656578064, "step": 484 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 385.3214416503906, "epoch": 0.010955995301346345, "grad_norm": 1.2004518259543027, "kl": 0.06591796875, "learning_rate": 9.997050774530784e-07, "loss": 0.0026, "reward": 1.493172526359558, "reward_std": 0.02956664189696312, "rewards/accuracy_reward": 0.288642019033432, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1259591281414032, "step": 485 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 355.1785888671875, "epoch": 0.010978584982380048, "grad_norm": 3.859504220199664, "kl": 0.06591796875, "learning_rate": 9.997038576263326e-07, "loss": 0.0026, "reward": 1.8923091888427734, "reward_std": 0.08397135138511658, "rewards/accuracy_reward": 0.5631621479988098, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.214861199259758, "step": 486 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 355.96429443359375, "epoch": 0.011001174663413752, "grad_norm": 3.0327047926481505, "kl": 0.0673828125, "learning_rate": 9.9970263528288e-07, "loss": 0.0027, "reward": 1.7224953174591064, "reward_std": 0.24008624255657196, "rewards/accuracy_reward": 0.4381237328052521, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1772286593914032, "step": 487 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 360.8571472167969, "epoch": 0.011023764344447457, "grad_norm": 16.546187702101925, "kl": 0.06884765625, "learning_rate": 9.997014104227268e-07, "loss": 0.0028, "reward": 1.9704290628433228, "reward_std": 0.15302737057209015, "rewards/accuracy_reward": 0.5800409317016602, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.254673570394516, "step": 488 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 368.9464416503906, "epoch": 0.01104635402548116, "grad_norm": 2.632306832382785, "kl": 0.07080078125, "learning_rate": 9.99700183045879e-07, "loss": 0.0028, "reward": 1.8865166902542114, "reward_std": 0.23034267127513885, "rewards/accuracy_reward": 0.4754953384399414, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2788783609867096, "step": 489 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 355.58929443359375, "epoch": 0.011068943706514863, "grad_norm": 3.424644336176556, "kl": 0.08203125, "learning_rate": 9.996989531523427e-07, "loss": 0.0033, "reward": 1.8979209661483765, "reward_std": 0.1969013661146164, "rewards/accuracy_reward": 0.5605672597885132, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2159249484539032, "step": 490 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 370.0357360839844, "epoch": 0.011091533387548569, "grad_norm": 2.2182352985760545, "kl": 0.07666015625, "learning_rate": 9.996977207421242e-07, "loss": 0.0031, "reward": 1.983752965927124, "reward_std": 0.2562522888183594, "rewards/accuracy_reward": 0.5934723019599915, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2831377387046814, "step": 491 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 381.9285888671875, "epoch": 0.011114123068582272, "grad_norm": 51.481047983481325, "kl": 0.322265625, "learning_rate": 9.996964858152296e-07, "loss": 0.013, "reward": 1.8244318962097168, "reward_std": 0.360969215631485, "rewards/accuracy_reward": 0.5926865339279175, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.2353166937828064, "step": 492 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 389.8750305175781, "epoch": 0.011136712749615975, "grad_norm": 20.763223665555703, "kl": 0.083984375, "learning_rate": 9.996952483716652e-07, "loss": 0.0033, "reward": 1.8418422937393188, "reward_std": 0.24559733271598816, "rewards/accuracy_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1739850789308548, "step": 493 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 410.3035888671875, "epoch": 0.011159302430649679, "grad_norm": 1.8246946626791416, "kl": 0.07666015625, "learning_rate": 9.996940084114372e-07, "loss": 0.0031, "reward": 2.0404906272888184, "reward_std": 0.16661052405834198, "rewards/accuracy_reward": 0.6622331142425537, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2496861219406128, "step": 494 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 381.2857360839844, "epoch": 0.011181892111683384, "grad_norm": 325.3127975608338, "kl": 2.234375, "learning_rate": 9.99692765934552e-07, "loss": 0.0896, "reward": 1.8668735027313232, "reward_std": 0.3607932925224304, "rewards/accuracy_reward": 0.5177174806594849, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2420131266117096, "step": 495 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 394.9821472167969, "epoch": 0.011204481792717087, "grad_norm": 7.279687858150825, "kl": 0.09375, "learning_rate": 9.996915209410158e-07, "loss": 0.0037, "reward": 1.986413598060608, "reward_std": 0.19808821380138397, "rewards/accuracy_reward": 0.5703299045562744, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.280369371175766, "step": 496 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 423.8035888671875, "epoch": 0.01122707147375079, "grad_norm": 3.6315007919076847, "kl": 0.07373046875, "learning_rate": 9.996902734308345e-07, "loss": 0.0029, "reward": 1.5460346937179565, "reward_std": 0.11008824408054352, "rewards/accuracy_reward": 0.34093815088272095, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.130096435546875, "step": 497 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 406.21429443359375, "epoch": 0.011249661154784494, "grad_norm": 9.235769000452764, "kl": 0.1005859375, "learning_rate": 9.99689023404015e-07, "loss": 0.004, "reward": 2.094040632247925, "reward_std": 0.1678343564271927, "rewards/accuracy_reward": 0.686144232749939, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2721819281578064, "step": 498 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 376.7321472167969, "epoch": 0.011272250835818199, "grad_norm": 2.5655993613132484, "kl": 0.06884765625, "learning_rate": 9.99687770860563e-07, "loss": 0.0027, "reward": 1.9894109964370728, "reward_std": 0.2566153407096863, "rewards/accuracy_reward": 0.6008262634277344, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2457275539636612, "step": 499 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 400.0535888671875, "epoch": 0.011294840516851902, "grad_norm": 2.068672282122027, "kl": 0.08740234375, "learning_rate": 9.996865158004851e-07, "loss": 0.0035, "reward": 1.9349946975708008, "reward_std": 0.06428404152393341, "rewards/accuracy_reward": 0.572946310043335, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2156197726726532, "step": 500 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 406.64288330078125, "epoch": 0.011317430197885605, "grad_norm": 3.028545165376012, "kl": 0.08740234375, "learning_rate": 9.99685258223788e-07, "loss": 0.0035, "reward": 1.9346755743026733, "reward_std": 0.24455709755420685, "rewards/accuracy_reward": 0.5605124831199646, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2313058078289032, "step": 501 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 414.1071472167969, "epoch": 0.01134001987891931, "grad_norm": 2.419079596542988, "kl": 0.0791015625, "learning_rate": 9.996839981304775e-07, "loss": 0.0032, "reward": 1.6461514234542847, "reward_std": 0.05834679305553436, "rewards/accuracy_reward": 0.3196796178817749, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1871861070394516, "step": 502 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 396.51788330078125, "epoch": 0.011362609559953014, "grad_norm": 4.068939880726073, "kl": 0.08203125, "learning_rate": 9.996827355205599e-07, "loss": 0.0033, "reward": 1.6124058961868286, "reward_std": 0.2015247493982315, "rewards/accuracy_reward": 0.3308210074901581, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1780133992433548, "step": 503 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 417.3214416503906, "epoch": 0.011385199240986717, "grad_norm": 2.576048729593263, "kl": 0.08837890625, "learning_rate": 9.996814703940417e-07, "loss": 0.0035, "reward": 1.5865702629089355, "reward_std": 0.2282116562128067, "rewards/accuracy_reward": 0.37570422887802124, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1465802937746048, "step": 504 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 418.3571472167969, "epoch": 0.01140778892202042, "grad_norm": 2.3424521895830233, "kl": 0.0830078125, "learning_rate": 9.996802027509295e-07, "loss": 0.0033, "reward": 2.088595151901245, "reward_std": 0.2781344950199127, "rewards/accuracy_reward": 0.7417423129081726, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2361384928226471, "step": 505 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 420.4285888671875, "epoch": 0.011430378603054125, "grad_norm": 1.6757613173732169, "kl": 0.07470703125, "learning_rate": 9.996789325912296e-07, "loss": 0.003, "reward": 1.7274904251098633, "reward_std": 0.153874009847641, "rewards/accuracy_reward": 0.34423840045928955, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2439662516117096, "step": 506 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 386.2857360839844, "epoch": 0.011452968284087829, "grad_norm": 2.5384004482469034, "kl": 0.0732421875, "learning_rate": 9.996776599149481e-07, "loss": 0.0029, "reward": 1.7929176092147827, "reward_std": 0.056754425168037415, "rewards/accuracy_reward": 0.4464121162891388, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2215053141117096, "step": 507 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 420.4464416503906, "epoch": 0.011475557965121532, "grad_norm": 1.941251138857776, "kl": 0.055908203125, "learning_rate": 9.996763847220918e-07, "loss": 0.0022, "reward": 1.7270690202713013, "reward_std": 0.32464534044265747, "rewards/accuracy_reward": 0.4748002290725708, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1736973375082016, "step": 508 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.2857142857142857, "completion_length": 400.2500305175781, "epoch": 0.011498147646155235, "grad_norm": 1.6638849125014987, "kl": 0.06396484375, "learning_rate": 9.996751070126667e-07, "loss": 0.0026, "reward": 1.9704631567001343, "reward_std": 0.051131028681993484, "rewards/accuracy_reward": 0.6558635234832764, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1788853257894516, "step": 509 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 373.1964416503906, "epoch": 0.01152073732718894, "grad_norm": 2.574305035209971, "kl": 0.064453125, "learning_rate": 9.996738267866796e-07, "loss": 0.0026, "reward": 2.010843276977539, "reward_std": 0.19825027883052826, "rewards/accuracy_reward": 0.5547606945037842, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3239397406578064, "step": 510 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 379.9285888671875, "epoch": 0.011543327008222644, "grad_norm": 1.4291015758885277, "kl": 0.0595703125, "learning_rate": 9.996725440441366e-07, "loss": 0.0024, "reward": 1.702086329460144, "reward_std": 0.1294317990541458, "rewards/accuracy_reward": 0.44677796959877014, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1553083211183548, "step": 511 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 438.1964416503906, "epoch": 0.011565916689256347, "grad_norm": 1.5883169679128541, "kl": 0.04150390625, "learning_rate": 9.996712587850448e-07, "loss": 0.0017, "reward": 2.17756724357605, "reward_std": 0.42758074402809143, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2739955484867096, "step": 512 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 376.6071472167969, "epoch": 0.011588506370290052, "grad_norm": 1.9781021106728167, "kl": 0.068359375, "learning_rate": 9.9966997100941e-07, "loss": 0.0027, "reward": 1.6757017374038696, "reward_std": 0.11882271617650986, "rewards/accuracy_reward": 0.39860570430755615, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1842389851808548, "step": 513 }, { "all_correct": 0.0, "all_wrong": 0.5714285714285714, "completion_length": 367.39288330078125, "epoch": 0.011611096051323756, "grad_norm": 1.7544346615445978, "kl": 0.05810546875, "learning_rate": 9.99668680717239e-07, "loss": 0.0023, "reward": 1.2855687141418457, "reward_std": 0.0858425423502922, "rewards/accuracy_reward": 0.1422441303730011, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0861816480755806, "step": 514 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 359.96429443359375, "epoch": 0.011633685732357459, "grad_norm": 1.6352947974933327, "kl": 0.058349609375, "learning_rate": 9.996673879085382e-07, "loss": 0.0023, "reward": 1.7918331623077393, "reward_std": 0.179501473903656, "rewards/accuracy_reward": 0.49047985672950745, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1906389594078064, "step": 515 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 414.0357360839844, "epoch": 0.011656275413391162, "grad_norm": 1.5629648544254977, "kl": 0.044677734375, "learning_rate": 9.99666092583314e-07, "loss": 0.0018, "reward": 1.8440616130828857, "reward_std": 0.3023635745048523, "rewards/accuracy_reward": 0.5507999062538147, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2361188679933548, "step": 516 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 438.51788330078125, "epoch": 0.011678865094424867, "grad_norm": 74.86398462474662, "kl": 0.05712890625, "learning_rate": 9.996647947415732e-07, "loss": 0.0023, "reward": 1.617402195930481, "reward_std": 0.10179625451564789, "rewards/accuracy_reward": 0.4092303216457367, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.2153145968914032, "step": 517 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 368.71429443359375, "epoch": 0.01170145477545857, "grad_norm": 5.443973858004544, "kl": 0.05224609375, "learning_rate": 9.996634943833225e-07, "loss": 0.0021, "reward": 1.992806077003479, "reward_std": 0.3535667955875397, "rewards/accuracy_reward": 0.6190475821495056, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2451869547367096, "step": 518 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 341.89288330078125, "epoch": 0.011724044456492274, "grad_norm": 3.5731763531481384, "kl": 0.057861328125, "learning_rate": 9.996621915085678e-07, "loss": 0.0023, "reward": 1.9462376832962036, "reward_std": 0.09396544843912125, "rewards/accuracy_reward": 0.52205890417099, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2920357882976532, "step": 519 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 362.6250305175781, "epoch": 0.011746634137525977, "grad_norm": 2.2404613916734695, "kl": 0.051513671875, "learning_rate": 9.99660886117316e-07, "loss": 0.0021, "reward": 1.8852167129516602, "reward_std": 0.28981322050094604, "rewards/accuracy_reward": 0.49971848726272583, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.260498046875, "step": 520 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 328.26788330078125, "epoch": 0.011769223818559682, "grad_norm": 2.2476633460265494, "kl": 0.0625, "learning_rate": 9.99659578209574e-07, "loss": 0.0025, "reward": 1.7127375602722168, "reward_std": 0.04528288543224335, "rewards/accuracy_reward": 0.4630340337753296, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1782749742269516, "step": 521 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 367.1964416503906, "epoch": 0.011791813499593386, "grad_norm": 2.3492761798924513, "kl": 0.05322265625, "learning_rate": 9.996582677853478e-07, "loss": 0.0021, "reward": 1.902706503868103, "reward_std": 0.266227662563324, "rewards/accuracy_reward": 0.4866524934768677, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2946254312992096, "step": 522 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 346.0000305175781, "epoch": 0.011814403180627089, "grad_norm": 6.174324673003354, "kl": 0.10107421875, "learning_rate": 9.996569548446444e-07, "loss": 0.004, "reward": 2.093201160430908, "reward_std": 0.1383826732635498, "rewards/accuracy_reward": 0.6904212832450867, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2777797281742096, "step": 523 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 346.89288330078125, "epoch": 0.011836992861660794, "grad_norm": 8.947196929891597, "kl": 0.08544921875, "learning_rate": 9.996556393874702e-07, "loss": 0.0034, "reward": 1.894531488418579, "reward_std": 0.2795908451080322, "rewards/accuracy_reward": 0.5912982821464539, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1889474093914032, "step": 524 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 363.6785888671875, "epoch": 0.011859582542694497, "grad_norm": 6.153296662993968, "kl": 0.05078125, "learning_rate": 9.99654321413832e-07, "loss": 0.002, "reward": 1.375449299812317, "reward_std": 0.21684937179088593, "rewards/accuracy_reward": 0.1737053096294403, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1267438679933548, "step": 525 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 358.5000305175781, "epoch": 0.0118821722237282, "grad_norm": 7.887963860288554, "kl": 0.0546875, "learning_rate": 9.996530009237363e-07, "loss": 0.0022, "reward": 1.7664464712142944, "reward_std": 0.3628794848918915, "rewards/accuracy_reward": 0.515874445438385, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1755719929933548, "step": 526 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 377.0714416503906, "epoch": 0.011904761904761904, "grad_norm": 7.891544202892382, "kl": 0.045654296875, "learning_rate": 9.996516779171898e-07, "loss": 0.0018, "reward": 1.852524995803833, "reward_std": 0.22229188680648804, "rewards/accuracy_reward": 0.5546594858169556, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.187151238322258, "step": 527 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 334.33929443359375, "epoch": 0.01192735158579561, "grad_norm": 2.0210046638353494, "kl": 0.061279296875, "learning_rate": 9.996503523941992e-07, "loss": 0.0025, "reward": 1.8055769205093384, "reward_std": 0.20259644091129303, "rewards/accuracy_reward": 0.5535714626312256, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1984340250492096, "step": 528 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 369.8750305175781, "epoch": 0.011949941266829313, "grad_norm": 3.540916222853531, "kl": 0.050537109375, "learning_rate": 9.996490243547712e-07, "loss": 0.002, "reward": 1.8022208213806152, "reward_std": 0.21147766709327698, "rewards/accuracy_reward": 0.4861477017402649, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2160731852054596, "step": 529 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 382.6964416503906, "epoch": 0.011972530947863016, "grad_norm": 3.7066017252287904, "kl": 0.05126953125, "learning_rate": 9.996476937989124e-07, "loss": 0.002, "reward": 1.5794137716293335, "reward_std": 0.22840949892997742, "rewards/accuracy_reward": 0.28435224294662476, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1879185289144516, "step": 530 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 392.26788330078125, "epoch": 0.01199512062889672, "grad_norm": 5.431653796019761, "kl": 0.058349609375, "learning_rate": 9.996463607266295e-07, "loss": 0.0023, "reward": 1.9120413064956665, "reward_std": 0.1530340164899826, "rewards/accuracy_reward": 0.43125849962234497, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.320068359375, "step": 531 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 374.7857360839844, "epoch": 0.012017710309930424, "grad_norm": 1.9388371032060332, "kl": 0.052734375, "learning_rate": 9.996450251379292e-07, "loss": 0.0021, "reward": 1.6142420768737793, "reward_std": 0.04458688944578171, "rewards/accuracy_reward": 0.3523837924003601, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1547154039144516, "step": 532 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 382.5357360839844, "epoch": 0.012040299990964128, "grad_norm": 6.147171850619078, "kl": 0.058349609375, "learning_rate": 9.996436870328184e-07, "loss": 0.0023, "reward": 1.7403159141540527, "reward_std": 0.23306281864643097, "rewards/accuracy_reward": 0.4761904776096344, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.156982421875, "step": 533 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 419.4464416503906, "epoch": 0.012062889671997831, "grad_norm": 4.489351490932065, "kl": 0.0556640625, "learning_rate": 9.996423464113035e-07, "loss": 0.0022, "reward": 1.831758975982666, "reward_std": 0.28459587693214417, "rewards/accuracy_reward": 0.4931812286376953, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2385777235031128, "step": 534 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 444.4464416503906, "epoch": 0.012085479353031536, "grad_norm": 3.0414195816208247, "kl": 0.041748046875, "learning_rate": 9.996410032733916e-07, "loss": 0.0017, "reward": 1.6149439811706543, "reward_std": 0.2665058672428131, "rewards/accuracy_reward": 0.3995805084705353, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1403634250164032, "step": 535 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 388.1785888671875, "epoch": 0.01210806903406524, "grad_norm": 2.113169671535112, "kl": 0.06103515625, "learning_rate": 9.996396576190895e-07, "loss": 0.0024, "reward": 2.118628740310669, "reward_std": 0.1536790430545807, "rewards/accuracy_reward": 0.7082491517066956, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2460937649011612, "step": 536 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 394.8750305175781, "epoch": 0.012130658715098943, "grad_norm": 1.7698862330726879, "kl": 0.05859375, "learning_rate": 9.996383094484037e-07, "loss": 0.0023, "reward": 1.9127498865127563, "reward_std": 0.0536222867667675, "rewards/accuracy_reward": 0.5236838459968567, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2569231390953064, "step": 537 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 387.9107360839844, "epoch": 0.012153248396132646, "grad_norm": 4.211627763431855, "kl": 0.0556640625, "learning_rate": 9.996369587613411e-07, "loss": 0.0022, "reward": 2.2159931659698486, "reward_std": 0.31596288084983826, "rewards/accuracy_reward": 0.7727645039558411, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2825143039226532, "step": 538 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 401.89288330078125, "epoch": 0.012175838077166351, "grad_norm": 2.829456539833602, "kl": 0.0576171875, "learning_rate": 9.996356055579084e-07, "loss": 0.0023, "reward": 1.8727576732635498, "reward_std": 0.08742109686136246, "rewards/accuracy_reward": 0.4398719072341919, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2793143391609192, "step": 539 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 388.5535888671875, "epoch": 0.012198427758200054, "grad_norm": 3.20501237857805, "kl": 0.05810546875, "learning_rate": 9.996342498381126e-07, "loss": 0.0023, "reward": 1.8954187631607056, "reward_std": 0.07145875692367554, "rewards/accuracy_reward": 0.48998814821243286, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2447161078453064, "step": 540 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 396.9464416503906, "epoch": 0.012221017439233758, "grad_norm": 1.9404328210461328, "kl": 0.0654296875, "learning_rate": 9.996328916019604e-07, "loss": 0.0026, "reward": 2.021904706954956, "reward_std": 0.1284814327955246, "rewards/accuracy_reward": 0.5777222514152527, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.3048967719078064, "step": 541 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 469.732177734375, "epoch": 0.012243607120267461, "grad_norm": 2.6032129686317576, "kl": 0.0546875, "learning_rate": 9.996315308494586e-07, "loss": 0.0022, "reward": 1.766984224319458, "reward_std": 0.2577974498271942, "rewards/accuracy_reward": 0.409809947013855, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.2821742594242096, "step": 542 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 402.51788330078125, "epoch": 0.012266196801301166, "grad_norm": 2.0661886942088046, "kl": 0.055908203125, "learning_rate": 9.996301675806142e-07, "loss": 0.0022, "reward": 1.6531507968902588, "reward_std": 0.16763469576835632, "rewards/accuracy_reward": 0.3279414176940918, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1966378390789032, "step": 543 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 374.51788330078125, "epoch": 0.01228878648233487, "grad_norm": 2.8729533777272453, "kl": 0.06787109375, "learning_rate": 9.996288017954341e-07, "loss": 0.0027, "reward": 2.1692655086517334, "reward_std": 0.2879559099674225, "rewards/accuracy_reward": 0.7370458245277405, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2750767469406128, "step": 544 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 361.5357360839844, "epoch": 0.012311376163368573, "grad_norm": 22.813229656736347, "kl": 0.064453125, "learning_rate": 9.99627433493925e-07, "loss": 0.0026, "reward": 1.5764691829681396, "reward_std": 0.20038694143295288, "rewards/accuracy_reward": 0.3489092290401459, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.141845703125, "step": 545 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 385.3035888671875, "epoch": 0.012333965844402278, "grad_norm": 12.083386620096874, "kl": 0.06201171875, "learning_rate": 9.996260626760939e-07, "loss": 0.0025, "reward": 1.9390184879302979, "reward_std": 0.06575751304626465, "rewards/accuracy_reward": 0.5510021448135376, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2344447672367096, "step": 546 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 403.0714416503906, "epoch": 0.012356555525435981, "grad_norm": 1.9359234458942494, "kl": 0.0400390625, "learning_rate": 9.996246893419479e-07, "loss": 0.0016, "reward": 1.9144914150238037, "reward_std": 0.26652154326438904, "rewards/accuracy_reward": 0.6207482814788818, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.193743035197258, "step": 547 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 364.6250305175781, "epoch": 0.012379145206469684, "grad_norm": 4.5417330596258685, "kl": 0.0576171875, "learning_rate": 9.996233134914933e-07, "loss": 0.0023, "reward": 1.8713865280151367, "reward_std": 0.17169418931007385, "rewards/accuracy_reward": 0.5127264261245728, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.233660027384758, "step": 548 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 455.0535888671875, "epoch": 0.012401734887503388, "grad_norm": 1.3449323105547013, "kl": 0.0380859375, "learning_rate": 9.996219351247377e-07, "loss": 0.0015, "reward": 1.9416494369506836, "reward_std": 0.17671842873096466, "rewards/accuracy_reward": 0.648809552192688, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2142682820558548, "step": 549 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 361.4285888671875, "epoch": 0.012424324568537093, "grad_norm": 2.2094127199596243, "kl": 0.0537109375, "learning_rate": 9.996205542416877e-07, "loss": 0.0021, "reward": 1.9394993782043457, "reward_std": 0.15597257018089294, "rewards/accuracy_reward": 0.5767185091972351, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2163521945476532, "step": 550 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 377.4107360839844, "epoch": 0.012446914249570796, "grad_norm": 2.6327008654329314, "kl": 0.052978515625, "learning_rate": 9.996191708423501e-07, "loss": 0.0021, "reward": 1.5568053722381592, "reward_std": 0.34897053241729736, "rewards/accuracy_reward": 0.29211854934692383, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.178972527384758, "step": 551 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 325.08929443359375, "epoch": 0.0124695039306045, "grad_norm": 6.771709663010623, "kl": 0.055419921875, "learning_rate": 9.996177849267322e-07, "loss": 0.0022, "reward": 1.7538303136825562, "reward_std": 0.12404752522706985, "rewards/accuracy_reward": 0.4508098065853119, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2208775281906128, "step": 552 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 338.9107360839844, "epoch": 0.012492093611638203, "grad_norm": 1.8347903559805616, "kl": 0.05126953125, "learning_rate": 9.99616396494841e-07, "loss": 0.002, "reward": 1.6977741718292236, "reward_std": 0.06195138394832611, "rewards/accuracy_reward": 0.45547667145729065, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1494402289390564, "step": 553 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 341.1964416503906, "epoch": 0.012514683292671908, "grad_norm": 4.765911246549767, "kl": 0.05859375, "learning_rate": 9.996150055466832e-07, "loss": 0.0024, "reward": 2.195117235183716, "reward_std": 0.09717575460672379, "rewards/accuracy_reward": 0.7435232400894165, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3123081922531128, "step": 554 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.2857142857142857, "completion_length": 351.1250305175781, "epoch": 0.012537272973705611, "grad_norm": 2.998605071137021, "kl": 0.054931640625, "learning_rate": 9.996136120822661e-07, "loss": 0.0022, "reward": 1.9970704317092896, "reward_std": 0.06412028521299362, "rewards/accuracy_reward": 0.6556640863418579, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2271205484867096, "step": 555 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 373.21429443359375, "epoch": 0.012559862654739315, "grad_norm": 2.4853610781696864, "kl": 0.0390625, "learning_rate": 9.996122161015965e-07, "loss": 0.0016, "reward": 1.6679004430770874, "reward_std": 0.4239775240421295, "rewards/accuracy_reward": 0.4289286136627197, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.160400390625, "step": 556 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 383.4464416503906, "epoch": 0.01258245233577302, "grad_norm": 2.644124799323279, "kl": 0.04931640625, "learning_rate": 9.996108176046817e-07, "loss": 0.002, "reward": 1.7921985387802124, "reward_std": 0.2518872022628784, "rewards/accuracy_reward": 0.46271148324012756, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2152012586593628, "step": 557 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 408.6607360839844, "epoch": 0.012605042016806723, "grad_norm": 8.90820990697975, "kl": 0.04736328125, "learning_rate": 9.996094165915284e-07, "loss": 0.0019, "reward": 2.194398880004883, "reward_std": 0.1172441616654396, "rewards/accuracy_reward": 0.7186524868011475, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3007463812828064, "step": 558 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 377.5357360839844, "epoch": 0.012627631697840426, "grad_norm": 5.114029001556292, "kl": 0.05126953125, "learning_rate": 9.996080130621437e-07, "loss": 0.0021, "reward": 2.0838310718536377, "reward_std": 0.31129732728004456, "rewards/accuracy_reward": 0.644206166267395, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2717677652835846, "step": 559 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 401.0357360839844, "epoch": 0.01265022137887413, "grad_norm": 2.2933533993261777, "kl": 0.045654296875, "learning_rate": 9.99606607016535e-07, "loss": 0.0018, "reward": 1.8119450807571411, "reward_std": 0.19856856763362885, "rewards/accuracy_reward": 0.38002532720565796, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.306919664144516, "step": 560 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 421.26788330078125, "epoch": 0.012672811059907835, "grad_norm": 1.8467328755515218, "kl": 0.0439453125, "learning_rate": 9.99605198454709e-07, "loss": 0.0018, "reward": 1.7816085815429688, "reward_std": 0.4848213195800781, "rewards/accuracy_reward": 0.4798594117164612, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2481776773929596, "step": 561 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 406.71429443359375, "epoch": 0.012695400740941538, "grad_norm": 2.116672143706493, "kl": 0.0595703125, "learning_rate": 9.99603787376673e-07, "loss": 0.0024, "reward": 2.0311450958251953, "reward_std": 0.06030888855457306, "rewards/accuracy_reward": 0.6178393363952637, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2454485297203064, "step": 562 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 416.21429443359375, "epoch": 0.012717990421975241, "grad_norm": 3.109978301085107, "kl": 0.053955078125, "learning_rate": 9.996023737824341e-07, "loss": 0.0022, "reward": 1.7707157135009766, "reward_std": 0.02673652023077011, "rewards/accuracy_reward": 0.4267004728317261, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.204729363322258, "step": 563 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.2857142857142857, "completion_length": 421.4107360839844, "epoch": 0.012740580103008945, "grad_norm": 2.8124257325667585, "kl": 0.048095703125, "learning_rate": 9.996009576719994e-07, "loss": 0.0019, "reward": 1.9035096168518066, "reward_std": 0.04300761967897415, "rewards/accuracy_reward": 0.5479570627212524, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2269810438156128, "step": 564 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 418.6607360839844, "epoch": 0.01276316978404265, "grad_norm": 4.190722832281968, "kl": 0.04541015625, "learning_rate": 9.99599539045376e-07, "loss": 0.0018, "reward": 2.157569646835327, "reward_std": 0.294531911611557, "rewards/accuracy_reward": 0.6925861239433289, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.3042689859867096, "step": 565 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 427.76788330078125, "epoch": 0.012785759465076353, "grad_norm": 2.524515274654514, "kl": 0.058349609375, "learning_rate": 9.995981179025712e-07, "loss": 0.0023, "reward": 1.8585463762283325, "reward_std": 0.08424150198698044, "rewards/accuracy_reward": 0.4219391644001007, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2723214328289032, "step": 566 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 421.08929443359375, "epoch": 0.012808349146110056, "grad_norm": 2.1574925218058705, "kl": 0.053955078125, "learning_rate": 9.995966942435919e-07, "loss": 0.0022, "reward": 1.8901209831237793, "reward_std": 0.048785582184791565, "rewards/accuracy_reward": 0.5266792178153992, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2241559773683548, "step": 567 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 448.58929443359375, "epoch": 0.012830938827143762, "grad_norm": 1.5616094182724591, "kl": 0.0478515625, "learning_rate": 9.995952680684453e-07, "loss": 0.0019, "reward": 1.307595133781433, "reward_std": 0.299481064081192, "rewards/accuracy_reward": 0.21995462477207184, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.0769260972738266, "step": 568 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 415.26788330078125, "epoch": 0.012853528508177465, "grad_norm": 5.277536357254062, "kl": 0.051025390625, "learning_rate": 9.995938393771386e-07, "loss": 0.002, "reward": 1.9695979356765747, "reward_std": 0.2575474679470062, "rewards/accuracy_reward": 0.5358295440673828, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2551967203617096, "step": 569 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 453.8750305175781, "epoch": 0.012876118189211168, "grad_norm": 1.6791030052287708, "kl": 0.043212890625, "learning_rate": 9.995924081696792e-07, "loss": 0.0017, "reward": 1.6611956357955933, "reward_std": 0.3364489674568176, "rewards/accuracy_reward": 0.4642857015132904, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1719098836183548, "step": 570 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 430.08929443359375, "epoch": 0.012898707870244872, "grad_norm": 1.9586575925418255, "kl": 0.052490234375, "learning_rate": 9.995909744460742e-07, "loss": 0.0021, "reward": 1.6509337425231934, "reward_std": 0.04736940562725067, "rewards/accuracy_reward": 0.35095468163490295, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1928362250328064, "step": 571 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 425.1785888671875, "epoch": 0.012921297551278577, "grad_norm": 1.38391732000824, "kl": 0.052490234375, "learning_rate": 9.995895382063308e-07, "loss": 0.0021, "reward": 2.220991611480713, "reward_std": 0.11169137805700302, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2721819281578064, "step": 572 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 415.76788330078125, "epoch": 0.01294388723231228, "grad_norm": 2.0739639153475955, "kl": 0.050537109375, "learning_rate": 9.995880994504562e-07, "loss": 0.002, "reward": 1.6654807329177856, "reward_std": 0.247616246342659, "rewards/accuracy_reward": 0.4004625082015991, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1614467203617096, "step": 573 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 422.01788330078125, "epoch": 0.012966476913345983, "grad_norm": 2.2118988423548167, "kl": 0.057373046875, "learning_rate": 9.995866581784576e-07, "loss": 0.0023, "reward": 1.9814507961273193, "reward_std": 0.045832302421331406, "rewards/accuracy_reward": 0.5252671241760254, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2847551703453064, "step": 574 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 438.21429443359375, "epoch": 0.012989066594379687, "grad_norm": 3.523430771917579, "kl": 0.052734375, "learning_rate": 9.995852143903425e-07, "loss": 0.0021, "reward": 1.8304147720336914, "reward_std": 0.09106610715389252, "rewards/accuracy_reward": 0.5075700879096985, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2014160305261612, "step": 575 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 459.3214416503906, "epoch": 0.013011656275413392, "grad_norm": 3.9245991102114104, "kl": 0.038818359375, "learning_rate": 9.99583768086118e-07, "loss": 0.0016, "reward": 2.1164140701293945, "reward_std": 0.17803265154361725, "rewards/accuracy_reward": 0.6835246682167053, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2828892469406128, "step": 576 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 394.71429443359375, "epoch": 0.013034245956447095, "grad_norm": 3.838255726546675, "kl": 0.044189453125, "learning_rate": 9.995823192657912e-07, "loss": 0.0018, "reward": 2.0554301738739014, "reward_std": 0.1357385665178299, "rewards/accuracy_reward": 0.646996796131134, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2691476047039032, "step": 577 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 434.1785888671875, "epoch": 0.013056835637480798, "grad_norm": 1.2977948595297402, "kl": 0.0458984375, "learning_rate": 9.995808679293697e-07, "loss": 0.0018, "reward": 1.7072798013687134, "reward_std": 0.13669070601463318, "rewards/accuracy_reward": 0.4758414328098297, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1528669148683548, "step": 578 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 386.9107360839844, "epoch": 0.013079425318514503, "grad_norm": 2.6714097236017578, "kl": 0.0498046875, "learning_rate": 9.995794140768608e-07, "loss": 0.002, "reward": 1.3318331241607666, "reward_std": 0.10521180927753448, "rewards/accuracy_reward": 0.19750681519508362, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.095040462911129, "step": 579 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 396.9464416503906, "epoch": 0.013102014999548207, "grad_norm": 5.157701409728502, "kl": 0.047119140625, "learning_rate": 9.995779577082715e-07, "loss": 0.0019, "reward": 1.9079163074493408, "reward_std": 0.05516951158642769, "rewards/accuracy_reward": 0.5133849382400513, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2516741156578064, "step": 580 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 422.71429443359375, "epoch": 0.01312460468058191, "grad_norm": 3.5743132814145677, "kl": 0.0478515625, "learning_rate": 9.995764988236095e-07, "loss": 0.0019, "reward": 1.7528600692749023, "reward_std": 0.13111740350723267, "rewards/accuracy_reward": 0.44464626908302307, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2224993109703064, "step": 581 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 389.64288330078125, "epoch": 0.013147194361615613, "grad_norm": 2.6071556884855642, "kl": 0.0478515625, "learning_rate": 9.99575037422882e-07, "loss": 0.0019, "reward": 1.9921202659606934, "reward_std": 0.2054559290409088, "rewards/accuracy_reward": 0.5096808671951294, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.303867906332016, "step": 582 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 374.4464416503906, "epoch": 0.013169784042649318, "grad_norm": 8.643905878849612, "kl": 0.0517578125, "learning_rate": 9.995735735060961e-07, "loss": 0.0021, "reward": 1.9641375541687012, "reward_std": 0.056870006024837494, "rewards/accuracy_reward": 0.5345615744590759, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.26171875, "step": 583 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 350.8750305175781, "epoch": 0.013192373723683022, "grad_norm": 2.4596634407507496, "kl": 0.049560546875, "learning_rate": 9.995721070732597e-07, "loss": 0.002, "reward": 1.8672609329223633, "reward_std": 0.24126414954662323, "rewards/accuracy_reward": 0.45876115560531616, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.3013567328453064, "step": 584 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 360.4821472167969, "epoch": 0.013214963404716725, "grad_norm": 2.1999639925561203, "kl": 0.046630859375, "learning_rate": 9.9957063812438e-07, "loss": 0.0019, "reward": 1.8647273778915405, "reward_std": 0.30319270491600037, "rewards/accuracy_reward": 0.555941641330719, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2016427218914032, "step": 585 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 359.8750305175781, "epoch": 0.013237553085750428, "grad_norm": 3.44641535384064, "kl": 0.052978515625, "learning_rate": 9.995691666594642e-07, "loss": 0.0021, "reward": 1.7504916191101074, "reward_std": 0.15431086719036102, "rewards/accuracy_reward": 0.3823064863681793, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2467564344406128, "step": 586 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 342.26788330078125, "epoch": 0.013260142766784134, "grad_norm": 1.5383884631455584, "kl": 0.052001953125, "learning_rate": 9.995676926785198e-07, "loss": 0.0021, "reward": 1.8274790048599243, "reward_std": 0.16413529217243195, "rewards/accuracy_reward": 0.4869236946105957, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.247698113322258, "step": 587 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 371.2500305175781, "epoch": 0.013282732447817837, "grad_norm": 3.6754326517403477, "kl": 0.04296875, "learning_rate": 9.995662161815543e-07, "loss": 0.0017, "reward": 1.910606861114502, "reward_std": 0.2187267690896988, "rewards/accuracy_reward": 0.5444899797439575, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2339739203453064, "step": 588 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 386.1250305175781, "epoch": 0.01330532212885154, "grad_norm": 2.6029739175698428, "kl": 0.046630859375, "learning_rate": 9.995647371685748e-07, "loss": 0.0019, "reward": 1.790808081626892, "reward_std": 0.16554813086986542, "rewards/accuracy_reward": 0.44290241599082947, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2014770656824112, "step": 589 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 405.71429443359375, "epoch": 0.013327911809885245, "grad_norm": 3.506294974015037, "kl": 0.047119140625, "learning_rate": 9.995632556395894e-07, "loss": 0.0019, "reward": 1.8559846878051758, "reward_std": 0.3083142340183258, "rewards/accuracy_reward": 0.49491626024246216, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2646397352218628, "step": 590 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 375.39288330078125, "epoch": 0.013350501490918949, "grad_norm": 4.012899000847014, "kl": 0.04248046875, "learning_rate": 9.995617715946051e-07, "loss": 0.0017, "reward": 1.8807145357131958, "reward_std": 0.1251801997423172, "rewards/accuracy_reward": 0.5383665561676025, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2280622273683548, "step": 591 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 379.2857360839844, "epoch": 0.013373091171952652, "grad_norm": 4.142691344483956, "kl": 0.046142578125, "learning_rate": 9.995602850336295e-07, "loss": 0.0018, "reward": 1.4357199668884277, "reward_std": 0.3290632367134094, "rewards/accuracy_reward": 0.23129917681217194, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1294206976890564, "step": 592 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.2857142857142857, "completion_length": 346.2321472167969, "epoch": 0.013395680852986355, "grad_norm": 1.6684936659111211, "kl": 0.0537109375, "learning_rate": 9.9955879595667e-07, "loss": 0.0022, "reward": 2.0109424591064453, "reward_std": 0.056528910994529724, "rewards/accuracy_reward": 0.6584731936454773, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2274693250656128, "step": 593 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 363.0000305175781, "epoch": 0.01341827053402006, "grad_norm": 3.906378836872788, "kl": 0.05419921875, "learning_rate": 9.995573043637342e-07, "loss": 0.0022, "reward": 1.7504494190216064, "reward_std": 0.24408508837223053, "rewards/accuracy_reward": 0.40945810079574585, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.212419793009758, "step": 594 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 358.2500305175781, "epoch": 0.013440860215053764, "grad_norm": 6.185790227080461, "kl": 0.0556640625, "learning_rate": 9.995558102548296e-07, "loss": 0.0022, "reward": 2.112039089202881, "reward_std": 0.07288798689842224, "rewards/accuracy_reward": 0.6188889741897583, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.318150132894516, "step": 595 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 369.83929443359375, "epoch": 0.013463449896087467, "grad_norm": 4.905191724066371, "kl": 0.052490234375, "learning_rate": 9.995543136299635e-07, "loss": 0.0021, "reward": 1.6682230234146118, "reward_std": 0.11321936547756195, "rewards/accuracy_reward": 0.3252541720867157, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2215401828289032, "step": 596 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 378.1071472167969, "epoch": 0.01348603957712117, "grad_norm": 2.1770063375545106, "kl": 0.046142578125, "learning_rate": 9.995528144891438e-07, "loss": 0.0018, "reward": 2.0236077308654785, "reward_std": 0.33995404839515686, "rewards/accuracy_reward": 0.6322728395462036, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.252049058675766, "step": 597 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 374.76788330078125, "epoch": 0.013508629258154875, "grad_norm": 2.106856087123011, "kl": 0.050537109375, "learning_rate": 9.995513128323781e-07, "loss": 0.002, "reward": 1.9793565273284912, "reward_std": 0.1439911425113678, "rewards/accuracy_reward": 0.6128594875335693, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2129255086183548, "step": 598 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 374.3214416503906, "epoch": 0.013531218939188579, "grad_norm": 3.7507482040241524, "kl": 0.050537109375, "learning_rate": 9.995498086596735e-07, "loss": 0.002, "reward": 2.0838887691497803, "reward_std": 0.07154090702533722, "rewards/accuracy_reward": 0.6593593955039978, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2852434515953064, "step": 599 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 365.0714416503906, "epoch": 0.013553808620222282, "grad_norm": 1.652286103390495, "kl": 0.04638671875, "learning_rate": 9.995483019710378e-07, "loss": 0.0019, "reward": 1.9825788736343384, "reward_std": 0.23868528008460999, "rewards/accuracy_reward": 0.660714328289032, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2147216945886612, "step": 600 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 383.7321472167969, "epoch": 0.013576398301255987, "grad_norm": 1.3168839828741663, "kl": 0.041259765625, "learning_rate": 9.995467927664787e-07, "loss": 0.0017, "reward": 1.8121339082717896, "reward_std": 0.24195519089698792, "rewards/accuracy_reward": 0.5535714626312256, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1692766547203064, "step": 601 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 365.08929443359375, "epoch": 0.01359898798228969, "grad_norm": 14.376290446417906, "kl": 0.052734375, "learning_rate": 9.995452810460036e-07, "loss": 0.0021, "reward": 1.8652639389038086, "reward_std": 0.10933823883533478, "rewards/accuracy_reward": 0.41616714000701904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2883824110031128, "step": 602 }, { "all_correct": 0.8571428571428571, "all_wrong": 0.0, "completion_length": 373.89288330078125, "epoch": 0.013621577663323394, "grad_norm": 4.4169304755400605, "kl": 0.05517578125, "learning_rate": 9.995437668096202e-07, "loss": 0.0022, "reward": 2.3191044330596924, "reward_std": 0.041870538145303726, "rewards/accuracy_reward": 0.8144219517707825, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3082537055015564, "step": 603 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 375.58929443359375, "epoch": 0.013644167344357097, "grad_norm": 2.494399944005838, "kl": 0.0576171875, "learning_rate": 9.995422500573364e-07, "loss": 0.0023, "reward": 1.6718039512634277, "reward_std": 0.3448234498500824, "rewards/accuracy_reward": 0.3591609597206116, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2090715765953064, "step": 604 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 392.46429443359375, "epoch": 0.013666757025390802, "grad_norm": 1.7886511791984288, "kl": 0.047607421875, "learning_rate": 9.995407307891593e-07, "loss": 0.0019, "reward": 1.8581910133361816, "reward_std": 0.24784667789936066, "rewards/accuracy_reward": 0.517213761806488, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2159772664308548, "step": 605 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 405.8571472167969, "epoch": 0.013689346706424506, "grad_norm": 2.10663874096409, "kl": 0.061279296875, "learning_rate": 9.99539209005097e-07, "loss": 0.0024, "reward": 1.6482301950454712, "reward_std": 0.13671676814556122, "rewards/accuracy_reward": 0.2944948375225067, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2215925008058548, "step": 606 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 388.0714416503906, "epoch": 0.013711936387458209, "grad_norm": 2.478102153552954, "kl": 0.05517578125, "learning_rate": 9.99537684705157e-07, "loss": 0.0022, "reward": 1.945820927619934, "reward_std": 0.05923868343234062, "rewards/accuracy_reward": 0.5943558812141418, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2336077094078064, "step": 607 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 450.0000305175781, "epoch": 0.013734526068491912, "grad_norm": 1.345619045210268, "kl": 0.04296875, "learning_rate": 9.99536157889347e-07, "loss": 0.0017, "reward": 1.7352993488311768, "reward_std": 0.22175952792167664, "rewards/accuracy_reward": 0.5178571939468384, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1638706773519516, "step": 608 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 369.89288330078125, "epoch": 0.013757115749525617, "grad_norm": 2.3229585466216043, "kl": 0.056396484375, "learning_rate": 9.995346285576746e-07, "loss": 0.0023, "reward": 1.8281738758087158, "reward_std": 0.2909362018108368, "rewards/accuracy_reward": 0.5376185178756714, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2119838297367096, "step": 609 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 407.01788330078125, "epoch": 0.01377970543055932, "grad_norm": 2.137373750251689, "kl": 0.05615234375, "learning_rate": 9.995330967101477e-07, "loss": 0.0022, "reward": 1.863587737083435, "reward_std": 0.1798029989004135, "rewards/accuracy_reward": 0.46251338720321655, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2617885172367096, "step": 610 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 393.1785888671875, "epoch": 0.013802295111593024, "grad_norm": 3.026756271460063, "kl": 0.0634765625, "learning_rate": 9.995315623467737e-07, "loss": 0.0025, "reward": 1.7423146963119507, "reward_std": 0.14577238261699677, "rewards/accuracy_reward": 0.3260897397994995, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2376534640789032, "step": 611 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 399.1250305175781, "epoch": 0.013824884792626729, "grad_norm": 2.0968199465669612, "kl": 0.05078125, "learning_rate": 9.995300254675608e-07, "loss": 0.002, "reward": 1.477779746055603, "reward_std": 0.13941507041454315, "rewards/accuracy_reward": 0.3214285969734192, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0992082878947258, "step": 612 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 406.39288330078125, "epoch": 0.013847474473660432, "grad_norm": 2.8558840753162067, "kl": 0.05322265625, "learning_rate": 9.99528486072516e-07, "loss": 0.0021, "reward": 1.9837077856063843, "reward_std": 0.14972443878650665, "rewards/accuracy_reward": 0.5950533151626587, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2457973062992096, "step": 613 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 428.08929443359375, "epoch": 0.013870064154694136, "grad_norm": 2.316494242843762, "kl": 0.041259765625, "learning_rate": 9.99526944161648e-07, "loss": 0.0016, "reward": 1.6401089429855347, "reward_std": 0.21324960887432098, "rewards/accuracy_reward": 0.372345894575119, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1713344156742096, "step": 614 }, { "all_correct": 0.0, "all_wrong": 0.7142857142857143, "completion_length": 380.5535888671875, "epoch": 0.013892653835727839, "grad_norm": 2.0849084539634175, "kl": 0.054931640625, "learning_rate": 9.995253997349638e-07, "loss": 0.0022, "reward": 1.292547583580017, "reward_std": 0.08288845419883728, "rewards/accuracy_reward": 0.16850648820400238, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0704694539308548, "step": 615 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 383.76788330078125, "epoch": 0.013915243516761544, "grad_norm": 1.8165998955436438, "kl": 0.05126953125, "learning_rate": 9.995238527924716e-07, "loss": 0.002, "reward": 2.004776954650879, "reward_std": 0.33494964241981506, "rewards/accuracy_reward": 0.6035770177841187, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2476283609867096, "step": 616 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 369.6071472167969, "epoch": 0.013937833197795247, "grad_norm": 1.49108476139266, "kl": 0.056640625, "learning_rate": 9.995223033341789e-07, "loss": 0.0023, "reward": 1.6247185468673706, "reward_std": 0.14079461991786957, "rewards/accuracy_reward": 0.3602340817451477, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1680559515953064, "step": 617 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 364.6250305175781, "epoch": 0.01396042287882895, "grad_norm": 2.858960181527917, "kl": 0.058349609375, "learning_rate": 9.995207513600937e-07, "loss": 0.0023, "reward": 1.6373239755630493, "reward_std": 0.10653428733348846, "rewards/accuracy_reward": 0.33982470631599426, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.186785027384758, "step": 618 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 366.4821472167969, "epoch": 0.013983012559862654, "grad_norm": 2.054960529567564, "kl": 0.053955078125, "learning_rate": 9.99519196870224e-07, "loss": 0.0022, "reward": 1.8395665884017944, "reward_std": 0.22600838541984558, "rewards/accuracy_reward": 0.525887668132782, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2172502875328064, "step": 619 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 376.1250305175781, "epoch": 0.014005602240896359, "grad_norm": 2.913152954916261, "kl": 0.055908203125, "learning_rate": 9.99517639864577e-07, "loss": 0.0022, "reward": 2.0266408920288086, "reward_std": 0.14069077372550964, "rewards/accuracy_reward": 0.5988644957542419, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.274204820394516, "step": 620 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 381.4285888671875, "epoch": 0.014028191921930062, "grad_norm": 1.7013494848586208, "kl": 0.049072265625, "learning_rate": 9.995160803431612e-07, "loss": 0.002, "reward": 2.1117024421691895, "reward_std": 0.18647252023220062, "rewards/accuracy_reward": 0.6817357540130615, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.287109375, "step": 621 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 386.21429443359375, "epoch": 0.014050781602963766, "grad_norm": 6.746686825358886, "kl": 0.05224609375, "learning_rate": 9.995145183059842e-07, "loss": 0.0021, "reward": 2.183938503265381, "reward_std": 0.07379632443189621, "rewards/accuracy_reward": 0.6891176104545593, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3162493109703064, "step": 622 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 363.5535888671875, "epoch": 0.014073371283997469, "grad_norm": 2.254260389232632, "kl": 0.057861328125, "learning_rate": 9.995129537530536e-07, "loss": 0.0023, "reward": 1.8552300930023193, "reward_std": 0.1707054227590561, "rewards/accuracy_reward": 0.47184911370277405, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2333810031414032, "step": 623 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.5714285714285714, "completion_length": 362.4464416503906, "epoch": 0.014095960965031174, "grad_norm": 1.5400508448574344, "kl": 0.05615234375, "learning_rate": 9.995113866843778e-07, "loss": 0.0023, "reward": 1.4809205532073975, "reward_std": 0.04867074638605118, "rewards/accuracy_reward": 0.2936508059501648, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.130126953125, "step": 624 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 384.8571472167969, "epoch": 0.014118550646064877, "grad_norm": 3.986049546780208, "kl": 0.050048828125, "learning_rate": 9.995098170999643e-07, "loss": 0.002, "reward": 1.9098525047302246, "reward_std": 0.25540345907211304, "rewards/accuracy_reward": 0.5634796619415283, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2106584906578064, "step": 625 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 433.5714416503906, "epoch": 0.01414114032709858, "grad_norm": 2.9273303920447167, "kl": 0.040283203125, "learning_rate": 9.995082449998211e-07, "loss": 0.0016, "reward": 1.7547357082366943, "reward_std": 0.3695370852947235, "rewards/accuracy_reward": 0.4702874720096588, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2023053914308548, "step": 626 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 373.14288330078125, "epoch": 0.014163730008132286, "grad_norm": 5.259288644186226, "kl": 0.054443359375, "learning_rate": 9.995066703839564e-07, "loss": 0.0022, "reward": 2.26841402053833, "reward_std": 0.22670896351337433, "rewards/accuracy_reward": 0.7987256646156311, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3054024875164032, "step": 627 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 349.1071472167969, "epoch": 0.01418631968916599, "grad_norm": 7.4376567771993995, "kl": 0.064453125, "learning_rate": 9.995050932523776e-07, "loss": 0.0026, "reward": 2.164961814880371, "reward_std": 0.06354004889726639, "rewards/accuracy_reward": 0.7298995852470398, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2957763671875, "step": 628 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 406.21429443359375, "epoch": 0.014208909370199693, "grad_norm": 2.037857961412831, "kl": 0.050048828125, "learning_rate": 9.99503513605093e-07, "loss": 0.002, "reward": 1.8339192867279053, "reward_std": 0.15614505112171173, "rewards/accuracy_reward": 0.5360397696495056, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2014508992433548, "step": 629 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 366.4107360839844, "epoch": 0.014231499051233396, "grad_norm": 2.907605813286443, "kl": 0.056884765625, "learning_rate": 9.995019314421108e-07, "loss": 0.0023, "reward": 1.7836151123046875, "reward_std": 0.21902050077915192, "rewards/accuracy_reward": 0.42743822932243347, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2240339070558548, "step": 630 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 382.4821472167969, "epoch": 0.014254088732267101, "grad_norm": 12.740080102726086, "kl": 0.060791015625, "learning_rate": 9.995003467634381e-07, "loss": 0.0024, "reward": 1.9044498205184937, "reward_std": 0.13535773754119873, "rewards/accuracy_reward": 0.4523046314716339, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2878592610359192, "step": 631 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 398.4285888671875, "epoch": 0.014276678413300804, "grad_norm": 7.366753070957468, "kl": 0.059814453125, "learning_rate": 9.99498759569084e-07, "loss": 0.0024, "reward": 1.8908071517944336, "reward_std": 0.2692888081073761, "rewards/accuracy_reward": 0.4655035734176636, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.275303453207016, "step": 632 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 354.64288330078125, "epoch": 0.014299268094334508, "grad_norm": 2.332827089464137, "kl": 0.05859375, "learning_rate": 9.994971698590554e-07, "loss": 0.0023, "reward": 1.975573182106018, "reward_std": 0.12244213372468948, "rewards/accuracy_reward": 0.6141021251678467, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.222185418009758, "step": 633 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 375.26788330078125, "epoch": 0.014321857775368211, "grad_norm": 2.46346263739484, "kl": 0.05078125, "learning_rate": 9.994955776333613e-07, "loss": 0.002, "reward": 1.89081871509552, "reward_std": 0.371194988489151, "rewards/accuracy_reward": 0.5580130815505981, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2149483859539032, "step": 634 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 390.71429443359375, "epoch": 0.014344447456401916, "grad_norm": 3.1995581935492887, "kl": 0.052734375, "learning_rate": 9.99493982892009e-07, "loss": 0.0021, "reward": 1.9519239664077759, "reward_std": 0.22132354974746704, "rewards/accuracy_reward": 0.5085818767547607, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.282627671957016, "step": 635 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 380.39288330078125, "epoch": 0.01436703713743562, "grad_norm": 1.6546711793136655, "kl": 0.05419921875, "learning_rate": 9.994923856350068e-07, "loss": 0.0022, "reward": 1.81325364112854, "reward_std": 0.10756786912679672, "rewards/accuracy_reward": 0.4542306661605835, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2304513156414032, "step": 636 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 377.64288330078125, "epoch": 0.014389626818469323, "grad_norm": 5.872809493988438, "kl": 0.05419921875, "learning_rate": 9.994907858623626e-07, "loss": 0.0022, "reward": 1.6815202236175537, "reward_std": 0.26106059551239014, "rewards/accuracy_reward": 0.377627968788147, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1860351711511612, "step": 637 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 386.3035888671875, "epoch": 0.014412216499503028, "grad_norm": 3.15820219416398, "kl": 0.0576171875, "learning_rate": 9.994891835740848e-07, "loss": 0.0023, "reward": 1.7355629205703735, "reward_std": 0.2410086989402771, "rewards/accuracy_reward": 0.4465142786502838, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.167619988322258, "step": 638 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 409.6964416503906, "epoch": 0.014434806180536731, "grad_norm": 2.639513649963715, "kl": 0.052978515625, "learning_rate": 9.994875787701812e-07, "loss": 0.0021, "reward": 2.134885311126709, "reward_std": 0.05441998690366745, "rewards/accuracy_reward": 0.72269207239151, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2657645344734192, "step": 639 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 388.8750305175781, "epoch": 0.014457395861570434, "grad_norm": 1.4822201283924343, "kl": 0.0546875, "learning_rate": 9.9948597145066e-07, "loss": 0.0022, "reward": 1.8162877559661865, "reward_std": 0.24907392263412476, "rewards/accuracy_reward": 0.535714328289032, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1770019680261612, "step": 640 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 394.5714416503906, "epoch": 0.014479985542604138, "grad_norm": 4.261975358503261, "kl": 0.0615234375, "learning_rate": 9.994843616155291e-07, "loss": 0.0025, "reward": 1.7986516952514648, "reward_std": 0.13177727162837982, "rewards/accuracy_reward": 0.4489828646183014, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2139543890953064, "step": 641 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 389.4464416503906, "epoch": 0.014502575223637843, "grad_norm": 2.3546931932343744, "kl": 0.06005859375, "learning_rate": 9.994827492647968e-07, "loss": 0.0024, "reward": 1.8542993068695068, "reward_std": 0.1676480621099472, "rewards/accuracy_reward": 0.4644239842891693, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2470180094242096, "step": 642 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 405.33929443359375, "epoch": 0.014525164904671546, "grad_norm": 1.6925698827210571, "kl": 0.04833984375, "learning_rate": 9.99481134398471e-07, "loss": 0.0019, "reward": 1.699310064315796, "reward_std": 0.3429926931858063, "rewards/accuracy_reward": 0.4036208689212799, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1706891804933548, "step": 643 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 406.8214416503906, "epoch": 0.01454775458570525, "grad_norm": 2.2682396350913834, "kl": 0.05908203125, "learning_rate": 9.994795170165604e-07, "loss": 0.0024, "reward": 1.8563954830169678, "reward_std": 0.13382121920585632, "rewards/accuracy_reward": 0.48531922698020935, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2282191812992096, "step": 644 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 399.9285888671875, "epoch": 0.014570344266738953, "grad_norm": 2.3263953560096198, "kl": 0.052978515625, "learning_rate": 9.994778971190724e-07, "loss": 0.0021, "reward": 2.0538995265960693, "reward_std": 0.08579859137535095, "rewards/accuracy_reward": 0.6732006669044495, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2521275281906128, "step": 645 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 416.6607360839844, "epoch": 0.014592933947772658, "grad_norm": 1.7336179259862776, "kl": 0.046630859375, "learning_rate": 9.994762747060156e-07, "loss": 0.0019, "reward": 2.004861354827881, "reward_std": 0.3234177231788635, "rewards/accuracy_reward": 0.592521607875824, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2730538547039032, "step": 646 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 414.6964416503906, "epoch": 0.014615523628806361, "grad_norm": 1.6297834022681756, "kl": 0.052490234375, "learning_rate": 9.994746497773981e-07, "loss": 0.0021, "reward": 1.6881992816925049, "reward_std": 0.22674816846847534, "rewards/accuracy_reward": 0.4045880436897278, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1657540500164032, "step": 647 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 391.96429443359375, "epoch": 0.014638113309840065, "grad_norm": 2.043251551611208, "kl": 0.060302734375, "learning_rate": 9.99473022333228e-07, "loss": 0.0024, "reward": 1.9505923986434937, "reward_std": 0.05163796246051788, "rewards/accuracy_reward": 0.5081465244293213, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2745884656906128, "step": 648 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 368.6250305175781, "epoch": 0.01466070299087377, "grad_norm": 2.88894013432261, "kl": 0.0615234375, "learning_rate": 9.994713923735135e-07, "loss": 0.0025, "reward": 1.713339924812317, "reward_std": 0.2434602975845337, "rewards/accuracy_reward": 0.427238404750824, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1825300008058548, "step": 649 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 369.71429443359375, "epoch": 0.014683292671907473, "grad_norm": 2.012750903969938, "kl": 0.061767578125, "learning_rate": 9.994697598982628e-07, "loss": 0.0025, "reward": 1.5667906999588013, "reward_std": 0.09292737394571304, "rewards/accuracy_reward": 0.3369426429271698, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.147705078125, "step": 650 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 408.3214416503906, "epoch": 0.014705882352941176, "grad_norm": 2.9245973703912114, "kl": 0.055419921875, "learning_rate": 9.994681249074843e-07, "loss": 0.0022, "reward": 1.8654121160507202, "reward_std": 0.13549993932247162, "rewards/accuracy_reward": 0.583392858505249, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1677333414554596, "step": 651 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 388.4107360839844, "epoch": 0.01472847203397488, "grad_norm": 5.133626057798047, "kl": 0.055908203125, "learning_rate": 9.994664874011861e-07, "loss": 0.0022, "reward": 2.138643503189087, "reward_std": 0.2736800014972687, "rewards/accuracy_reward": 0.7064830660820007, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2678745985031128, "step": 652 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 383.9464416503906, "epoch": 0.014751061715008585, "grad_norm": 7.1379161431050315, "kl": 0.06494140625, "learning_rate": 9.994648473793765e-07, "loss": 0.0026, "reward": 1.7908110618591309, "reward_std": 0.12576644122600555, "rewards/accuracy_reward": 0.40985745191574097, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2452392727136612, "step": 653 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 374.6964416503906, "epoch": 0.014773651396042288, "grad_norm": 5.068929373571104, "kl": 0.06494140625, "learning_rate": 9.994632048420637e-07, "loss": 0.0026, "reward": 1.971433162689209, "reward_std": 0.04679303243756294, "rewards/accuracy_reward": 0.548372209072113, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2623465657234192, "step": 654 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 379.08929443359375, "epoch": 0.014796241077075991, "grad_norm": 129.4882823963524, "kl": 0.0546875, "learning_rate": 9.99461559789256e-07, "loss": 0.0022, "reward": 2.0060203075408936, "reward_std": 0.15949063003063202, "rewards/accuracy_reward": 0.5932655334472656, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2698974609375, "step": 655 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 432.5714416503906, "epoch": 0.014818830758109695, "grad_norm": 2.122245705248393, "kl": 0.043701171875, "learning_rate": 9.994599122209616e-07, "loss": 0.0017, "reward": 2.0843331813812256, "reward_std": 0.29379212856292725, "rewards/accuracy_reward": 0.7187255024909973, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.2834647297859192, "step": 656 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 430.5535888671875, "epoch": 0.0148414204391434, "grad_norm": 18.78601590767059, "kl": 0.0419921875, "learning_rate": 9.99458262137189e-07, "loss": 0.0017, "reward": 1.664739727973938, "reward_std": 0.15833811461925507, "rewards/accuracy_reward": 0.3508654832839966, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2031598836183548, "step": 657 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 398.7857360839844, "epoch": 0.014864010120177103, "grad_norm": 4.033217897792626, "kl": 0.05224609375, "learning_rate": 9.994566095379463e-07, "loss": 0.0021, "reward": 1.576066493988037, "reward_std": 0.03275704383850098, "rewards/accuracy_reward": 0.2557365894317627, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1774728000164032, "step": 658 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 381.2321472167969, "epoch": 0.014886599801210806, "grad_norm": 2.234966064548012, "kl": 0.0673828125, "learning_rate": 9.99454954423242e-07, "loss": 0.0027, "reward": 2.0533523559570312, "reward_std": 0.1525462567806244, "rewards/accuracy_reward": 0.6025325059890747, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2686767578125, "step": 659 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 380.7500305175781, "epoch": 0.014909189482244511, "grad_norm": 1.9556252821486184, "kl": 0.05810546875, "learning_rate": 9.994532967930843e-07, "loss": 0.0023, "reward": 1.7909865379333496, "reward_std": 0.11760066449642181, "rewards/accuracy_reward": 0.4509612023830414, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2221679836511612, "step": 660 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 373.6607360839844, "epoch": 0.014931779163278215, "grad_norm": 6.659088701282539, "kl": 0.053955078125, "learning_rate": 9.994516366474815e-07, "loss": 0.0022, "reward": 1.7633719444274902, "reward_std": 0.27181538939476013, "rewards/accuracy_reward": 0.5178571939468384, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1526576578617096, "step": 661 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 366.1785888671875, "epoch": 0.014954368844311918, "grad_norm": 2.5055647948646707, "kl": 0.06298828125, "learning_rate": 9.99449973986442e-07, "loss": 0.0025, "reward": 1.8152128458023071, "reward_std": 0.1592484563589096, "rewards/accuracy_reward": 0.49249032139778137, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2012939602136612, "step": 662 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 356.3035888671875, "epoch": 0.014976958525345621, "grad_norm": 5.616978192280321, "kl": 0.06298828125, "learning_rate": 9.994483088099746e-07, "loss": 0.0025, "reward": 1.8693113327026367, "reward_std": 0.06400622427463531, "rewards/accuracy_reward": 0.4872032105922699, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.267822265625, "step": 663 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 400.3750305175781, "epoch": 0.014999548206379327, "grad_norm": 1.3264159402471893, "kl": 0.05859375, "learning_rate": 9.99446641118087e-07, "loss": 0.0023, "reward": 1.8044986724853516, "reward_std": 0.17328926920890808, "rewards/accuracy_reward": 0.4794706106185913, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.207170769572258, "step": 664 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 383.4107360839844, "epoch": 0.01502213788741303, "grad_norm": 2.2156162955603462, "kl": 0.0654296875, "learning_rate": 9.99444970910788e-07, "loss": 0.0026, "reward": 1.855481743812561, "reward_std": 0.22348837554454803, "rewards/accuracy_reward": 0.4430932402610779, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.26953125, "step": 665 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 384.14288330078125, "epoch": 0.015044727568446733, "grad_norm": 1.992347332579429, "kl": 0.050048828125, "learning_rate": 9.99443298188086e-07, "loss": 0.002, "reward": 1.6193711757659912, "reward_std": 0.21804696321487427, "rewards/accuracy_reward": 0.31461378931999207, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.215471550822258, "step": 666 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 383.89288330078125, "epoch": 0.015067317249480437, "grad_norm": 2.368062490111717, "kl": 0.062255859375, "learning_rate": 9.994416229499893e-07, "loss": 0.0025, "reward": 1.814458966255188, "reward_std": 0.04649811238050461, "rewards/accuracy_reward": 0.43894967436790466, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.236223503947258, "step": 667 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 362.51788330078125, "epoch": 0.015089906930514142, "grad_norm": 2.3393253555543576, "kl": 0.059326171875, "learning_rate": 9.994399451965064e-07, "loss": 0.0024, "reward": 1.8433688879013062, "reward_std": 0.14810387790203094, "rewards/accuracy_reward": 0.5175526142120361, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2115304172039032, "step": 668 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 370.6071472167969, "epoch": 0.015112496611547845, "grad_norm": 4.42900879911752, "kl": 0.0537109375, "learning_rate": 9.994382649276455e-07, "loss": 0.0021, "reward": 2.242582082748413, "reward_std": 0.16596846282482147, "rewards/accuracy_reward": 0.7880880832672119, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2937796711921692, "step": 669 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 387.39288330078125, "epoch": 0.015135086292581548, "grad_norm": 2.120170661869064, "kl": 0.05029296875, "learning_rate": 9.994365821434156e-07, "loss": 0.002, "reward": 1.819413661956787, "reward_std": 0.03498688340187073, "rewards/accuracy_reward": 0.44031184911727905, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2398158609867096, "step": 670 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 375.4285888671875, "epoch": 0.015157675973615253, "grad_norm": 11.758995870920847, "kl": 0.05908203125, "learning_rate": 9.994348968438249e-07, "loss": 0.0024, "reward": 1.6108609437942505, "reward_std": 0.269483745098114, "rewards/accuracy_reward": 0.3079172670841217, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1958007961511612, "step": 671 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 352.0535888671875, "epoch": 0.015180265654648957, "grad_norm": 2.4404246202169637, "kl": 0.064453125, "learning_rate": 9.994332090288817e-07, "loss": 0.0026, "reward": 1.7120699882507324, "reward_std": 0.4854809641838074, "rewards/accuracy_reward": 0.4423469603061676, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.180437371134758, "step": 672 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 353.4107360839844, "epoch": 0.01520285533568266, "grad_norm": 2.191565028646548, "kl": 0.057861328125, "learning_rate": 9.994315186985945e-07, "loss": 0.0023, "reward": 1.9096858501434326, "reward_std": 0.2704371511936188, "rewards/accuracy_reward": 0.5312049388885498, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2463379055261612, "step": 673 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 408.5535888671875, "epoch": 0.015225445016716363, "grad_norm": 5.797912183782368, "kl": 0.03955078125, "learning_rate": 9.994298258529723e-07, "loss": 0.0016, "reward": 1.9086822271347046, "reward_std": 0.14032088220119476, "rewards/accuracy_reward": 0.5793713927268982, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2400251179933548, "step": 674 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 347.3214416503906, "epoch": 0.015248034697750068, "grad_norm": 1.7102316318342563, "kl": 0.058349609375, "learning_rate": 9.994281304920233e-07, "loss": 0.0023, "reward": 1.4242595434188843, "reward_std": 0.10392700135707855, "rewards/accuracy_reward": 0.2074311077594757, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.123971126973629, "step": 675 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 346.14288330078125, "epoch": 0.015270624378783772, "grad_norm": 3.599823666932054, "kl": 0.059326171875, "learning_rate": 9.99426432615756e-07, "loss": 0.0024, "reward": 1.972036361694336, "reward_std": 0.1574251353740692, "rewards/accuracy_reward": 0.6135298013687134, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.237077996134758, "step": 676 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 358.6071472167969, "epoch": 0.015293214059817475, "grad_norm": 1.5925515610551786, "kl": 0.04736328125, "learning_rate": 9.99424732224179e-07, "loss": 0.0019, "reward": 1.9280105829238892, "reward_std": 0.2060222029685974, "rewards/accuracy_reward": 0.5978809595108032, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2087009996175766, "step": 677 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 333.8214416503906, "epoch": 0.015315803740851178, "grad_norm": 4.883545842393297, "kl": 0.05810546875, "learning_rate": 9.994230293173008e-07, "loss": 0.0023, "reward": 1.8259303569793701, "reward_std": 0.30712100863456726, "rewards/accuracy_reward": 0.5333208441734314, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.210466668009758, "step": 678 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 355.6964416503906, "epoch": 0.015338393421884883, "grad_norm": 2.4757886211059557, "kl": 0.0537109375, "learning_rate": 9.994213238951299e-07, "loss": 0.0021, "reward": 1.7228269577026367, "reward_std": 0.09951711446046829, "rewards/accuracy_reward": 0.4271375238895416, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1885463297367096, "step": 679 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 404.4464416503906, "epoch": 0.015360983102918587, "grad_norm": 2.3707198763079185, "kl": 0.048583984375, "learning_rate": 9.994196159576752e-07, "loss": 0.0019, "reward": 1.6540571451187134, "reward_std": 0.2852063477039337, "rewards/accuracy_reward": 0.44012537598609924, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1817888617515564, "step": 680 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 356.1607360839844, "epoch": 0.01538357278395229, "grad_norm": 1.6376073988934492, "kl": 0.05859375, "learning_rate": 9.994179055049449e-07, "loss": 0.0023, "reward": 1.8505699634552002, "reward_std": 0.19538621604442596, "rewards/accuracy_reward": 0.4901311993598938, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2354387640953064, "step": 681 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 445.83929443359375, "epoch": 0.015406162464985995, "grad_norm": 1.2519831971313728, "kl": 0.040283203125, "learning_rate": 9.994161925369482e-07, "loss": 0.0016, "reward": 1.2905759811401367, "reward_std": 0.3469246029853821, "rewards/accuracy_reward": 0.24753575026988983, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.0894688218832016, "step": 682 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 385.4107360839844, "epoch": 0.015428752146019699, "grad_norm": 10.160839004539998, "kl": 0.053466796875, "learning_rate": 9.99414477053693e-07, "loss": 0.0021, "reward": 1.900525689125061, "reward_std": 0.16208553314208984, "rewards/accuracy_reward": 0.5077993273735046, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2498692274093628, "step": 683 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 405.1964416503906, "epoch": 0.015451341827053402, "grad_norm": 3.837559714788746, "kl": 0.058349609375, "learning_rate": 9.994127590551886e-07, "loss": 0.0023, "reward": 1.8953403234481812, "reward_std": 0.24963177740573883, "rewards/accuracy_reward": 0.4915280342102051, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2466692328453064, "step": 684 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 376.26788330078125, "epoch": 0.015473931508087105, "grad_norm": 2.834930967979947, "kl": 0.0546875, "learning_rate": 9.99411038541443e-07, "loss": 0.0022, "reward": 2.196718692779541, "reward_std": 0.14558182656764984, "rewards/accuracy_reward": 0.7152869701385498, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.306431382894516, "step": 685 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 382.8571472167969, "epoch": 0.01549652118912081, "grad_norm": 3.625248270445562, "kl": 0.052490234375, "learning_rate": 9.994093155124657e-07, "loss": 0.0021, "reward": 2.152752161026001, "reward_std": 0.23597288131713867, "rewards/accuracy_reward": 0.7176202535629272, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.277989000082016, "step": 686 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 376.01788330078125, "epoch": 0.015519110870154514, "grad_norm": 2.0529128628878226, "kl": 0.0498046875, "learning_rate": 9.994075899682646e-07, "loss": 0.002, "reward": 2.2149720191955566, "reward_std": 0.05405137687921524, "rewards/accuracy_reward": 0.7539471387863159, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3003104329109192, "step": 687 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 364.2857360839844, "epoch": 0.015541700551188217, "grad_norm": 2.7159829897803482, "kl": 0.0712890625, "learning_rate": 9.994058619088489e-07, "loss": 0.0028, "reward": 1.5383241176605225, "reward_std": 0.10809677839279175, "rewards/accuracy_reward": 0.22576119005680084, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1982770711183548, "step": 688 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 363.89288330078125, "epoch": 0.01556429023222192, "grad_norm": 2.6327136443399994, "kl": 0.056640625, "learning_rate": 9.994041313342269e-07, "loss": 0.0023, "reward": 1.5502392053604126, "reward_std": 0.1882409304380417, "rewards/accuracy_reward": 0.31941115856170654, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1343994140625, "step": 689 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 359.6785888671875, "epoch": 0.015586879913255625, "grad_norm": 1.9239980605623277, "kl": 0.059326171875, "learning_rate": 9.994023982444075e-07, "loss": 0.0024, "reward": 1.8199267387390137, "reward_std": 0.14591531455516815, "rewards/accuracy_reward": 0.4606579840183258, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2235543429851532, "step": 690 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 380.6785888671875, "epoch": 0.015609469594289329, "grad_norm": 2.6968829431488848, "kl": 0.057373046875, "learning_rate": 9.994006626393995e-07, "loss": 0.0023, "reward": 1.7220231294631958, "reward_std": 0.16718684136867523, "rewards/accuracy_reward": 0.4097113311290741, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1908831000328064, "step": 691 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 382.83929443359375, "epoch": 0.015632059275323032, "grad_norm": 3.2197084613154106, "kl": 0.056884765625, "learning_rate": 9.993989245192115e-07, "loss": 0.0023, "reward": 1.8784536123275757, "reward_std": 0.23566025495529175, "rewards/accuracy_reward": 0.5294615030288696, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2347063422203064, "step": 692 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 373.89288330078125, "epoch": 0.015654648956356737, "grad_norm": 3.535769245855171, "kl": 0.06494140625, "learning_rate": 9.993971838838526e-07, "loss": 0.0026, "reward": 1.7517954111099243, "reward_std": 0.2509588897228241, "rewards/accuracy_reward": 0.4096008241176605, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2493373453617096, "step": 693 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.2857142857142857, "completion_length": 378.6785888671875, "epoch": 0.01567723863739044, "grad_norm": 1.9248758582271062, "kl": 0.056884765625, "learning_rate": 9.993954407333312e-07, "loss": 0.0023, "reward": 2.0570034980773926, "reward_std": 0.03415616601705551, "rewards/accuracy_reward": 0.7142857313156128, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2141462117433548, "step": 694 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 396.2321472167969, "epoch": 0.015699828318424144, "grad_norm": 7.178575792295247, "kl": 0.0634765625, "learning_rate": 9.99393695067656e-07, "loss": 0.0025, "reward": 1.9119073152542114, "reward_std": 0.22332763671875, "rewards/accuracy_reward": 0.5584298372268677, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2249058485031128, "step": 695 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 376.1250305175781, "epoch": 0.01572241799945785, "grad_norm": 1.8456136298695613, "kl": 0.05322265625, "learning_rate": 9.993919468868362e-07, "loss": 0.0021, "reward": 1.7610254287719727, "reward_std": 0.34975987672805786, "rewards/accuracy_reward": 0.46503961086273193, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1888427883386612, "step": 696 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 390.58929443359375, "epoch": 0.01574500768049155, "grad_norm": 6.5921952542619495, "kl": 0.0615234375, "learning_rate": 9.9939019619088e-07, "loss": 0.0025, "reward": 1.8955204486846924, "reward_std": 0.1693522036075592, "rewards/accuracy_reward": 0.5066426992416382, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.242449089884758, "step": 697 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 388.8214416503906, "epoch": 0.015767597361525255, "grad_norm": 1.6324338877478786, "kl": 0.055419921875, "learning_rate": 9.99388442979797e-07, "loss": 0.0022, "reward": 1.6367249488830566, "reward_std": 0.043663717806339264, "rewards/accuracy_reward": 0.37797585129737854, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1551775336265564, "step": 698 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 395.8214416503906, "epoch": 0.01579018704255896, "grad_norm": 2.3383516832820446, "kl": 0.06103515625, "learning_rate": 9.993866872535955e-07, "loss": 0.0025, "reward": 2.110888719558716, "reward_std": 0.06732739508152008, "rewards/accuracy_reward": 0.6803082227706909, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2734375, "step": 699 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 383.1785888671875, "epoch": 0.015812776723592662, "grad_norm": 2.1371566238259927, "kl": 0.06494140625, "learning_rate": 9.993849290122845e-07, "loss": 0.0026, "reward": 1.8214915990829468, "reward_std": 0.2014079988002777, "rewards/accuracy_reward": 0.4879535138607025, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2156808078289032, "step": 700 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 395.7500305175781, "epoch": 0.015835366404626367, "grad_norm": 1.596501380197765, "kl": 0.060302734375, "learning_rate": 9.993831682558726e-07, "loss": 0.0024, "reward": 1.8287138938903809, "reward_std": 0.16254273056983948, "rewards/accuracy_reward": 0.4945063591003418, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2056361734867096, "step": 701 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 423.2500305175781, "epoch": 0.01585795608566007, "grad_norm": 1.427580814673446, "kl": 0.046875, "learning_rate": 9.993814049843692e-07, "loss": 0.0019, "reward": 1.8118817806243896, "reward_std": 0.21220164000988007, "rewards/accuracy_reward": 0.4759756028652191, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2144775539636612, "step": 702 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 393.4285888671875, "epoch": 0.015880545766693774, "grad_norm": 1.576574548903283, "kl": 0.0576171875, "learning_rate": 9.993796391977827e-07, "loss": 0.0023, "reward": 2.100782632827759, "reward_std": 0.0693078488111496, "rewards/accuracy_reward": 0.6837502121925354, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.25274658203125, "step": 703 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.7142857142857143, "completion_length": 400.71429443359375, "epoch": 0.01590313544772748, "grad_norm": 1.5957228296406873, "kl": 0.06005859375, "learning_rate": 9.993778708961222e-07, "loss": 0.0024, "reward": 1.3459258079528809, "reward_std": 0.026548968628048897, "rewards/accuracy_reward": 0.20716311037540436, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0887625589966774, "step": 704 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 377.6785888671875, "epoch": 0.01592572512876118, "grad_norm": 45.219968493642256, "kl": 0.06787109375, "learning_rate": 9.993761000793965e-07, "loss": 0.0027, "reward": 1.7915854454040527, "reward_std": 0.13839803636074066, "rewards/accuracy_reward": 0.45634883642196655, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2066650539636612, "step": 705 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 391.1785888671875, "epoch": 0.015948314809794886, "grad_norm": 6.0910775422202885, "kl": 0.0673828125, "learning_rate": 9.993743267476147e-07, "loss": 0.0027, "reward": 1.6801742315292358, "reward_std": 0.08737960457801819, "rewards/accuracy_reward": 0.3009941279888153, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.25775146484375, "step": 706 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 413.83929443359375, "epoch": 0.01597090449082859, "grad_norm": 2.560167705646486, "kl": 0.059814453125, "learning_rate": 9.993725509007855e-07, "loss": 0.0024, "reward": 1.6594187021255493, "reward_std": 0.14577384293079376, "rewards/accuracy_reward": 0.46428054571151733, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.159423828125, "step": 707 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 378.0714416503906, "epoch": 0.015993494171862292, "grad_norm": 3.1217250043548734, "kl": 0.06787109375, "learning_rate": 9.993707725389181e-07, "loss": 0.0027, "reward": 1.5493309497833252, "reward_std": 0.19882217049598694, "rewards/accuracy_reward": 0.2271488457918167, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2007533609867096, "step": 708 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 382.7857360839844, "epoch": 0.016016083852895997, "grad_norm": 4.152196338569907, "kl": 0.05615234375, "learning_rate": 9.993689916620212e-07, "loss": 0.0023, "reward": 2.185448408126831, "reward_std": 0.07462802529335022, "rewards/accuracy_reward": 0.728717029094696, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2924456000328064, "step": 709 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 376.6071472167969, "epoch": 0.016038673533929702, "grad_norm": 3.109353807043, "kl": 0.056884765625, "learning_rate": 9.99367208270104e-07, "loss": 0.0023, "reward": 1.9016344547271729, "reward_std": 0.25317177176475525, "rewards/accuracy_reward": 0.5097119212150574, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2383510172367096, "step": 710 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 369.0357360839844, "epoch": 0.016061263214963404, "grad_norm": 2.0581759163266775, "kl": 0.06689453125, "learning_rate": 9.993654223631754e-07, "loss": 0.0027, "reward": 2.2933731079101562, "reward_std": 0.18095046281814575, "rewards/accuracy_reward": 0.7924840450286865, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3401750922203064, "step": 711 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 372.1607360839844, "epoch": 0.01608385289599711, "grad_norm": 2.8594665001953503, "kl": 0.06298828125, "learning_rate": 9.993636339412442e-07, "loss": 0.0025, "reward": 1.837645173072815, "reward_std": 0.05107627063989639, "rewards/accuracy_reward": 0.46194419264793396, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2364153265953064, "step": 712 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 369.0357360839844, "epoch": 0.01610644257703081, "grad_norm": 1.3680617542943039, "kl": 0.055908203125, "learning_rate": 9.993618430043195e-07, "loss": 0.0022, "reward": 1.5328947305679321, "reward_std": 0.22019532322883606, "rewards/accuracy_reward": 0.31257522106170654, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1488909125328064, "step": 713 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 382.8571472167969, "epoch": 0.016129032258064516, "grad_norm": 4.632675899753536, "kl": 0.0576171875, "learning_rate": 9.993600495524107e-07, "loss": 0.0023, "reward": 1.8259575366973877, "reward_std": 0.2256304919719696, "rewards/accuracy_reward": 0.4207606613636017, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2551967203617096, "step": 714 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 374.0535888671875, "epoch": 0.01615162193909822, "grad_norm": 2.5758333292621085, "kl": 0.061767578125, "learning_rate": 9.993582535855263e-07, "loss": 0.0025, "reward": 1.43338143825531, "reward_std": 0.20800374448299408, "rewards/accuracy_reward": 0.15467731654644012, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1894182562828064, "step": 715 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 372.3035888671875, "epoch": 0.016174211620131922, "grad_norm": 2.895603616593869, "kl": 0.06298828125, "learning_rate": 9.993564551036758e-07, "loss": 0.0025, "reward": 1.6522088050842285, "reward_std": 0.28547555208206177, "rewards/accuracy_reward": 0.36468085646629333, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1803850531578064, "step": 716 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 396.08929443359375, "epoch": 0.016196801301165627, "grad_norm": 3.3141716158312944, "kl": 0.051025390625, "learning_rate": 9.993546541068678e-07, "loss": 0.002, "reward": 1.922620415687561, "reward_std": 0.19876284897327423, "rewards/accuracy_reward": 0.5766452550888062, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2316894680261612, "step": 717 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 364.64288330078125, "epoch": 0.016219390982199332, "grad_norm": 2.8390722900106535, "kl": 0.0556640625, "learning_rate": 9.993528505951116e-07, "loss": 0.0022, "reward": 1.8891973495483398, "reward_std": 0.21356627345085144, "rewards/accuracy_reward": 0.4858560562133789, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2640555500984192, "step": 718 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 371.83929443359375, "epoch": 0.016241980663233034, "grad_norm": 3.040496804135561, "kl": 0.059326171875, "learning_rate": 9.993510445684163e-07, "loss": 0.0024, "reward": 1.7356764078140259, "reward_std": 0.2311038374900818, "rewards/accuracy_reward": 0.39439570903778076, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2412807047367096, "step": 719 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 344.6607360839844, "epoch": 0.01626457034426674, "grad_norm": 7.443864583961341, "kl": 0.06591796875, "learning_rate": 9.99349236026791e-07, "loss": 0.0026, "reward": 1.8919243812561035, "reward_std": 0.1702839583158493, "rewards/accuracy_reward": 0.5086864233016968, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.265380859375, "step": 720 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 346.76788330078125, "epoch": 0.016287160025300444, "grad_norm": 2.130336122502797, "kl": 0.06005859375, "learning_rate": 9.993474249702449e-07, "loss": 0.0024, "reward": 1.650243878364563, "reward_std": 0.11687473952770233, "rewards/accuracy_reward": 0.42962440848350525, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1527622789144516, "step": 721 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 391.6071472167969, "epoch": 0.016309749706334146, "grad_norm": 3.021024207938561, "kl": 0.057861328125, "learning_rate": 9.99345611398787e-07, "loss": 0.0023, "reward": 1.9449743032455444, "reward_std": 0.18169409036636353, "rewards/accuracy_reward": 0.442138671875, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3171212375164032, "step": 722 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 353.6071472167969, "epoch": 0.01633233938736785, "grad_norm": 3.362180782361869, "kl": 0.060546875, "learning_rate": 9.993437953124263e-07, "loss": 0.0024, "reward": 1.93583083152771, "reward_std": 0.1985725313425064, "rewards/accuracy_reward": 0.5710184574127197, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2362409383058548, "step": 723 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 390.7857360839844, "epoch": 0.016354929068401552, "grad_norm": 2.8297762793566332, "kl": 0.05078125, "learning_rate": 9.993419767111722e-07, "loss": 0.002, "reward": 1.992864966392517, "reward_std": 0.24261434376239777, "rewards/accuracy_reward": 0.5688273906707764, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2811802625656128, "step": 724 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 369.76788330078125, "epoch": 0.016377518749435258, "grad_norm": 2.3648950406025246, "kl": 0.05615234375, "learning_rate": 9.993401555950336e-07, "loss": 0.0022, "reward": 1.99856436252594, "reward_std": 0.17167983949184418, "rewards/accuracy_reward": 0.585021436214447, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2742571234703064, "step": 725 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 367.5714416503906, "epoch": 0.016400108430468963, "grad_norm": 11.31926108926172, "kl": 0.06787109375, "learning_rate": 9.9933833196402e-07, "loss": 0.0027, "reward": 2.035024642944336, "reward_std": 0.057469889521598816, "rewards/accuracy_reward": 0.6246554851531982, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2532261610031128, "step": 726 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 365.7500305175781, "epoch": 0.016422698111502664, "grad_norm": 2.69344542848617, "kl": 0.061279296875, "learning_rate": 9.993365058181403e-07, "loss": 0.0024, "reward": 1.6519683599472046, "reward_std": 0.04223232343792915, "rewards/accuracy_reward": 0.36498796939849854, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1905517727136612, "step": 727 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 373.08929443359375, "epoch": 0.01644528779253637, "grad_norm": 3.2896533464971838, "kl": 0.061279296875, "learning_rate": 9.993346771574037e-07, "loss": 0.0024, "reward": 1.9167611598968506, "reward_std": 0.06762173771858215, "rewards/accuracy_reward": 0.5110376477241516, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2771519422531128, "step": 728 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 470.9464416503906, "epoch": 0.016467877473570074, "grad_norm": 1.537705257243503, "kl": 0.046142578125, "learning_rate": 9.993328459818198e-07, "loss": 0.0018, "reward": 1.4660993814468384, "reward_std": 0.38279786705970764, "rewards/accuracy_reward": 0.4107142984867096, "rewards/format_reward": 0.8392857313156128, "rewards/semantic_reward": 0.144670769572258, "step": 729 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 418.21429443359375, "epoch": 0.016490467154603776, "grad_norm": 3.2815023569335793, "kl": 0.05126953125, "learning_rate": 9.993310122913973e-07, "loss": 0.0021, "reward": 1.8331950902938843, "reward_std": 0.11518175899982452, "rewards/accuracy_reward": 0.5059838891029358, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1950683742761612, "step": 730 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 411.6071472167969, "epoch": 0.01651305683563748, "grad_norm": 3.1966050348112365, "kl": 0.053955078125, "learning_rate": 9.993291760861457e-07, "loss": 0.0022, "reward": 1.6566070318222046, "reward_std": 0.19884710013866425, "rewards/accuracy_reward": 0.32607194781303406, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2091064602136612, "step": 731 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 380.76788330078125, "epoch": 0.016535646516671186, "grad_norm": 1.671408437685882, "kl": 0.06396484375, "learning_rate": 9.993273373660742e-07, "loss": 0.0026, "reward": 2.0139644145965576, "reward_std": 0.054207950830459595, "rewards/accuracy_reward": 0.5834850072860718, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2590506672859192, "step": 732 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 456.1964416503906, "epoch": 0.016558236197704888, "grad_norm": 29.83738675536018, "kl": 0.047119140625, "learning_rate": 9.993254961311921e-07, "loss": 0.0019, "reward": 1.616146206855774, "reward_std": 0.3787919580936432, "rewards/accuracy_reward": 0.3470786511898041, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1869245320558548, "step": 733 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 374.6964416503906, "epoch": 0.016580825878738593, "grad_norm": 1.7948852558172863, "kl": 0.0703125, "learning_rate": 9.993236523815087e-07, "loss": 0.0028, "reward": 1.7374616861343384, "reward_std": 0.12091172486543655, "rewards/accuracy_reward": 0.4536340832710266, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1909702867269516, "step": 734 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 406.9285888671875, "epoch": 0.016603415559772294, "grad_norm": 2.4505558231933158, "kl": 0.0595703125, "learning_rate": 9.993218061170333e-07, "loss": 0.0024, "reward": 1.7804681062698364, "reward_std": 0.08951373398303986, "rewards/accuracy_reward": 0.491220623254776, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1749616414308548, "step": 735 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 391.9107360839844, "epoch": 0.016626005240806, "grad_norm": 6.923521254037194, "kl": 0.07080078125, "learning_rate": 9.993199573377751e-07, "loss": 0.0028, "reward": 1.995051383972168, "reward_std": 0.06279110163450241, "rewards/accuracy_reward": 0.5849684476852417, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2457973062992096, "step": 736 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 407.01788330078125, "epoch": 0.016648594921839704, "grad_norm": 4.030694880656822, "kl": 0.064453125, "learning_rate": 9.993181060437433e-07, "loss": 0.0026, "reward": 2.0681262016296387, "reward_std": 0.18119379878044128, "rewards/accuracy_reward": 0.6902313828468323, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2350376844406128, "step": 737 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 404.1071472167969, "epoch": 0.016671184602873406, "grad_norm": 2.8552402540578985, "kl": 0.059814453125, "learning_rate": 9.993162522349474e-07, "loss": 0.0024, "reward": 2.124199390411377, "reward_std": 0.2681407034397125, "rewards/accuracy_reward": 0.7279761433601379, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2533656656742096, "step": 738 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 389.5357360839844, "epoch": 0.01669377428390711, "grad_norm": 1.754191219485829, "kl": 0.061279296875, "learning_rate": 9.993143959113968e-07, "loss": 0.0025, "reward": 1.835487723350525, "reward_std": 0.23751051723957062, "rewards/accuracy_reward": 0.5535714626312256, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1783447414636612, "step": 739 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 392.3214416503906, "epoch": 0.016716363964940816, "grad_norm": 4.875065287357546, "kl": 0.06298828125, "learning_rate": 9.993125370731008e-07, "loss": 0.0025, "reward": 2.182685613632202, "reward_std": 0.06565657258033752, "rewards/accuracy_reward": 0.6834492087364197, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3063790500164032, "step": 740 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 404.5714416503906, "epoch": 0.016738953645974518, "grad_norm": 1.8534701908455362, "kl": 0.0576171875, "learning_rate": 9.993106757200686e-07, "loss": 0.0023, "reward": 1.8956342935562134, "reward_std": 0.1523515284061432, "rewards/accuracy_reward": 0.5601048469543457, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1855294406414032, "step": 741 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 388.9107360839844, "epoch": 0.016761543327008223, "grad_norm": 2.8688026681928873, "kl": 0.06396484375, "learning_rate": 9.993088118523097e-07, "loss": 0.0026, "reward": 2.185018301010132, "reward_std": 0.0847848504781723, "rewards/accuracy_reward": 0.6404625773429871, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3481270968914032, "step": 742 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 409.7500305175781, "epoch": 0.016784133008041928, "grad_norm": 1.9204066886283147, "kl": 0.0634765625, "learning_rate": 9.993069454698335e-07, "loss": 0.0025, "reward": 1.7971937656402588, "reward_std": 0.1926516592502594, "rewards/accuracy_reward": 0.4756360650062561, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2215576320886612, "step": 743 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 389.1607360839844, "epoch": 0.01680672268907563, "grad_norm": 2.6864470276589385, "kl": 0.053466796875, "learning_rate": 9.993050765726494e-07, "loss": 0.0021, "reward": 2.11737060546875, "reward_std": 0.14532461762428284, "rewards/accuracy_reward": 0.6793788075447083, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2629917860031128, "step": 744 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 386.2857360839844, "epoch": 0.016829312370109335, "grad_norm": 2.049234000141354, "kl": 0.0634765625, "learning_rate": 9.993032051607668e-07, "loss": 0.0025, "reward": 1.9041424989700317, "reward_std": 0.29852452874183655, "rewards/accuracy_reward": 0.5025345087051392, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2551792860031128, "step": 745 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 376.4821472167969, "epoch": 0.016851902051143036, "grad_norm": 2.786743015487085, "kl": 0.0693359375, "learning_rate": 9.99301331234195e-07, "loss": 0.0028, "reward": 2.0274317264556885, "reward_std": 0.1286712884902954, "rewards/accuracy_reward": 0.5692424178123474, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3046177625656128, "step": 746 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 382.2321472167969, "epoch": 0.01687449173217674, "grad_norm": 2.721946703860068, "kl": 0.05859375, "learning_rate": 9.992994547929436e-07, "loss": 0.0023, "reward": 2.2042181491851807, "reward_std": 0.17578166723251343, "rewards/accuracy_reward": 0.7252019643783569, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.30401611328125, "step": 747 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 391.2500305175781, "epoch": 0.016897081413210446, "grad_norm": 3.4185523423876045, "kl": 0.06494140625, "learning_rate": 9.992975758370222e-07, "loss": 0.0026, "reward": 2.034255266189575, "reward_std": 0.27195802330970764, "rewards/accuracy_reward": 0.5933023691177368, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.298095703125, "step": 748 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.5714285714285714, "completion_length": 358.4821472167969, "epoch": 0.016919671094244148, "grad_norm": 9.04662565289672, "kl": 0.06591796875, "learning_rate": 9.9929569436644e-07, "loss": 0.0026, "reward": 1.4923551082611084, "reward_std": 0.025646569207310677, "rewards/accuracy_reward": 0.2842671573162079, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1295166015625, "step": 749 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 385.01788330078125, "epoch": 0.016942260775277853, "grad_norm": 3.1904230893709946, "kl": 0.058349609375, "learning_rate": 9.992938103812062e-07, "loss": 0.0023, "reward": 1.8616129159927368, "reward_std": 0.21401618421077728, "rewards/accuracy_reward": 0.520321786403656, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1984340250492096, "step": 750 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 396.58929443359375, "epoch": 0.016964850456311558, "grad_norm": 2.2905922397194742, "kl": 0.057373046875, "learning_rate": 9.992919238813311e-07, "loss": 0.0023, "reward": 1.6440176963806152, "reward_std": 0.3470071852207184, "rewards/accuracy_reward": 0.400926798582077, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1680908352136612, "step": 751 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 377.58929443359375, "epoch": 0.01698744013734526, "grad_norm": 4.478593162422996, "kl": 0.062255859375, "learning_rate": 9.992900348668234e-07, "loss": 0.0025, "reward": 1.8089097738265991, "reward_std": 0.3959093987941742, "rewards/accuracy_reward": 0.4690520763397217, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.229143425822258, "step": 752 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 367.33929443359375, "epoch": 0.017010029818378965, "grad_norm": 2.0561379299342226, "kl": 0.0673828125, "learning_rate": 9.992881433376932e-07, "loss": 0.0027, "reward": 2.124803066253662, "reward_std": 0.12412421405315399, "rewards/accuracy_reward": 0.7164079546928406, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.265537828207016, "step": 753 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 361.71429443359375, "epoch": 0.01703261949941267, "grad_norm": 1.947216214693018, "kl": 0.056640625, "learning_rate": 9.992862492939495e-07, "loss": 0.0023, "reward": 2.0245883464813232, "reward_std": 0.14415782690048218, "rewards/accuracy_reward": 0.5609650015830994, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3207659125328064, "step": 754 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 370.1250305175781, "epoch": 0.01705520918044637, "grad_norm": 10.435973889418273, "kl": 0.06396484375, "learning_rate": 9.992843527356023e-07, "loss": 0.0026, "reward": 1.5396332740783691, "reward_std": 0.18326596915721893, "rewards/accuracy_reward": 0.29048070311546326, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1491524875164032, "step": 755 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 350.0714416503906, "epoch": 0.017077798861480076, "grad_norm": 3.519738627089877, "kl": 0.06591796875, "learning_rate": 9.992824536626609e-07, "loss": 0.0026, "reward": 2.1423797607421875, "reward_std": 0.14427495002746582, "rewards/accuracy_reward": 0.7288891673088074, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.274204820394516, "step": 756 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 366.0535888671875, "epoch": 0.017100388542513778, "grad_norm": 1.7354230144658627, "kl": 0.0634765625, "learning_rate": 9.99280552075135e-07, "loss": 0.0025, "reward": 1.9051284790039062, "reward_std": 0.17081253230571747, "rewards/accuracy_reward": 0.48365792632102966, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.3107561469078064, "step": 757 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 361.2500305175781, "epoch": 0.017122978223547483, "grad_norm": 21.86412073612209, "kl": 0.068359375, "learning_rate": 9.99278647973034e-07, "loss": 0.0027, "reward": 1.9583055973052979, "reward_std": 0.24657027423381805, "rewards/accuracy_reward": 0.5493699312210083, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2660784125328064, "step": 758 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 391.76788330078125, "epoch": 0.017145567904581188, "grad_norm": 2.015520245655613, "kl": 0.0654296875, "learning_rate": 9.992767413563676e-07, "loss": 0.0026, "reward": 1.723549246788025, "reward_std": 0.3431054651737213, "rewards/accuracy_reward": 0.47860991954803467, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1877964586019516, "step": 759 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 373.9821472167969, "epoch": 0.01716815758561489, "grad_norm": 2.9149283492066718, "kl": 0.08056640625, "learning_rate": 9.992748322251455e-07, "loss": 0.0032, "reward": 2.103523015975952, "reward_std": 0.09153170883655548, "rewards/accuracy_reward": 0.6395267248153687, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2925676703453064, "step": 760 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 402.76788330078125, "epoch": 0.017190747266648595, "grad_norm": 2.493944469510984, "kl": 0.072265625, "learning_rate": 9.992729205793772e-07, "loss": 0.0029, "reward": 1.7186992168426514, "reward_std": 0.3501598536968231, "rewards/accuracy_reward": 0.4726681709289551, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.199602410197258, "step": 761 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 368.39288330078125, "epoch": 0.0172133369476823, "grad_norm": 2.020007054568655, "kl": 0.07421875, "learning_rate": 9.992710064190723e-07, "loss": 0.003, "reward": 1.8963063955307007, "reward_std": 0.1410101056098938, "rewards/accuracy_reward": 0.49858370423316956, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2548653781414032, "step": 762 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 391.83929443359375, "epoch": 0.017235926628716, "grad_norm": 2.34563338945435, "kl": 0.06884765625, "learning_rate": 9.992690897442405e-07, "loss": 0.0028, "reward": 1.8948681354522705, "reward_std": 0.19372157752513885, "rewards/accuracy_reward": 0.4917779564857483, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2566615641117096, "step": 763 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 391.6964416503906, "epoch": 0.017258516309749707, "grad_norm": 7.557731512191279, "kl": 0.0869140625, "learning_rate": 9.992671705548913e-07, "loss": 0.0035, "reward": 1.4786440134048462, "reward_std": 0.284773051738739, "rewards/accuracy_reward": 0.29140549898147583, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1336669921875, "step": 764 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 400.64288330078125, "epoch": 0.01728110599078341, "grad_norm": 878.1095824703011, "kl": 8.0, "learning_rate": 9.992652488510346e-07, "loss": 0.3208, "reward": 1.7539689540863037, "reward_std": 0.25734221935272217, "rewards/accuracy_reward": 0.494597464799881, "rewards/format_reward": 0.9285714626312256, "rewards/semantic_reward": 0.2129429429769516, "step": 765 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 421.2321472167969, "epoch": 0.017303695671817113, "grad_norm": 2.162009413759747, "kl": 0.06494140625, "learning_rate": 9.9926332463268e-07, "loss": 0.0026, "reward": 1.9079152345657349, "reward_std": 0.13514506816864014, "rewards/accuracy_reward": 0.4544169008731842, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2999267578125, "step": 766 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 398.6071472167969, "epoch": 0.01732628535285082, "grad_norm": 1.8210704687998627, "kl": 0.07470703125, "learning_rate": 9.992613978998374e-07, "loss": 0.003, "reward": 1.6801027059555054, "reward_std": 0.11410413682460785, "rewards/accuracy_reward": 0.44249099493026733, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1411830484867096, "step": 767 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 405.2857360839844, "epoch": 0.01734887503388452, "grad_norm": 3.6596892338961675, "kl": 0.0703125, "learning_rate": 9.99259468652516e-07, "loss": 0.0028, "reward": 2.0652377605438232, "reward_std": 0.17529137432575226, "rewards/accuracy_reward": 0.6041675806045532, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.293212890625, "step": 768 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 404.96429443359375, "epoch": 0.017371464714918225, "grad_norm": 2.306695256329794, "kl": 0.061767578125, "learning_rate": 9.992575368907257e-07, "loss": 0.0025, "reward": 2.2206883430480957, "reward_std": 0.2661028802394867, "rewards/accuracy_reward": 0.8040865063667297, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2666015625, "step": 769 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 405.96429443359375, "epoch": 0.01739405439595193, "grad_norm": 1.667701304009779, "kl": 0.064453125, "learning_rate": 9.992556026144764e-07, "loss": 0.0026, "reward": 2.066732168197632, "reward_std": 0.15664175152778625, "rewards/accuracy_reward": 0.6383227705955505, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.28912353515625, "step": 770 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 388.1071472167969, "epoch": 0.01741664407698563, "grad_norm": 6.46473251531966, "kl": 0.060546875, "learning_rate": 9.99253665823778e-07, "loss": 0.0024, "reward": 2.080148935317993, "reward_std": 0.17766539752483368, "rewards/accuracy_reward": 0.6465551257133484, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2728794813156128, "step": 771 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 372.4285888671875, "epoch": 0.017439233758019337, "grad_norm": 7.9233087490122776, "kl": 0.06298828125, "learning_rate": 9.992517265186397e-07, "loss": 0.0025, "reward": 1.7583450078964233, "reward_std": 0.3281227946281433, "rewards/accuracy_reward": 0.47457677125930786, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1837681382894516, "step": 772 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 393.6964416503906, "epoch": 0.017461823439053042, "grad_norm": 2.624923478753359, "kl": 0.06005859375, "learning_rate": 9.992497846990718e-07, "loss": 0.0024, "reward": 1.5508625507354736, "reward_std": 0.2737288475036621, "rewards/accuracy_reward": 0.2991291284561157, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1695905476808548, "step": 773 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 372.0357360839844, "epoch": 0.017484413120086743, "grad_norm": 14.356562643415966, "kl": 0.0576171875, "learning_rate": 9.992478403650837e-07, "loss": 0.0023, "reward": 2.1726386547088623, "reward_std": 0.08480482548475266, "rewards/accuracy_reward": 0.6931707262992859, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3116106390953064, "step": 774 }, { "all_correct": 0.8571428571428571, "all_wrong": 0.0, "completion_length": 361.2857360839844, "epoch": 0.01750700280112045, "grad_norm": 1.781377091207814, "kl": 0.06396484375, "learning_rate": 9.992458935166855e-07, "loss": 0.0026, "reward": 2.4325859546661377, "reward_std": 0.12184496968984604, "rewards/accuracy_reward": 0.9821429252624512, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2754429578781128, "step": 775 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 379.1071472167969, "epoch": 0.017529592482154153, "grad_norm": 3.6374602404913423, "kl": 0.06298828125, "learning_rate": 9.992439441538869e-07, "loss": 0.0025, "reward": 1.9923725128173828, "reward_std": 0.43153151869773865, "rewards/accuracy_reward": 0.6481794714927673, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.226335808634758, "step": 776 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 383.6785888671875, "epoch": 0.017552182163187855, "grad_norm": 3.8153205877489733, "kl": 0.060546875, "learning_rate": 9.992419922766975e-07, "loss": 0.0024, "reward": 2.1024279594421387, "reward_std": 0.17761914432048798, "rewards/accuracy_reward": 0.6367025375366211, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2835824191570282, "step": 777 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 375.7857360839844, "epoch": 0.01757477184422156, "grad_norm": 1.5123183867361651, "kl": 0.06494140625, "learning_rate": 9.992400378851274e-07, "loss": 0.0026, "reward": 1.5676798820495605, "reward_std": 0.24392731487751007, "rewards/accuracy_reward": 0.30049222707748413, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2064732164144516, "step": 778 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 406.71429443359375, "epoch": 0.017597361525255262, "grad_norm": 2.0798127654248715, "kl": 0.06298828125, "learning_rate": 9.992380809791864e-07, "loss": 0.0025, "reward": 1.8256112337112427, "reward_std": 0.2518070340156555, "rewards/accuracy_reward": 0.49713385105133057, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1891915500164032, "step": 779 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 413.1250305175781, "epoch": 0.017619951206288967, "grad_norm": 5.179340551512949, "kl": 0.0625, "learning_rate": 9.99236121558884e-07, "loss": 0.0025, "reward": 2.001278877258301, "reward_std": 0.28871768712997437, "rewards/accuracy_reward": 0.6414190530776978, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.267002671957016, "step": 780 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 417.7857360839844, "epoch": 0.017642540887322672, "grad_norm": 10.140986095492243, "kl": 0.0634765625, "learning_rate": 9.992341596242307e-07, "loss": 0.0025, "reward": 1.8814204931259155, "reward_std": 0.2996964454650879, "rewards/accuracy_reward": 0.5229836702346802, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2727225422859192, "step": 781 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 416.96429443359375, "epoch": 0.017665130568356373, "grad_norm": 1.8245050606147042, "kl": 0.0712890625, "learning_rate": 9.99232195175236e-07, "loss": 0.0029, "reward": 1.930903673171997, "reward_std": 0.38720574975013733, "rewards/accuracy_reward": 0.6471145153045654, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.2409319281578064, "step": 782 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 357.9285888671875, "epoch": 0.01768772024939008, "grad_norm": 4.789903449246839, "kl": 0.06396484375, "learning_rate": 9.9923022821191e-07, "loss": 0.0026, "reward": 1.6512703895568848, "reward_std": 0.03788610175251961, "rewards/accuracy_reward": 0.37554147839546204, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1757289469242096, "step": 783 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 366.96429443359375, "epoch": 0.017710309930423784, "grad_norm": 1.8505023672235883, "kl": 0.06689453125, "learning_rate": 9.99228258734262e-07, "loss": 0.0027, "reward": 1.8899128437042236, "reward_std": 0.2745928168296814, "rewards/accuracy_reward": 0.5301715135574341, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.252598375082016, "step": 784 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 369.8035888671875, "epoch": 0.017732899611457485, "grad_norm": 4.922444922180056, "kl": 0.0703125, "learning_rate": 9.992262867423028e-07, "loss": 0.0028, "reward": 1.8130429983139038, "reward_std": 0.12127283960580826, "rewards/accuracy_reward": 0.4120243489742279, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2403041422367096, "step": 785 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 385.33929443359375, "epoch": 0.01775548929249119, "grad_norm": 2.0921942166379535, "kl": 0.068359375, "learning_rate": 9.992243122360418e-07, "loss": 0.0027, "reward": 1.8946040868759155, "reward_std": 0.22333122789859772, "rewards/accuracy_reward": 0.5765306353569031, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2037876695394516, "step": 786 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 377.96429443359375, "epoch": 0.017778078973524895, "grad_norm": 4.474190895789292, "kl": 0.060546875, "learning_rate": 9.99222335215489e-07, "loss": 0.0024, "reward": 2.0492703914642334, "reward_std": 0.1714819222688675, "rewards/accuracy_reward": 0.5983808040618896, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2866036593914032, "step": 787 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.7142857142857143, "completion_length": 347.6964416503906, "epoch": 0.017800668654558597, "grad_norm": 1.1846041257021815, "kl": 0.0673828125, "learning_rate": 9.992203556806542e-07, "loss": 0.0027, "reward": 1.3314672708511353, "reward_std": 0.029133396223187447, "rewards/accuracy_reward": 0.1991569846868515, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0823102742433548, "step": 788 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 365.7857360839844, "epoch": 0.017823258335592302, "grad_norm": 2.2527128314192155, "kl": 0.0615234375, "learning_rate": 9.99218373631548e-07, "loss": 0.0025, "reward": 1.8940722942352295, "reward_std": 0.19721649587154388, "rewards/accuracy_reward": 0.5016580820083618, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2959856390953064, "step": 789 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 391.14288330078125, "epoch": 0.017845848016626004, "grad_norm": 2.4066464587190524, "kl": 0.04931640625, "learning_rate": 9.992163890681797e-07, "loss": 0.002, "reward": 1.4083067178726196, "reward_std": 0.2923932671546936, "rewards/accuracy_reward": 0.24527296423912048, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1094622015953064, "step": 790 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 388.0535888671875, "epoch": 0.01786843769765971, "grad_norm": 3.2009165005640967, "kl": 0.05712890625, "learning_rate": 9.992144019905597e-07, "loss": 0.0023, "reward": 1.7049462795257568, "reward_std": 0.18574084341526031, "rewards/accuracy_reward": 0.387329638004303, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1997593492269516, "step": 791 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 374.4107360839844, "epoch": 0.017891027378693414, "grad_norm": 2.621664289706485, "kl": 0.0703125, "learning_rate": 9.992124123986976e-07, "loss": 0.0028, "reward": 1.944461464881897, "reward_std": 0.06669680774211884, "rewards/accuracy_reward": 0.47964906692504883, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2719552218914032, "step": 792 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 366.5000305175781, "epoch": 0.017913617059727115, "grad_norm": 2.6628232758979484, "kl": 0.06591796875, "learning_rate": 9.99210420292604e-07, "loss": 0.0026, "reward": 2.037675619125366, "reward_std": 0.13233661651611328, "rewards/accuracy_reward": 0.5833438634872437, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.279331773519516, "step": 793 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 402.83929443359375, "epoch": 0.01793620674076082, "grad_norm": 1.611061944193743, "kl": 0.0576171875, "learning_rate": 9.992084256722884e-07, "loss": 0.0023, "reward": 1.7950221300125122, "reward_std": 0.28375980257987976, "rewards/accuracy_reward": 0.498555064201355, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2000383734703064, "step": 794 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 373.96429443359375, "epoch": 0.017958796421794525, "grad_norm": 1.6442698959279263, "kl": 0.0595703125, "learning_rate": 9.992064285377612e-07, "loss": 0.0024, "reward": 1.8350344896316528, "reward_std": 0.24079856276512146, "rewards/accuracy_reward": 0.5307862758636475, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1935337632894516, "step": 795 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 382.1250305175781, "epoch": 0.017981386102828227, "grad_norm": 2.2151948809886477, "kl": 0.059814453125, "learning_rate": 9.992044288890323e-07, "loss": 0.0024, "reward": 1.7768415212631226, "reward_std": 0.26870033144950867, "rewards/accuracy_reward": 0.4631589651107788, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2208252102136612, "step": 796 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 375.3750305175781, "epoch": 0.018003975783861932, "grad_norm": 3.9180566077759176, "kl": 0.05908203125, "learning_rate": 9.992024267261116e-07, "loss": 0.0024, "reward": 1.9461263418197632, "reward_std": 0.232185959815979, "rewards/accuracy_reward": 0.5183883905410767, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2527378797531128, "step": 797 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 403.39288330078125, "epoch": 0.018026565464895637, "grad_norm": 5.674599219710167, "kl": 0.0625, "learning_rate": 9.992004220490095e-07, "loss": 0.0025, "reward": 1.9273954629898071, "reward_std": 0.17222413420677185, "rewards/accuracy_reward": 0.5510701537132263, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2334682047367096, "step": 798 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 379.01788330078125, "epoch": 0.01804915514592934, "grad_norm": 1.950365519671864, "kl": 0.055908203125, "learning_rate": 9.991984148577361e-07, "loss": 0.0022, "reward": 2.0241870880126953, "reward_std": 0.16345351934432983, "rewards/accuracy_reward": 0.6354663968086243, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2422921359539032, "step": 799 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 359.9285888671875, "epoch": 0.018071744826963044, "grad_norm": 1.4082505046142277, "kl": 0.05517578125, "learning_rate": 9.991964051523013e-07, "loss": 0.0022, "reward": 1.4451454877853394, "reward_std": 0.19405466318130493, "rewards/accuracy_reward": 0.2745400071144104, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1027483269572258, "step": 800 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 365.76788330078125, "epoch": 0.018094334507996745, "grad_norm": 2.2072859373462808, "kl": 0.055419921875, "learning_rate": 9.991943929327153e-07, "loss": 0.0022, "reward": 1.962977647781372, "reward_std": 0.1705659031867981, "rewards/accuracy_reward": 0.5821216702461243, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2344273328781128, "step": 801 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 376.3750305175781, "epoch": 0.01811692418903045, "grad_norm": 2.0158318570158094, "kl": 0.049560546875, "learning_rate": 9.991923781989884e-07, "loss": 0.002, "reward": 2.0727667808532715, "reward_std": 0.3021707534790039, "rewards/accuracy_reward": 0.6696940064430237, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2566441297531128, "step": 802 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 374.8214416503906, "epoch": 0.018139513870064156, "grad_norm": 1.7788697572104701, "kl": 0.050048828125, "learning_rate": 9.991903609511302e-07, "loss": 0.002, "reward": 2.0852043628692627, "reward_std": 0.22043266892433167, "rewards/accuracy_reward": 0.6823861002922058, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2635323703289032, "step": 803 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 396.4464416503906, "epoch": 0.018162103551097857, "grad_norm": 2.21409903432406, "kl": 0.046630859375, "learning_rate": 9.991883411891516e-07, "loss": 0.0019, "reward": 1.8960860967636108, "reward_std": 0.3403969705104828, "rewards/accuracy_reward": 0.5124381184577942, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2479335367679596, "step": 804 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 378.0357360839844, "epoch": 0.018184693232131562, "grad_norm": 2.358121854422228, "kl": 0.04833984375, "learning_rate": 9.991863189130624e-07, "loss": 0.0019, "reward": 1.9652682542800903, "reward_std": 0.27103468775749207, "rewards/accuracy_reward": 0.6176049709320068, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2298060953617096, "step": 805 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 376.4285888671875, "epoch": 0.018207282913165267, "grad_norm": 2.4648142866896205, "kl": 0.05029296875, "learning_rate": 9.991842941228726e-07, "loss": 0.002, "reward": 1.7626888751983643, "reward_std": 0.33360517024993896, "rewards/accuracy_reward": 0.4273233115673065, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2317940890789032, "step": 806 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 367.2321472167969, "epoch": 0.01822987259419897, "grad_norm": 2.2460062515766, "kl": 0.046875, "learning_rate": 9.991822668185925e-07, "loss": 0.0019, "reward": 2.0689685344696045, "reward_std": 0.1543595790863037, "rewards/accuracy_reward": 0.6966296434402466, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2366245985031128, "step": 807 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 415.2500305175781, "epoch": 0.018252462275232674, "grad_norm": 3.007863543980719, "kl": 0.050048828125, "learning_rate": 9.991802370002326e-07, "loss": 0.002, "reward": 2.0373761653900146, "reward_std": 0.37231722474098206, "rewards/accuracy_reward": 0.6405917406082153, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2753557562828064, "step": 808 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 387.4285888671875, "epoch": 0.01827505195626638, "grad_norm": 6.278625627759303, "kl": 0.0537109375, "learning_rate": 9.99178204667803e-07, "loss": 0.0021, "reward": 1.7549563646316528, "reward_std": 0.31242984533309937, "rewards/accuracy_reward": 0.45382270216941833, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2154192328453064, "step": 809 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 394.6071472167969, "epoch": 0.01829764163730008, "grad_norm": 2.165274075320558, "kl": 0.049072265625, "learning_rate": 9.991761698213138e-07, "loss": 0.002, "reward": 1.7901263236999512, "reward_std": 0.11528279632329941, "rewards/accuracy_reward": 0.4845946133136749, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2055315375328064, "step": 810 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 414.51788330078125, "epoch": 0.018320231318333786, "grad_norm": 3.7690032903431083, "kl": 0.039306640625, "learning_rate": 9.991741324607752e-07, "loss": 0.0016, "reward": 1.8314059972763062, "reward_std": 0.19659140706062317, "rewards/accuracy_reward": 0.5194709300994873, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.197649285197258, "step": 811 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 366.51788330078125, "epoch": 0.018342820999367487, "grad_norm": 4.435101609579215, "kl": 0.0654296875, "learning_rate": 9.991720925861977e-07, "loss": 0.0026, "reward": 1.9739282131195068, "reward_std": 0.18048734962940216, "rewards/accuracy_reward": 0.5783853530883789, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2348284125328064, "step": 812 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 383.9464416503906, "epoch": 0.018365410680401192, "grad_norm": 2.739710069267869, "kl": 0.0615234375, "learning_rate": 9.991700501975915e-07, "loss": 0.0025, "reward": 1.8333123922348022, "reward_std": 0.2525896430015564, "rewards/accuracy_reward": 0.4963110387325287, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2477155476808548, "step": 813 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 372.14288330078125, "epoch": 0.018388000361434897, "grad_norm": 28.002141072924342, "kl": 0.1611328125, "learning_rate": 9.991680052949667e-07, "loss": 0.0064, "reward": 1.6802924871444702, "reward_std": 0.21274593472480774, "rewards/accuracy_reward": 0.44241219758987427, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1664516031742096, "step": 814 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 363.9107360839844, "epoch": 0.0184105900424686, "grad_norm": 1.8576527563127359, "kl": 0.0615234375, "learning_rate": 9.991659578783337e-07, "loss": 0.0025, "reward": 1.8517323732376099, "reward_std": 0.27246615290641785, "rewards/accuracy_reward": 0.5505952835083008, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2011370062828064, "step": 815 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 378.64288330078125, "epoch": 0.018433179723502304, "grad_norm": 3.2598735052609045, "kl": 0.054443359375, "learning_rate": 9.99163907947703e-07, "loss": 0.0022, "reward": 1.965035080909729, "reward_std": 0.15899106860160828, "rewards/accuracy_reward": 0.5659837126731873, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2419084906578064, "step": 816 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 370.0000305175781, "epoch": 0.01845576940453601, "grad_norm": 1.6482992070438849, "kl": 0.060546875, "learning_rate": 9.991618555030848e-07, "loss": 0.0024, "reward": 1.6143052577972412, "reward_std": 0.22147588431835175, "rewards/accuracy_reward": 0.3836619555950165, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.141357421875, "step": 817 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 374.9285888671875, "epoch": 0.01847835908556971, "grad_norm": 2.068725090816121, "kl": 0.056884765625, "learning_rate": 9.991598005444893e-07, "loss": 0.0023, "reward": 1.6631300449371338, "reward_std": 0.1618383377790451, "rewards/accuracy_reward": 0.3931243121623993, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1842913031578064, "step": 818 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 366.39288330078125, "epoch": 0.018500948766603416, "grad_norm": 1.7263971969124068, "kl": 0.0673828125, "learning_rate": 9.99157743071927e-07, "loss": 0.0027, "reward": 1.8490397930145264, "reward_std": 0.1507011204957962, "rewards/accuracy_reward": 0.5595238208770752, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1823730617761612, "step": 819 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 362.8035888671875, "epoch": 0.01852353844763712, "grad_norm": 2.758941594613635, "kl": 0.06396484375, "learning_rate": 9.991556830854082e-07, "loss": 0.0026, "reward": 1.8871482610702515, "reward_std": 0.08640771359205246, "rewards/accuracy_reward": 0.4712301194667816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2623465657234192, "step": 820 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 375.0714416503906, "epoch": 0.018546128128670823, "grad_norm": 2.3779056051919074, "kl": 0.05810546875, "learning_rate": 9.991536205849434e-07, "loss": 0.0023, "reward": 2.128242254257202, "reward_std": 0.2587119936943054, "rewards/accuracy_reward": 0.6339410543441772, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3193010687828064, "step": 821 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 370.1607360839844, "epoch": 0.018568717809704528, "grad_norm": 2.9176085095480984, "kl": 0.06689453125, "learning_rate": 9.991515555705428e-07, "loss": 0.0027, "reward": 2.141608953475952, "reward_std": 0.12619923055171967, "rewards/accuracy_reward": 0.6977437734603882, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.3010079562664032, "step": 822 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 371.21429443359375, "epoch": 0.01859130749073823, "grad_norm": 2.404138105520456, "kl": 0.056884765625, "learning_rate": 9.991494880422168e-07, "loss": 0.0023, "reward": 1.7117046117782593, "reward_std": 0.33293479681015015, "rewards/accuracy_reward": 0.3852187693119049, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2193429172039032, "step": 823 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 389.3571472167969, "epoch": 0.018613897171771934, "grad_norm": 2.0829599146020588, "kl": 0.06103515625, "learning_rate": 9.99147417999976e-07, "loss": 0.0024, "reward": 1.8636895418167114, "reward_std": 0.2497768998146057, "rewards/accuracy_reward": 0.5366700887680054, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2127336859703064, "step": 824 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 371.3035888671875, "epoch": 0.01863648685280564, "grad_norm": 1.5871547713493426, "kl": 0.05908203125, "learning_rate": 9.991453454438306e-07, "loss": 0.0024, "reward": 1.843176007270813, "reward_std": 0.3195580840110779, "rewards/accuracy_reward": 0.4998757839202881, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2397286593914032, "step": 825 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 404.33929443359375, "epoch": 0.01865907653383934, "grad_norm": 3.8809268287184002, "kl": 0.05859375, "learning_rate": 9.991432703737913e-07, "loss": 0.0024, "reward": 1.7288029193878174, "reward_std": 0.13974420726299286, "rewards/accuracy_reward": 0.37017762660980225, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2157680094242096, "step": 826 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.5714285714285714, "completion_length": 356.9464416503906, "epoch": 0.018681666214873046, "grad_norm": 0.9124009197679929, "kl": 0.052734375, "learning_rate": 9.991411927898684e-07, "loss": 0.0021, "reward": 1.50892174243927, "reward_std": 0.15331816673278809, "rewards/accuracy_reward": 0.3392857313156128, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1089216023683548, "step": 827 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 402.3214416503906, "epoch": 0.01870425589590675, "grad_norm": 3.458638603450084, "kl": 0.06689453125, "learning_rate": 9.991391126920724e-07, "loss": 0.0027, "reward": 1.7818585634231567, "reward_std": 0.25848063826560974, "rewards/accuracy_reward": 0.36563000082969666, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2769426703453064, "step": 828 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.2857142857142857, "completion_length": 367.9464416503906, "epoch": 0.018726845576940453, "grad_norm": 3.3625439170834177, "kl": 0.06396484375, "learning_rate": 9.991370300804137e-07, "loss": 0.0026, "reward": 1.935517430305481, "reward_std": 0.09855169802904129, "rewards/accuracy_reward": 0.6171788573265076, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2076241672039032, "step": 829 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 378.4107360839844, "epoch": 0.018749435257974158, "grad_norm": 2.1666961059084335, "kl": 0.060791015625, "learning_rate": 9.991349449549026e-07, "loss": 0.0024, "reward": 2.2802681922912598, "reward_std": 0.16862863302230835, "rewards/accuracy_reward": 0.7995762228965759, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2985491156578064, "step": 830 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 383.21429443359375, "epoch": 0.018772024939007863, "grad_norm": 2.266877170660508, "kl": 0.05908203125, "learning_rate": 9.991328573155502e-07, "loss": 0.0024, "reward": 1.642372488975525, "reward_std": 0.33831366896629333, "rewards/accuracy_reward": 0.4157680869102478, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.1801757961511612, "step": 831 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 355.1785888671875, "epoch": 0.018794614620041564, "grad_norm": 6.404509670409695, "kl": 0.06982421875, "learning_rate": 9.991307671623665e-07, "loss": 0.0028, "reward": 1.8492915630340576, "reward_std": 0.14039556682109833, "rewards/accuracy_reward": 0.4987963140010834, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2397809773683548, "step": 832 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 407.01788330078125, "epoch": 0.01881720430107527, "grad_norm": 1.2012475664390205, "kl": 0.046875, "learning_rate": 9.991286744953622e-07, "loss": 0.0019, "reward": 1.6675705909729004, "reward_std": 0.35131320357322693, "rewards/accuracy_reward": 0.45708298683166504, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1604875922203064, "step": 833 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 362.08929443359375, "epoch": 0.01883979398210897, "grad_norm": 3.4130103717275864, "kl": 0.060791015625, "learning_rate": 9.991265793145479e-07, "loss": 0.0024, "reward": 1.918511152267456, "reward_std": 0.1620391458272934, "rewards/accuracy_reward": 0.5116193294525146, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2604631781578064, "step": 834 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 349.7500305175781, "epoch": 0.018862383663142676, "grad_norm": 2.6775684049522726, "kl": 0.057861328125, "learning_rate": 9.99124481619934e-07, "loss": 0.0023, "reward": 1.8492281436920166, "reward_std": 0.12418801337480545, "rewards/accuracy_reward": 0.5595238208770752, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1789899617433548, "step": 835 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 390.7500305175781, "epoch": 0.01888497334417638, "grad_norm": 2.0412541884967035, "kl": 0.0556640625, "learning_rate": 9.99122381411531e-07, "loss": 0.0022, "reward": 2.0014514923095703, "reward_std": 0.21715012192726135, "rewards/accuracy_reward": 0.6071224212646484, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2586146891117096, "step": 836 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 380.96429443359375, "epoch": 0.018907563025210083, "grad_norm": 2.5289855071896494, "kl": 0.060302734375, "learning_rate": 9.9912027868935e-07, "loss": 0.0024, "reward": 1.8526729345321655, "reward_std": 0.14097750186920166, "rewards/accuracy_reward": 0.4373756945133209, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2688685953617096, "step": 837 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 370.83929443359375, "epoch": 0.018930152706243788, "grad_norm": 3.970077349869018, "kl": 0.05859375, "learning_rate": 9.99118173453401e-07, "loss": 0.0023, "reward": 2.1731977462768555, "reward_std": 0.24780739843845367, "rewards/accuracy_reward": 0.6742684841156006, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3310721516609192, "step": 838 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 379.3571472167969, "epoch": 0.018952742387277493, "grad_norm": 3.4681679456392063, "kl": 0.068359375, "learning_rate": 9.991160657036945e-07, "loss": 0.0027, "reward": 2.0228970050811768, "reward_std": 0.09942422062158585, "rewards/accuracy_reward": 0.6022775173187256, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2777622938156128, "step": 839 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 392.9821472167969, "epoch": 0.018975332068311195, "grad_norm": 2.932888468789156, "kl": 0.0595703125, "learning_rate": 9.991139554402418e-07, "loss": 0.0024, "reward": 1.7419986724853516, "reward_std": 0.12158215790987015, "rewards/accuracy_reward": 0.4468115568161011, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1916155219078064, "step": 840 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 397.08929443359375, "epoch": 0.0189979217493449, "grad_norm": 1.4392371350750566, "kl": 0.05224609375, "learning_rate": 9.991118426630532e-07, "loss": 0.0021, "reward": 1.8586362600326538, "reward_std": 0.22613723576068878, "rewards/accuracy_reward": 0.5174601078033447, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2233189344406128, "step": 841 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 366.26788330078125, "epoch": 0.019020511430378605, "grad_norm": 10.76164974711611, "kl": 0.0634765625, "learning_rate": 9.99109727372139e-07, "loss": 0.0025, "reward": 1.6466498374938965, "reward_std": 0.035402823239564896, "rewards/accuracy_reward": 0.3461335599422455, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1898019015789032, "step": 842 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 405.46429443359375, "epoch": 0.019043101111412306, "grad_norm": 3.4904492022089206, "kl": 0.0625, "learning_rate": 9.991076095675102e-07, "loss": 0.0025, "reward": 1.7937829494476318, "reward_std": 0.028727922588586807, "rewards/accuracy_reward": 0.4248516261577606, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2260742336511612, "step": 843 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 405.8571472167969, "epoch": 0.01906569079244601, "grad_norm": 1.521612278508405, "kl": 0.05126953125, "learning_rate": 9.991054892491776e-07, "loss": 0.0021, "reward": 2.1160712242126465, "reward_std": 0.3383042514324188, "rewards/accuracy_reward": 0.7094236016273499, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2602190375328064, "step": 844 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 391.76788330078125, "epoch": 0.019088280473479713, "grad_norm": 2.4752830435249558, "kl": 0.06494140625, "learning_rate": 9.991033664171515e-07, "loss": 0.0026, "reward": 1.6199694871902466, "reward_std": 0.17121244966983795, "rewards/accuracy_reward": 0.2729617655277252, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2184361219406128, "step": 845 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 390.64288330078125, "epoch": 0.019110870154513418, "grad_norm": 10.308753025032377, "kl": 0.06640625, "learning_rate": 9.99101241071443e-07, "loss": 0.0027, "reward": 2.1757612228393555, "reward_std": 0.1492956131696701, "rewards/accuracy_reward": 0.6912466883659363, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2880859375, "step": 846 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 444.3750305175781, "epoch": 0.019133459835547123, "grad_norm": 1.4032423744269853, "kl": 0.048828125, "learning_rate": 9.990991132120625e-07, "loss": 0.002, "reward": 1.9369885921478271, "reward_std": 0.21930231153964996, "rewards/accuracy_reward": 0.6322871446609497, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2118443101644516, "step": 847 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 375.01788330078125, "epoch": 0.019156049516580825, "grad_norm": 2.677371876945249, "kl": 0.0673828125, "learning_rate": 9.990969828390208e-07, "loss": 0.0027, "reward": 2.2542848587036133, "reward_std": 0.06002981960773468, "rewards/accuracy_reward": 0.766903281211853, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.298095703125, "step": 848 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 383.7857360839844, "epoch": 0.01917863919761453, "grad_norm": 1.9722945294263332, "kl": 0.059814453125, "learning_rate": 9.990948499523286e-07, "loss": 0.0024, "reward": 1.9942104816436768, "reward_std": 0.23074676096439362, "rewards/accuracy_reward": 0.5676145553588867, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2515956461429596, "step": 849 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 377.5000305175781, "epoch": 0.019201228878648235, "grad_norm": 2.3616127979221693, "kl": 0.0703125, "learning_rate": 9.990927145519967e-07, "loss": 0.0028, "reward": 1.7351374626159668, "reward_std": 0.03366037458181381, "rewards/accuracy_reward": 0.40491965413093567, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1873604953289032, "step": 850 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 365.0000305175781, "epoch": 0.019223818559681936, "grad_norm": 2.704702468914278, "kl": 0.07421875, "learning_rate": 9.990905766380358e-07, "loss": 0.003, "reward": 1.921852707862854, "reward_std": 0.31696635484695435, "rewards/accuracy_reward": 0.5650061964988708, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2139892727136612, "step": 851 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 395.1964416503906, "epoch": 0.01924640824071564, "grad_norm": 1.966873097884997, "kl": 0.050537109375, "learning_rate": 9.990884362104566e-07, "loss": 0.002, "reward": 1.5919846296310425, "reward_std": 0.13623332977294922, "rewards/accuracy_reward": 0.35630175471305847, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1642543375492096, "step": 852 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 390.6071472167969, "epoch": 0.019268997921749347, "grad_norm": 4.05949463940582, "kl": 0.0634765625, "learning_rate": 9.990862932692701e-07, "loss": 0.0025, "reward": 2.064089775085449, "reward_std": 0.0990174189209938, "rewards/accuracy_reward": 0.6139047741889954, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2787562906742096, "step": 853 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 403.6071472167969, "epoch": 0.019291587602783048, "grad_norm": 2.475817506595684, "kl": 0.0546875, "learning_rate": 9.990841478144868e-07, "loss": 0.0022, "reward": 1.5779340267181396, "reward_std": 0.03674842417240143, "rewards/accuracy_reward": 0.30643558502197266, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.160784050822258, "step": 854 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 412.0357360839844, "epoch": 0.019314177283816753, "grad_norm": 2.2332430878380594, "kl": 0.058349609375, "learning_rate": 9.99081999846118e-07, "loss": 0.0023, "reward": 1.8648935556411743, "reward_std": 0.18602697551250458, "rewards/accuracy_reward": 0.48875296115875244, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2582833468914032, "step": 855 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 386.0535888671875, "epoch": 0.019336766964850455, "grad_norm": 3.313793323896126, "kl": 0.061279296875, "learning_rate": 9.99079849364174e-07, "loss": 0.0024, "reward": 1.6751954555511475, "reward_std": 0.25305888056755066, "rewards/accuracy_reward": 0.39073309302330017, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1808907687664032, "step": 856 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 369.5714416503906, "epoch": 0.01935935664588416, "grad_norm": 2.8092390836906165, "kl": 0.0537109375, "learning_rate": 9.990776963686659e-07, "loss": 0.0021, "reward": 1.7526479959487915, "reward_std": 0.17970237135887146, "rewards/accuracy_reward": 0.4464222490787506, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.202654168009758, "step": 857 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 368.01788330078125, "epoch": 0.019381946326917865, "grad_norm": 1.8274973940909045, "kl": 0.05078125, "learning_rate": 9.990755408596043e-07, "loss": 0.002, "reward": 1.8034889698028564, "reward_std": 0.24559088051319122, "rewards/accuracy_reward": 0.5208333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1862269937992096, "step": 858 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 381.9107360839844, "epoch": 0.019404536007951566, "grad_norm": 2.202022188068607, "kl": 0.057373046875, "learning_rate": 9.990733828370004e-07, "loss": 0.0023, "reward": 1.7475335597991943, "reward_std": 0.23591823875904083, "rewards/accuracy_reward": 0.37894755601882935, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2293003797531128, "step": 859 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 339.4464416503906, "epoch": 0.01942712568898527, "grad_norm": 2.4515422782302836, "kl": 0.06640625, "learning_rate": 9.990712223008647e-07, "loss": 0.0027, "reward": 1.7769242525100708, "reward_std": 0.2306549996137619, "rewards/accuracy_reward": 0.5153518915176392, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.183000847697258, "step": 860 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 356.5000305175781, "epoch": 0.019449715370018977, "grad_norm": 2.716295430302214, "kl": 0.052734375, "learning_rate": 9.990690592512086e-07, "loss": 0.0021, "reward": 1.7326123714447021, "reward_std": 0.13621383905410767, "rewards/accuracy_reward": 0.4356570839881897, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1826695054769516, "step": 861 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 343.14288330078125, "epoch": 0.019472305051052678, "grad_norm": 5.8381683931699735, "kl": 0.064453125, "learning_rate": 9.990668936880425e-07, "loss": 0.0026, "reward": 1.9259992837905884, "reward_std": 0.0626303106546402, "rewards/accuracy_reward": 0.51179039478302, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2606375813484192, "step": 862 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.42857142857142855, "completion_length": 353.2321472167969, "epoch": 0.019494894732086383, "grad_norm": 1.6585766206656476, "kl": 0.055908203125, "learning_rate": 9.990647256113773e-07, "loss": 0.0022, "reward": 1.9009348154067993, "reward_std": 0.03893570229411125, "rewards/accuracy_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2259347140789032, "step": 863 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 370.01788330078125, "epoch": 0.01951748441312009, "grad_norm": 2.6356748655907625, "kl": 0.0498046875, "learning_rate": 9.990625550212244e-07, "loss": 0.002, "reward": 1.9183017015457153, "reward_std": 0.12156493961811066, "rewards/accuracy_reward": 0.561367928981781, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2497907429933548, "step": 864 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 359.1250305175781, "epoch": 0.01954007409415379, "grad_norm": 2.2493027677902373, "kl": 0.060546875, "learning_rate": 9.990603819175942e-07, "loss": 0.0024, "reward": 2.129603624343872, "reward_std": 0.16177546977996826, "rewards/accuracy_reward": 0.67673659324646, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.3314383625984192, "step": 865 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 417.4821472167969, "epoch": 0.019562663775187495, "grad_norm": 6.397693298364241, "kl": 0.04296875, "learning_rate": 9.99058206300498e-07, "loss": 0.0017, "reward": 1.8916966915130615, "reward_std": 0.2568168044090271, "rewards/accuracy_reward": 0.5521842837333679, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2323695719242096, "step": 866 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 400.6071472167969, "epoch": 0.019585253456221197, "grad_norm": 1.6925332499763868, "kl": 0.046142578125, "learning_rate": 9.990560281699464e-07, "loss": 0.0018, "reward": 1.8912885189056396, "reward_std": 0.27133283019065857, "rewards/accuracy_reward": 0.5445913076400757, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.221697136759758, "step": 867 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 406.4821472167969, "epoch": 0.0196078431372549, "grad_norm": 3.3260584955406123, "kl": 0.044189453125, "learning_rate": 9.99053847525951e-07, "loss": 0.0018, "reward": 2.0331335067749023, "reward_std": 0.22975561022758484, "rewards/accuracy_reward": 0.6785714626312256, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2474190890789032, "step": 868 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 411.3571472167969, "epoch": 0.019630432818288607, "grad_norm": 2.9084683480983045, "kl": 0.048095703125, "learning_rate": 9.990516643685221e-07, "loss": 0.0019, "reward": 1.8880228996276855, "reward_std": 0.27905693650245667, "rewards/accuracy_reward": 0.5635896921157837, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1958618313074112, "step": 869 }, { "all_correct": 0.8571428571428571, "all_wrong": 0.0, "completion_length": 371.3750305175781, "epoch": 0.01965302249932231, "grad_norm": 1.9737437891468808, "kl": 0.0537109375, "learning_rate": 9.99049478697671e-07, "loss": 0.0022, "reward": 2.267995834350586, "reward_std": 0.1736246645450592, "rewards/accuracy_reward": 0.8018023371696472, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2947649359703064, "step": 870 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 370.3571472167969, "epoch": 0.019675612180356013, "grad_norm": 7.630647262599972, "kl": 0.0625, "learning_rate": 9.990472905134086e-07, "loss": 0.0025, "reward": 1.816330075263977, "reward_std": 0.1019475981593132, "rewards/accuracy_reward": 0.4781150221824646, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.206072136759758, "step": 871 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 384.3035888671875, "epoch": 0.01969820186138972, "grad_norm": 2.2401530397424527, "kl": 0.06298828125, "learning_rate": 9.990450998157462e-07, "loss": 0.0025, "reward": 1.9081740379333496, "reward_std": 0.2137361615896225, "rewards/accuracy_reward": 0.554609477519989, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2428501844406128, "step": 872 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 366.1071472167969, "epoch": 0.01972079154242342, "grad_norm": 7.438469038537925, "kl": 0.0673828125, "learning_rate": 9.990429066046946e-07, "loss": 0.0027, "reward": 2.077298879623413, "reward_std": 0.15394476056098938, "rewards/accuracy_reward": 0.6726183295249939, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2618233859539032, "step": 873 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 394.5714416503906, "epoch": 0.019743381223457125, "grad_norm": 9.728886509311275, "kl": 0.0556640625, "learning_rate": 9.990407108802648e-07, "loss": 0.0022, "reward": 2.022531509399414, "reward_std": 0.08008179068565369, "rewards/accuracy_reward": 0.6053649187088013, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2564522922039032, "step": 874 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 393.8035888671875, "epoch": 0.01976597090449083, "grad_norm": 2.567607499836925, "kl": 0.055908203125, "learning_rate": 9.99038512642468e-07, "loss": 0.0022, "reward": 1.7120146751403809, "reward_std": 0.2563090920448303, "rewards/accuracy_reward": 0.4423717260360718, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1696428656578064, "step": 875 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 381.46429443359375, "epoch": 0.019788560585524532, "grad_norm": 4.4241614499689375, "kl": 0.060302734375, "learning_rate": 9.99036311891315e-07, "loss": 0.0024, "reward": 1.9297772645950317, "reward_std": 0.06273616850376129, "rewards/accuracy_reward": 0.5193976163864136, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2460937649011612, "step": 876 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 398.26788330078125, "epoch": 0.019811150266558237, "grad_norm": 2.336909335808514, "kl": 0.049072265625, "learning_rate": 9.990341086268172e-07, "loss": 0.002, "reward": 1.4852863550186157, "reward_std": 0.2614281177520752, "rewards/accuracy_reward": 0.3084518015384674, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1125488355755806, "step": 877 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 419.5357360839844, "epoch": 0.01983373994759194, "grad_norm": 2.276660376279331, "kl": 0.053466796875, "learning_rate": 9.990319028489858e-07, "loss": 0.0021, "reward": 1.7237035036087036, "reward_std": 0.20467345416545868, "rewards/accuracy_reward": 0.36725103855133057, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2385951578617096, "step": 878 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 407.9107360839844, "epoch": 0.019856329628625644, "grad_norm": 4.598526334063921, "kl": 0.052734375, "learning_rate": 9.990296945578316e-07, "loss": 0.0021, "reward": 1.964134931564331, "reward_std": 0.291042298078537, "rewards/accuracy_reward": 0.5956534147262573, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2470528781414032, "step": 879 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 405.76788330078125, "epoch": 0.01987891930965935, "grad_norm": 2.957940704654425, "kl": 0.06201171875, "learning_rate": 9.990274837533657e-07, "loss": 0.0025, "reward": 1.7955548763275146, "reward_std": 0.16376911103725433, "rewards/accuracy_reward": 0.3973737061023712, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2696097493171692, "step": 880 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.5714285714285714, "completion_length": 416.7500305175781, "epoch": 0.01990150899069305, "grad_norm": 1.0833903981902322, "kl": 0.046142578125, "learning_rate": 9.990252704355995e-07, "loss": 0.0018, "reward": 1.6227695941925049, "reward_std": 0.10432884842157364, "rewards/accuracy_reward": 0.40455305576324463, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1360735297203064, "step": 881 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 382.7321472167969, "epoch": 0.019924098671726755, "grad_norm": 2.0607905468427936, "kl": 0.052978515625, "learning_rate": 9.99023054604544e-07, "loss": 0.0021, "reward": 2.176024913787842, "reward_std": 0.11020920425653458, "rewards/accuracy_reward": 0.7547165155410767, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.260593980550766, "step": 882 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 417.33929443359375, "epoch": 0.01994668835276046, "grad_norm": 2.3060240506102008, "kl": 0.048583984375, "learning_rate": 9.990208362602103e-07, "loss": 0.0019, "reward": 1.6649805307388306, "reward_std": 0.14435218274593353, "rewards/accuracy_reward": 0.3400746285915375, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.207048699259758, "step": 883 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.5714285714285714, "completion_length": 370.14288330078125, "epoch": 0.019969278033794162, "grad_norm": 1.267379206973, "kl": 0.060791015625, "learning_rate": 9.990186154026098e-07, "loss": 0.0024, "reward": 1.4035319089889526, "reward_std": 0.0690922886133194, "rewards/accuracy_reward": 0.2559524178504944, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0904366672039032, "step": 884 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 405.5535888671875, "epoch": 0.019991867714827867, "grad_norm": 4.242177482677213, "kl": 0.054931640625, "learning_rate": 9.990163920317532e-07, "loss": 0.0022, "reward": 1.8104292154312134, "reward_std": 0.12744009494781494, "rewards/accuracy_reward": 0.40782371163368225, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2597481906414032, "step": 885 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 371.8571472167969, "epoch": 0.020014457395861572, "grad_norm": 1.9396365452354538, "kl": 0.060791015625, "learning_rate": 9.990141661476523e-07, "loss": 0.0024, "reward": 1.9866489171981812, "reward_std": 0.11475202441215515, "rewards/accuracy_reward": 0.6354002356529236, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2262486219406128, "step": 886 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 391.5000305175781, "epoch": 0.020037047076895274, "grad_norm": 1.5933101759474713, "kl": 0.056640625, "learning_rate": 9.99011937750318e-07, "loss": 0.0023, "reward": 1.8764127492904663, "reward_std": 0.1274072825908661, "rewards/accuracy_reward": 0.5365270376205444, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2220284640789032, "step": 887 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 414.2321472167969, "epoch": 0.02005963675792898, "grad_norm": 2.2009392125284317, "kl": 0.050048828125, "learning_rate": 9.990097068397613e-07, "loss": 0.002, "reward": 2.241713047027588, "reward_std": 0.1836879849433899, "rewards/accuracy_reward": 0.8112894892692566, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2911376953125, "step": 888 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 435.89288330078125, "epoch": 0.02008222643896268, "grad_norm": 3.7964596134459847, "kl": 0.043701171875, "learning_rate": 9.99007473415994e-07, "loss": 0.0017, "reward": 1.7832797765731812, "reward_std": 0.23830178380012512, "rewards/accuracy_reward": 0.4484162926673889, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2170061469078064, "step": 889 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 375.4821472167969, "epoch": 0.020104816119996385, "grad_norm": 41.44180977844967, "kl": 0.0810546875, "learning_rate": 9.990052374790267e-07, "loss": 0.0032, "reward": 2.111335515975952, "reward_std": 0.23573334515094757, "rewards/accuracy_reward": 0.6792415976524353, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2713797688484192, "step": 890 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 411.1607360839844, "epoch": 0.02012740580103009, "grad_norm": 2.6957512847859753, "kl": 0.048828125, "learning_rate": 9.990029990288714e-07, "loss": 0.002, "reward": 1.4689890146255493, "reward_std": 0.10736312717199326, "rewards/accuracy_reward": 0.20453235507011414, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1573137640953064, "step": 891 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 421.4821472167969, "epoch": 0.020149995482063792, "grad_norm": 5.666900811889456, "kl": 0.052490234375, "learning_rate": 9.990007580655388e-07, "loss": 0.0021, "reward": 1.597846269607544, "reward_std": 0.22573533654212952, "rewards/accuracy_reward": 0.2758491039276123, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2077113687992096, "step": 892 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 379.5357360839844, "epoch": 0.020172585163097497, "grad_norm": 12.146022074890615, "kl": 0.049072265625, "learning_rate": 9.989985145890403e-07, "loss": 0.002, "reward": 1.563113808631897, "reward_std": 0.2203706055879593, "rewards/accuracy_reward": 0.24733532965183258, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2264927625656128, "step": 893 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 356.1607360839844, "epoch": 0.020195174844131202, "grad_norm": 3.572983874959228, "kl": 0.061767578125, "learning_rate": 9.989962685993873e-07, "loss": 0.0025, "reward": 1.815874695777893, "reward_std": 0.14168697595596313, "rewards/accuracy_reward": 0.5050175786018372, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2001430094242096, "step": 894 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 374.1071472167969, "epoch": 0.020217764525164904, "grad_norm": 2.7302690586023757, "kl": 0.05615234375, "learning_rate": 9.98994020096591e-07, "loss": 0.0022, "reward": 2.122257947921753, "reward_std": 0.1393304169178009, "rewards/accuracy_reward": 0.6476624608039856, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.299595445394516, "step": 895 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 413.0535888671875, "epoch": 0.02024035420619861, "grad_norm": 2.2242454874927273, "kl": 0.05322265625, "learning_rate": 9.98991769080663e-07, "loss": 0.0021, "reward": 1.6047946214675903, "reward_std": 0.3279789388179779, "rewards/accuracy_reward": 0.4184977412223816, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1505824625492096, "step": 896 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 353.76788330078125, "epoch": 0.020262943887232314, "grad_norm": 2.139846197482823, "kl": 0.06298828125, "learning_rate": 9.989895155516142e-07, "loss": 0.0025, "reward": 1.67721426486969, "reward_std": 0.15252052247524261, "rewards/accuracy_reward": 0.3391037583351135, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2238246500492096, "step": 897 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 394.1964416503906, "epoch": 0.020285533568266016, "grad_norm": 4.69194272321002, "kl": 0.047607421875, "learning_rate": 9.989872595094564e-07, "loss": 0.0019, "reward": 1.8497105836868286, "reward_std": 0.33172789216041565, "rewards/accuracy_reward": 0.48947054147720337, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2280971109867096, "step": 898 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 374.6964416503906, "epoch": 0.02030812324929972, "grad_norm": 3.1174103615547906, "kl": 0.068359375, "learning_rate": 9.989850009542005e-07, "loss": 0.0027, "reward": 1.9449383020401, "reward_std": 0.15440158545970917, "rewards/accuracy_reward": 0.6016137003898621, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2290387898683548, "step": 899 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 364.4285888671875, "epoch": 0.020330712930333422, "grad_norm": 2.3004456079015907, "kl": 0.0634765625, "learning_rate": 9.989827398858584e-07, "loss": 0.0025, "reward": 1.9505869150161743, "reward_std": 0.1943599283695221, "rewards/accuracy_reward": 0.4984419345855713, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2878592610359192, "step": 900 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 365.4107360839844, "epoch": 0.020353302611367127, "grad_norm": 2.6496318507875953, "kl": 0.055419921875, "learning_rate": 9.98980476304441e-07, "loss": 0.0022, "reward": 1.8605722188949585, "reward_std": 0.09257945418357849, "rewards/accuracy_reward": 0.4607151746749878, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2462855875492096, "step": 901 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 352.5000305175781, "epoch": 0.020375892292400832, "grad_norm": 2.413590680401595, "kl": 0.05908203125, "learning_rate": 9.9897821020996e-07, "loss": 0.0024, "reward": 1.6829962730407715, "reward_std": 0.06458266079425812, "rewards/accuracy_reward": 0.34982797503471375, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2117396891117096, "step": 902 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 354.6071472167969, "epoch": 0.020398481973434534, "grad_norm": 3.2412358712608613, "kl": 0.053466796875, "learning_rate": 9.989759416024265e-07, "loss": 0.0021, "reward": 1.704073429107666, "reward_std": 0.058938294649124146, "rewards/accuracy_reward": 0.3784317076206207, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2113560289144516, "step": 903 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.5714285714285714, "completion_length": 401.96429443359375, "epoch": 0.02042107165446824, "grad_norm": 3.2956148633520344, "kl": 0.052978515625, "learning_rate": 9.989736704818524e-07, "loss": 0.0021, "reward": 1.276924729347229, "reward_std": 0.16422729194164276, "rewards/accuracy_reward": 0.21410544216632843, "rewards/format_reward": 0.910714328289032, "rewards/semantic_reward": 0.0949619859457016, "step": 904 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 427.0357360839844, "epoch": 0.020443661335501944, "grad_norm": 3.733507256693784, "kl": 0.044921875, "learning_rate": 9.989713968482487e-07, "loss": 0.0018, "reward": 1.426210880279541, "reward_std": 0.293590247631073, "rewards/accuracy_reward": 0.3449747860431671, "rewards/format_reward": 0.8750000596046448, "rewards/semantic_reward": 0.124093197286129, "step": 905 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 351.39288330078125, "epoch": 0.020466251016535646, "grad_norm": 4.118933753245537, "kl": 0.064453125, "learning_rate": 9.989691207016273e-07, "loss": 0.0026, "reward": 1.7060071229934692, "reward_std": 0.13136713206768036, "rewards/accuracy_reward": 0.4029203951358795, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2173723578453064, "step": 906 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 337.625, "epoch": 0.02048884069756935, "grad_norm": 2.5002632680964583, "kl": 0.057861328125, "learning_rate": 9.989668420419992e-07, "loss": 0.0023, "reward": 2.2658190727233887, "reward_std": 0.14476044476032257, "rewards/accuracy_reward": 0.8130252957344055, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3099365234375, "step": 907 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 371.6964416503906, "epoch": 0.020511430378603056, "grad_norm": 2.3754558746348566, "kl": 0.0634765625, "learning_rate": 9.989645608693763e-07, "loss": 0.0025, "reward": 1.587184190750122, "reward_std": 0.2509857714176178, "rewards/accuracy_reward": 0.3497539162635803, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1517159640789032, "step": 908 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 372.5535888671875, "epoch": 0.020534020059636757, "grad_norm": 2.1758039269713922, "kl": 0.046630859375, "learning_rate": 9.989622771837697e-07, "loss": 0.0019, "reward": 1.7489981651306152, "reward_std": 0.04105871543288231, "rewards/accuracy_reward": 0.4875444173812866, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.168596550822258, "step": 909 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 349.8035888671875, "epoch": 0.020556609740670462, "grad_norm": 2.0190517476726773, "kl": 0.052978515625, "learning_rate": 9.989599909851908e-07, "loss": 0.0021, "reward": 2.188420057296753, "reward_std": 0.0452737882733345, "rewards/accuracy_reward": 0.7555272579193115, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2686070203781128, "step": 910 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 361.1607360839844, "epoch": 0.020579199421704164, "grad_norm": 3.1191282011996173, "kl": 0.059814453125, "learning_rate": 9.989577022736517e-07, "loss": 0.0024, "reward": 1.8859659433364868, "reward_std": 0.16433125734329224, "rewards/accuracy_reward": 0.48527875542640686, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2578299641609192, "step": 911 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 362.8571472167969, "epoch": 0.02060178910273787, "grad_norm": 2.922630754592793, "kl": 0.061279296875, "learning_rate": 9.989554110491635e-07, "loss": 0.0024, "reward": 2.0515670776367188, "reward_std": 0.18065932393074036, "rewards/accuracy_reward": 0.6173526644706726, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.266357421875, "step": 912 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 365.4285888671875, "epoch": 0.020624378783771574, "grad_norm": 2.841015971737669, "kl": 0.06298828125, "learning_rate": 9.98953117311738e-07, "loss": 0.0025, "reward": 2.212888717651367, "reward_std": 0.15748225152492523, "rewards/accuracy_reward": 0.7468311786651611, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2803432047367096, "step": 913 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 359.33929443359375, "epoch": 0.020646968464805276, "grad_norm": 2.779388328944268, "kl": 0.056640625, "learning_rate": 9.989508210613865e-07, "loss": 0.0023, "reward": 1.9253501892089844, "reward_std": 0.07123886048793793, "rewards/accuracy_reward": 0.5815198421478271, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.211687371134758, "step": 914 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 374.5714416503906, "epoch": 0.02066955814583898, "grad_norm": 3.177009924385021, "kl": 0.07421875, "learning_rate": 9.989485222981204e-07, "loss": 0.003, "reward": 1.8034639358520508, "reward_std": 0.03547661006450653, "rewards/accuracy_reward": 0.4604984223842621, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2001081258058548, "step": 915 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.5714416503906, "epoch": 0.020692147826872686, "grad_norm": 8.364626687086378, "kl": 0.06787109375, "learning_rate": 9.989462210219518e-07, "loss": 0.0027, "reward": 1.8256293535232544, "reward_std": 0.2562721371650696, "rewards/accuracy_reward": 0.41404658555984497, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2472970187664032, "step": 916 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 436.6607360839844, "epoch": 0.020714737507906388, "grad_norm": 2.5970879203197845, "kl": 0.064453125, "learning_rate": 9.98943917232892e-07, "loss": 0.0026, "reward": 2.049238920211792, "reward_std": 0.11343767493963242, "rewards/accuracy_reward": 0.5863098502159119, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3057861328125, "step": 917 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 383.4107360839844, "epoch": 0.020737327188940093, "grad_norm": 3.481403806162486, "kl": 0.12109375, "learning_rate": 9.989416109309528e-07, "loss": 0.0049, "reward": 1.9049601554870605, "reward_std": 0.3472961187362671, "rewards/accuracy_reward": 0.5718476176261902, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.225969597697258, "step": 918 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 418.3035888671875, "epoch": 0.020759916869973798, "grad_norm": 95.87273542515081, "kl": 28.875, "learning_rate": 9.989393021161455e-07, "loss": 1.1594, "reward": 1.7595183849334717, "reward_std": 0.27584242820739746, "rewards/accuracy_reward": 0.42572224140167236, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2302246242761612, "step": 919 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 416.76788330078125, "epoch": 0.0207825065510075, "grad_norm": 2.079394732824646, "kl": 0.0693359375, "learning_rate": 9.989369907884818e-07, "loss": 0.0028, "reward": 1.7986336946487427, "reward_std": 0.3137487471103668, "rewards/accuracy_reward": 0.42024344205856323, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2426757961511612, "step": 920 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 390.1607360839844, "epoch": 0.020805096232041204, "grad_norm": 8.717924989214588, "kl": 0.0966796875, "learning_rate": 9.989346769479733e-07, "loss": 0.0039, "reward": 1.6440218687057495, "reward_std": 0.26827558875083923, "rewards/accuracy_reward": 0.39462515711784363, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1851109117269516, "step": 921 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 395.2857360839844, "epoch": 0.020827685913074906, "grad_norm": 3.143825693745051, "kl": 0.076171875, "learning_rate": 9.989323605946321e-07, "loss": 0.003, "reward": 1.6620557308197021, "reward_std": 0.15659362077713013, "rewards/accuracy_reward": 0.3521783947944641, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2313058078289032, "step": 922 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 401.1607360839844, "epoch": 0.02085027559410861, "grad_norm": 4.0720611149781565, "kl": 0.06494140625, "learning_rate": 9.989300417284694e-07, "loss": 0.0026, "reward": 1.7961465120315552, "reward_std": 0.24299293756484985, "rewards/accuracy_reward": 0.48680639266967773, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.198625847697258, "step": 923 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 418.46429443359375, "epoch": 0.020872865275142316, "grad_norm": 2.090519964840711, "kl": 0.0693359375, "learning_rate": 9.98927720349497e-07, "loss": 0.0028, "reward": 2.1318798065185547, "reward_std": 0.1683434396982193, "rewards/accuracy_reward": 0.70015549659729, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2674386203289032, "step": 924 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 372.5000305175781, "epoch": 0.020895454956176018, "grad_norm": 10.27069524343784, "kl": 0.076171875, "learning_rate": 9.989253964577265e-07, "loss": 0.003, "reward": 2.128497362136841, "reward_std": 0.08290904015302658, "rewards/accuracy_reward": 0.6661019325256348, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2945382297039032, "step": 925 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 374.33929443359375, "epoch": 0.020918044637209723, "grad_norm": 3.8465004947337613, "kl": 0.078125, "learning_rate": 9.989230700531699e-07, "loss": 0.0031, "reward": 1.5750526189804077, "reward_std": 0.03392171114683151, "rewards/accuracy_reward": 0.2710520923137665, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.189714714884758, "step": 926 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 367.7321472167969, "epoch": 0.020940634318243428, "grad_norm": 6.574269505630602, "kl": 0.11376953125, "learning_rate": 9.989207411358387e-07, "loss": 0.0045, "reward": 1.6233782768249512, "reward_std": 0.36868590116500854, "rewards/accuracy_reward": 0.354024738073349, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1764962375164032, "step": 927 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 363.14288330078125, "epoch": 0.02096322399927713, "grad_norm": 1.8182433866456196, "kl": 0.062255859375, "learning_rate": 9.989184097057445e-07, "loss": 0.0025, "reward": 1.469152808189392, "reward_std": 0.12032182514667511, "rewards/accuracy_reward": 0.2704413831233978, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1165684312582016, "step": 928 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 371.89288330078125, "epoch": 0.020985813680310834, "grad_norm": 2.0195265547210064, "kl": 0.0595703125, "learning_rate": 9.989160757628993e-07, "loss": 0.0024, "reward": 1.7023274898529053, "reward_std": 0.0272565595805645, "rewards/accuracy_reward": 0.4299711585044861, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1652134507894516, "step": 929 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 434.3035888671875, "epoch": 0.02100840336134454, "grad_norm": 4.069848568140599, "kl": 0.052734375, "learning_rate": 9.989137393073148e-07, "loss": 0.0021, "reward": 1.893378734588623, "reward_std": 0.1325700730085373, "rewards/accuracy_reward": 0.5442436933517456, "rewards/format_reward": 0.8928571939468384, "rewards/semantic_reward": 0.2919921875, "step": 930 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 367.46429443359375, "epoch": 0.02103099304237824, "grad_norm": 3.292256585655075, "kl": 0.0673828125, "learning_rate": 9.989114003390026e-07, "loss": 0.0027, "reward": 2.2035202980041504, "reward_std": 0.06718352437019348, "rewards/accuracy_reward": 0.6727722883224487, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3378906548023224, "step": 931 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 359.2321472167969, "epoch": 0.021053582723411946, "grad_norm": 6.0386021412316815, "kl": 0.10595703125, "learning_rate": 9.989090588579746e-07, "loss": 0.0043, "reward": 1.6513737440109253, "reward_std": 0.10966678708791733, "rewards/accuracy_reward": 0.3974378705024719, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1682216227054596, "step": 932 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 380.6785888671875, "epoch": 0.021076172404445648, "grad_norm": 1.2741315549563117, "kl": 0.056884765625, "learning_rate": 9.989067148642426e-07, "loss": 0.0023, "reward": 1.365244746208191, "reward_std": 0.27044570446014404, "rewards/accuracy_reward": 0.24047140777111053, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.089059017598629, "step": 933 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 362.6250305175781, "epoch": 0.021098762085479353, "grad_norm": 2.1363850010419347, "kl": 0.05078125, "learning_rate": 9.989043683578187e-07, "loss": 0.002, "reward": 2.1571059226989746, "reward_std": 0.13695143163204193, "rewards/accuracy_reward": 0.7132861614227295, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2723912000656128, "step": 934 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 373.96429443359375, "epoch": 0.021121351766513058, "grad_norm": 15.738457019369651, "kl": 0.12255859375, "learning_rate": 9.989020193387142e-07, "loss": 0.0049, "reward": 1.603759527206421, "reward_std": 0.13527970016002655, "rewards/accuracy_reward": 0.2940636873245239, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1882673054933548, "step": 935 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 355.8035888671875, "epoch": 0.02114394144754676, "grad_norm": 3.1504133890211112, "kl": 0.055419921875, "learning_rate": 9.98899667806941e-07, "loss": 0.0022, "reward": 1.6117509603500366, "reward_std": 0.15666882693767548, "rewards/accuracy_reward": 0.35792478919029236, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1716831773519516, "step": 936 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.89288330078125, "epoch": 0.021166531128580465, "grad_norm": 2.4651066489961737, "kl": 0.05517578125, "learning_rate": 9.98897313762511e-07, "loss": 0.0022, "reward": 1.6863551139831543, "reward_std": 0.10715774446725845, "rewards/accuracy_reward": 0.29041123390197754, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2352295070886612, "step": 937 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 364.5714416503906, "epoch": 0.02118912080961417, "grad_norm": 2.336759329130274, "kl": 0.059814453125, "learning_rate": 9.988949572054365e-07, "loss": 0.0024, "reward": 1.7101929187774658, "reward_std": 0.1263713836669922, "rewards/accuracy_reward": 0.4040195643901825, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1847447007894516, "step": 938 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 385.0714416503906, "epoch": 0.02121171049064787, "grad_norm": 3.5507222340946694, "kl": 0.05810546875, "learning_rate": 9.988925981357291e-07, "loss": 0.0023, "reward": 1.9320828914642334, "reward_std": 0.12355168908834457, "rewards/accuracy_reward": 0.5367773175239563, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2417341023683548, "step": 939 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 373.1250305175781, "epoch": 0.021234300171681576, "grad_norm": 1.6810108105450587, "kl": 0.058349609375, "learning_rate": 9.988902365534001e-07, "loss": 0.0023, "reward": 1.5246633291244507, "reward_std": 0.32071420550346375, "rewards/accuracy_reward": 0.296454519033432, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1460658609867096, "step": 940 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 377.1071472167969, "epoch": 0.02125688985271528, "grad_norm": 2.634135156927849, "kl": 0.049072265625, "learning_rate": 9.988878724584624e-07, "loss": 0.002, "reward": 2.163524627685547, "reward_std": 0.15700702369213104, "rewards/accuracy_reward": 0.7155125141143799, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3015834391117096, "step": 941 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 374.89288330078125, "epoch": 0.021279479533748983, "grad_norm": 2.592657010207468, "kl": 0.052734375, "learning_rate": 9.98885505850927e-07, "loss": 0.0021, "reward": 1.7628685235977173, "reward_std": 0.24102425575256348, "rewards/accuracy_reward": 0.4213034212589264, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2022792398929596, "step": 942 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 395.1785888671875, "epoch": 0.021302069214782688, "grad_norm": 1.8519539530818812, "kl": 0.05126953125, "learning_rate": 9.988831367308064e-07, "loss": 0.002, "reward": 1.6896024942398071, "reward_std": 0.20776191353797913, "rewards/accuracy_reward": 0.40600866079330444, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2014508992433548, "step": 943 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 419.14288330078125, "epoch": 0.02132465889581639, "grad_norm": 2.5448212161952464, "kl": 0.0498046875, "learning_rate": 9.988807650981123e-07, "loss": 0.002, "reward": 1.5086849927902222, "reward_std": 0.32657188177108765, "rewards/accuracy_reward": 0.2955852150917053, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1595284640789032, "step": 944 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 368.33929443359375, "epoch": 0.021347248576850095, "grad_norm": 1.9925824563711674, "kl": 0.0556640625, "learning_rate": 9.988783909528568e-07, "loss": 0.0022, "reward": 1.8910166025161743, "reward_std": 0.046341411769390106, "rewards/accuracy_reward": 0.5254542827606201, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2012765109539032, "step": 945 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 357.9821472167969, "epoch": 0.0213698382578838, "grad_norm": 2.4892327976735182, "kl": 0.06591796875, "learning_rate": 9.988760142950515e-07, "loss": 0.0026, "reward": 2.076420545578003, "reward_std": 0.07126487046480179, "rewards/accuracy_reward": 0.5779131650924683, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3092215657234192, "step": 946 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 389.2857360839844, "epoch": 0.0213924279389175, "grad_norm": 2.0598139146510395, "kl": 0.052978515625, "learning_rate": 9.98873635124709e-07, "loss": 0.0021, "reward": 1.8245679140090942, "reward_std": 0.0651465505361557, "rewards/accuracy_reward": 0.4413926601409912, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2546038031578064, "step": 947 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.14285714285714285, "completion_length": 371.9464416503906, "epoch": 0.021415017619951206, "grad_norm": 1.915305811789709, "kl": 0.049560546875, "learning_rate": 9.988712534418406e-07, "loss": 0.002, "reward": 2.1220717430114746, "reward_std": 0.046287111937999725, "rewards/accuracy_reward": 0.6830351948738098, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.27117919921875, "step": 948 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 377.3035888671875, "epoch": 0.02143760730098491, "grad_norm": 2.8653708015340023, "kl": 0.0595703125, "learning_rate": 9.988688692464586e-07, "loss": 0.0024, "reward": 2.0048375129699707, "reward_std": 0.18885262310504913, "rewards/accuracy_reward": 0.6215643286705017, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2297014594078064, "step": 949 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 380.4285888671875, "epoch": 0.021460196982018613, "grad_norm": 2.4980013457709056, "kl": 0.058837890625, "learning_rate": 9.988664825385751e-07, "loss": 0.0023, "reward": 1.8365325927734375, "reward_std": 0.23632943630218506, "rewards/accuracy_reward": 0.43917617201805115, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2366420328617096, "step": 950 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 392.0357360839844, "epoch": 0.021482786663052318, "grad_norm": 1.7268965647110555, "kl": 0.06298828125, "learning_rate": 9.98864093318202e-07, "loss": 0.0025, "reward": 1.6051889657974243, "reward_std": 0.03204597160220146, "rewards/accuracy_reward": 0.30080482363700867, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1900983601808548, "step": 951 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 406.9464416503906, "epoch": 0.021505376344086023, "grad_norm": 2.7839830628760356, "kl": 0.054443359375, "learning_rate": 9.988617015853514e-07, "loss": 0.0022, "reward": 1.899165391921997, "reward_std": 0.22883982956409454, "rewards/accuracy_reward": 0.4654529094696045, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2872837781906128, "step": 952 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 391.1071472167969, "epoch": 0.021527966025119725, "grad_norm": 4.793327722442552, "kl": 0.0537109375, "learning_rate": 9.988593073400354e-07, "loss": 0.0021, "reward": 1.7157986164093018, "reward_std": 0.19744540750980377, "rewards/accuracy_reward": 0.414145290851593, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1945103257894516, "step": 953 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 413.71429443359375, "epoch": 0.02155055570615343, "grad_norm": 2.0732722958736494, "kl": 0.050048828125, "learning_rate": 9.988569105822657e-07, "loss": 0.002, "reward": 1.8345625400543213, "reward_std": 0.11266724765300751, "rewards/accuracy_reward": 0.5219334363937378, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2126290500164032, "step": 954 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 385.5714416503906, "epoch": 0.02157314538718713, "grad_norm": 1.7984227902726921, "kl": 0.051025390625, "learning_rate": 9.988545113120547e-07, "loss": 0.002, "reward": 1.9566564559936523, "reward_std": 0.24392127990722656, "rewards/accuracy_reward": 0.6055019497871399, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.219011589884758, "step": 955 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 370.26788330078125, "epoch": 0.021595735068220837, "grad_norm": 2.428673550207275, "kl": 0.05322265625, "learning_rate": 9.988521095294145e-07, "loss": 0.0021, "reward": 1.8096109628677368, "reward_std": 0.37718209624290466, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1608014851808548, "step": 956 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 388.9107360839844, "epoch": 0.02161832474925454, "grad_norm": 2.3139953667200923, "kl": 0.057373046875, "learning_rate": 9.988497052343571e-07, "loss": 0.0023, "reward": 1.576098084449768, "reward_std": 0.1333002895116806, "rewards/accuracy_reward": 0.2878882884979248, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1739240437746048, "step": 957 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 364.71429443359375, "epoch": 0.021640914430288243, "grad_norm": 2.7044780442683183, "kl": 0.056884765625, "learning_rate": 9.988472984268948e-07, "loss": 0.0023, "reward": 1.9861977100372314, "reward_std": 0.15006478130817413, "rewards/accuracy_reward": 0.5899015665054321, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2748674750328064, "step": 958 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 370.0000305175781, "epoch": 0.02166350411132195, "grad_norm": 1.7715701895649802, "kl": 0.06396484375, "learning_rate": 9.988448891070393e-07, "loss": 0.0026, "reward": 1.9531179666519165, "reward_std": 0.15426360070705414, "rewards/accuracy_reward": 0.5507811903953552, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2523368000984192, "step": 959 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 397.26788330078125, "epoch": 0.021686093792355653, "grad_norm": 1.6245303456574927, "kl": 0.050537109375, "learning_rate": 9.98842477274803e-07, "loss": 0.002, "reward": 2.017246961593628, "reward_std": 0.32567936182022095, "rewards/accuracy_reward": 0.6785714626312256, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2136753648519516, "step": 960 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 410.64288330078125, "epoch": 0.021708683473389355, "grad_norm": 6.49521192424721, "kl": 0.052978515625, "learning_rate": 9.988400629301982e-07, "loss": 0.0021, "reward": 1.8250948190689087, "reward_std": 0.1166166290640831, "rewards/accuracy_reward": 0.4789765477180481, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2354038953781128, "step": 961 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 378.96429443359375, "epoch": 0.02173127315442306, "grad_norm": 1.8880649356552917, "kl": 0.061767578125, "learning_rate": 9.988376460732366e-07, "loss": 0.0025, "reward": 1.6661429405212402, "reward_std": 0.11011110246181488, "rewards/accuracy_reward": 0.3602416217327118, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1916155219078064, "step": 962 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 372.3214416503906, "epoch": 0.021753862835456765, "grad_norm": 2.3604592168890006, "kl": 0.05224609375, "learning_rate": 9.98835226703931e-07, "loss": 0.0021, "reward": 1.7039803266525269, "reward_std": 0.4158533215522766, "rewards/accuracy_reward": 0.4108930826187134, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2002301961183548, "step": 963 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 361.4464416503906, "epoch": 0.021776452516490467, "grad_norm": 1.8453580735956263, "kl": 0.064453125, "learning_rate": 9.988328048222929e-07, "loss": 0.0026, "reward": 1.6534686088562012, "reward_std": 0.13668274879455566, "rewards/accuracy_reward": 0.37554579973220825, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1743512898683548, "step": 964 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 369.3035888671875, "epoch": 0.021799042197524172, "grad_norm": 30.354696136479134, "kl": 0.06689453125, "learning_rate": 9.98830380428335e-07, "loss": 0.0027, "reward": 1.8009215593338013, "reward_std": 0.08741776645183563, "rewards/accuracy_reward": 0.451392263174057, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2316720187664032, "step": 965 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 350.83929443359375, "epoch": 0.021821631878557873, "grad_norm": 1.762022926533456, "kl": 0.06201171875, "learning_rate": 9.988279535220691e-07, "loss": 0.0025, "reward": 1.8640772104263306, "reward_std": 0.08082599937915802, "rewards/accuracy_reward": 0.5301729440689087, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2160470187664032, "step": 966 }, { "all_correct": 0.0, "all_wrong": 0.14285714285714285, "completion_length": 394.1607360839844, "epoch": 0.02184422155959158, "grad_norm": 20.042517498761622, "kl": 191.0, "learning_rate": 9.98825524103508e-07, "loss": 7.6564, "reward": 1.5986558198928833, "reward_std": 0.3982800841331482, "rewards/accuracy_reward": 0.3236347734928131, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.2000209391117096, "step": 967 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 388.58929443359375, "epoch": 0.021866811240625283, "grad_norm": 21.000623273998574, "kl": 0.05712890625, "learning_rate": 9.988230921726634e-07, "loss": 0.0023, "reward": 1.973653793334961, "reward_std": 0.27722710371017456, "rewards/accuracy_reward": 0.6158270835876465, "rewards/format_reward": 0.9464285969734192, "rewards/semantic_reward": 0.2542550265789032, "step": 968 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 348.8035888671875, "epoch": 0.021889400921658985, "grad_norm": 1.6074556315329482, "kl": 0.06689453125, "learning_rate": 9.98820657729548e-07, "loss": 0.0027, "reward": 1.7254748344421387, "reward_std": 0.09093911945819855, "rewards/accuracy_reward": 0.5017669796943665, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.1487078070640564, "step": 969 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 365.21429443359375, "epoch": 0.02191199060269269, "grad_norm": 4.231416529110806, "kl": 0.06884765625, "learning_rate": 9.988182207741736e-07, "loss": 0.0027, "reward": 1.7726479768753052, "reward_std": 0.17599596083164215, "rewards/accuracy_reward": 0.3928208649158478, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2333984524011612, "step": 970 }, { "all_correct": 0.0, "all_wrong": 0.42857142857142855, "completion_length": 359.8214416503906, "epoch": 0.021934580283726395, "grad_norm": 4.872609360625465, "kl": 0.06640625, "learning_rate": 9.988157813065527e-07, "loss": 0.0026, "reward": 1.49320387840271, "reward_std": 0.02865764871239662, "rewards/accuracy_reward": 0.19528953731060028, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1836286336183548, "step": 971 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 367.3214416503906, "epoch": 0.021957169964760097, "grad_norm": 2.306236054163831, "kl": 0.057861328125, "learning_rate": 9.988133393266977e-07, "loss": 0.0023, "reward": 1.8795967102050781, "reward_std": 0.14311717450618744, "rewards/accuracy_reward": 0.4583912491798401, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2890625, "step": 972 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 356.21429443359375, "epoch": 0.021979759645793802, "grad_norm": 2.0867524962387636, "kl": 0.05859375, "learning_rate": 9.988108948346207e-07, "loss": 0.0023, "reward": 1.6175017356872559, "reward_std": 0.0930633395910263, "rewards/accuracy_reward": 0.350178062915802, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1673235297203064, "step": 973 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 355.0714416503906, "epoch": 0.022002349326827503, "grad_norm": 4.62324508262498, "kl": 0.062255859375, "learning_rate": 9.988084478303342e-07, "loss": 0.0025, "reward": 1.9228763580322266, "reward_std": 0.1356515884399414, "rewards/accuracy_reward": 0.5434991121292114, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2543770968914032, "step": 974 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.0, "completion_length": 371.9285888671875, "epoch": 0.02202493900786121, "grad_norm": 3.395872048313636, "kl": 0.0537109375, "learning_rate": 9.988059983138503e-07, "loss": 0.0022, "reward": 2.1431944370269775, "reward_std": 0.08086629211902618, "rewards/accuracy_reward": 0.6573406457901001, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.321568101644516, "step": 975 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 362.8750305175781, "epoch": 0.022047528688894914, "grad_norm": 2.0153662233228102, "kl": 0.05517578125, "learning_rate": 9.988035462851814e-07, "loss": 0.0022, "reward": 1.7124536037445068, "reward_std": 0.25215065479278564, "rewards/accuracy_reward": 0.43756523728370667, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1713169664144516, "step": 976 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 369.1964416503906, "epoch": 0.022070118369928615, "grad_norm": 2.6724571817021276, "kl": 0.056396484375, "learning_rate": 9.9880109174434e-07, "loss": 0.0023, "reward": 2.1068053245544434, "reward_std": 0.17666606605052948, "rewards/accuracy_reward": 0.6388260722160339, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3036935031414032, "step": 977 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.42857142857142855, "completion_length": 362.4464416503906, "epoch": 0.02209270805096232, "grad_norm": 1.348329827732216, "kl": 0.056396484375, "learning_rate": 9.987986346913383e-07, "loss": 0.0023, "reward": 1.6459546089172363, "reward_std": 0.13849379122257233, "rewards/accuracy_reward": 0.4016149640083313, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1621965765953064, "step": 978 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 367.5535888671875, "epoch": 0.022115297731996025, "grad_norm": 8.490879647780224, "kl": 0.059326171875, "learning_rate": 9.98796175126189e-07, "loss": 0.0024, "reward": 1.6699620485305786, "reward_std": 0.13332386314868927, "rewards/accuracy_reward": 0.3804670572280884, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1966378390789032, "step": 979 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.2857142857142857, "completion_length": 353.08929443359375, "epoch": 0.022137887413029727, "grad_norm": 1.5560843390666848, "kl": 0.06689453125, "learning_rate": 9.987937130489037e-07, "loss": 0.0027, "reward": 1.8675305843353271, "reward_std": 0.11116516590118408, "rewards/accuracy_reward": 0.5329707264900208, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2095598578453064, "step": 980 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 384.46429443359375, "epoch": 0.022160477094063432, "grad_norm": 2.615338154459311, "kl": 0.064453125, "learning_rate": 9.987912484594956e-07, "loss": 0.0026, "reward": 1.7379881143569946, "reward_std": 0.11699634790420532, "rewards/accuracy_reward": 0.4191473722457886, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1866978257894516, "step": 981 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.0, "completion_length": 406.26788330078125, "epoch": 0.022183066775097137, "grad_norm": 1.7243584555696676, "kl": 0.06103515625, "learning_rate": 9.98788781357977e-07, "loss": 0.0024, "reward": 2.1217687129974365, "reward_std": 0.24269244074821472, "rewards/accuracy_reward": 0.634777843952179, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3262765109539032, "step": 982 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 390.1250305175781, "epoch": 0.02220565645613084, "grad_norm": 3.5061586323556795, "kl": 0.083984375, "learning_rate": 9.987863117443597e-07, "loss": 0.0034, "reward": 1.8095641136169434, "reward_std": 0.173909530043602, "rewards/accuracy_reward": 0.5011796355247498, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2190987765789032, "step": 983 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 421.5535888671875, "epoch": 0.022228246137164544, "grad_norm": 4.8850772341016775, "kl": 0.078125, "learning_rate": 9.98783839618657e-07, "loss": 0.0031, "reward": 2.020840883255005, "reward_std": 0.1288142204284668, "rewards/accuracy_reward": 0.6335641145706177, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2444196492433548, "step": 984 }, { "all_correct": 0.5714285714285714, "all_wrong": 0.14285714285714285, "completion_length": 402.7321472167969, "epoch": 0.022250835818198245, "grad_norm": 24.584023652964028, "kl": 0.08349609375, "learning_rate": 9.98781364980881e-07, "loss": 0.0033, "reward": 2.1059749126434326, "reward_std": 0.07977257668972015, "rewards/accuracy_reward": 0.6727683544158936, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.286778062582016, "step": 985 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.42857142857142855, "completion_length": 432.6250305175781, "epoch": 0.02227342549923195, "grad_norm": 2.314901762928895, "kl": 0.087890625, "learning_rate": 9.987788878310438e-07, "loss": 0.0035, "reward": 1.5603269338607788, "reward_std": 0.06052180007100105, "rewards/accuracy_reward": 0.32653769850730896, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1516462117433548, "step": 986 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.0, "completion_length": 419.08929443359375, "epoch": 0.022296015180265655, "grad_norm": 1.8320267135396764, "kl": 0.07958984375, "learning_rate": 9.987764081691582e-07, "loss": 0.0032, "reward": 2.0108320713043213, "reward_std": 0.1520545780658722, "rewards/accuracy_reward": 0.5658997297286987, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.269932359457016, "step": 987 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.14285714285714285, "completion_length": 424.4821472167969, "epoch": 0.022318604861299357, "grad_norm": 1.4757231160979665, "kl": 0.057373046875, "learning_rate": 9.987739259952369e-07, "loss": 0.0023, "reward": 1.8583463430404663, "reward_std": 0.3442102074623108, "rewards/accuracy_reward": 0.5334091186523438, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2213658094406128, "step": 988 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 373.39288330078125, "epoch": 0.022341194542333062, "grad_norm": 4.875345767110855, "kl": 0.08544921875, "learning_rate": 9.98771441309292e-07, "loss": 0.0034, "reward": 1.7921574115753174, "reward_std": 0.2239125519990921, "rewards/accuracy_reward": 0.4544235169887543, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2305908352136612, "step": 989 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.2857142857142857, "completion_length": 405.3571472167969, "epoch": 0.022363784223366767, "grad_norm": 2.2630930139272682, "kl": 0.07373046875, "learning_rate": 9.987689541113362e-07, "loss": 0.003, "reward": 1.7882509231567383, "reward_std": 0.15155267715454102, "rewards/accuracy_reward": 0.4640076756477356, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2063860297203064, "step": 990 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 373.4464416503906, "epoch": 0.02238637390440047, "grad_norm": 2.610736489138499, "kl": 0.06689453125, "learning_rate": 9.987664644013822e-07, "loss": 0.0027, "reward": 1.7985535860061646, "reward_std": 0.3837663531303406, "rewards/accuracy_reward": 0.4795314371585846, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2118791937828064, "step": 991 }, { "all_correct": 0.2857142857142857, "all_wrong": 0.42857142857142855, "completion_length": 397.5535888671875, "epoch": 0.022408963585434174, "grad_norm": 1.3887579597192081, "kl": 0.061279296875, "learning_rate": 9.987639721794423e-07, "loss": 0.0025, "reward": 1.613452434539795, "reward_std": 0.14843927323818207, "rewards/accuracy_reward": 0.3942553699016571, "rewards/format_reward": 0.9642857313156128, "rewards/semantic_reward": 0.1656254380941391, "step": 992 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.0, "completion_length": 357.5000305175781, "epoch": 0.02243155326646788, "grad_norm": 2.218279093526125, "kl": 0.06689453125, "learning_rate": 9.98761477445529e-07, "loss": 0.0027, "reward": 2.1474616527557373, "reward_std": 0.2035028338432312, "rewards/accuracy_reward": 0.6821052432060242, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3117850422859192, "step": 993 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 395.51788330078125, "epoch": 0.02245414294750158, "grad_norm": 9.437721964688272, "kl": 0.0546875, "learning_rate": 9.98758980199655e-07, "loss": 0.0022, "reward": 1.6890603303909302, "reward_std": 0.27020326256752014, "rewards/accuracy_reward": 0.40555354952812195, "rewards/format_reward": 0.9821429252624512, "rewards/semantic_reward": 0.2013637125492096, "step": 994 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.14285714285714285, "completion_length": 367.6250305175781, "epoch": 0.022476732628535286, "grad_norm": 2.909672783671161, "kl": 0.04931640625, "learning_rate": 9.987564804418329e-07, "loss": 0.002, "reward": 1.6529430150985718, "reward_std": 0.40555086731910706, "rewards/accuracy_reward": 0.36600789427757263, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.197649285197258, "step": 995 }, { "all_correct": 0.14285714285714285, "all_wrong": 0.2857142857142857, "completion_length": 352.0357360839844, "epoch": 0.022499322309568987, "grad_norm": 2.0398968278970826, "kl": 0.060302734375, "learning_rate": 9.987539781720752e-07, "loss": 0.0024, "reward": 1.766467809677124, "reward_std": 0.19631309807300568, "rewards/accuracy_reward": 0.46670833230018616, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1997593492269516, "step": 996 }, { "all_correct": 0.0, "all_wrong": 0.2857142857142857, "completion_length": 372.3035888671875, "epoch": 0.022521911990602692, "grad_norm": 7.86645283479572, "kl": 0.057861328125, "learning_rate": 9.987514733903946e-07, "loss": 0.0023, "reward": 1.4864963293075562, "reward_std": 0.29883378744125366, "rewards/accuracy_reward": 0.27813276648521423, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.126220703125, "step": 997 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 371.4464416503906, "epoch": 0.022544501671636397, "grad_norm": 46.450313007747816, "kl": 0.051513671875, "learning_rate": 9.987489660968034e-07, "loss": 0.0021, "reward": 1.910413146018982, "reward_std": 0.2166278064250946, "rewards/accuracy_reward": 0.5546792149543762, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.216448113322258, "step": 998 }, { "all_correct": 0.42857142857142855, "all_wrong": 0.14285714285714285, "completion_length": 357.5357360839844, "epoch": 0.0225670913526701, "grad_norm": 2.0582538760994966, "kl": 0.05322265625, "learning_rate": 9.987464562913148e-07, "loss": 0.0021, "reward": 1.801766037940979, "reward_std": 0.17343232035636902, "rewards/accuracy_reward": 0.4861584007740021, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2120361477136612, "step": 999 }, { "all_correct": 0.7142857142857143, "all_wrong": 0.0, "completion_length": 361.4464416503906, "epoch": 0.022589681033703804, "grad_norm": 3.994710369573862, "kl": 0.055908203125, "learning_rate": 9.987439439739408e-07, "loss": 0.0022, "reward": 2.351353883743286, "reward_std": 0.1544560045003891, "rewards/accuracy_reward": 0.8616077899932861, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3290318250656128, "step": 1000 } ], "logging_steps": 1.0, "max_steps": 44268, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }