{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 505.9479351043701, "epoch": 0.0010666666666666667, "grad_norm": 0.1341146091763646, "kl": 0.0, "learning_rate": 7.142857142857142e-08, "loss": -0.0, "reward": 0.3046875079162419, "reward_std": 0.4547263579443097, "rewards/equation_reward_func": 0.03645833441987634, "rewards/format_reward_func": 0.26822917466051877, "step": 2 }, { "completion_length": 511.1562671661377, "epoch": 0.0021333333333333334, "grad_norm": 0.11909890519385766, "kl": 0.0004194974899291992, "learning_rate": 1.4285714285714285e-07, "loss": 0.0, "reward": 0.3098958386108279, "reward_std": 0.4707766156643629, "rewards/equation_reward_func": 0.0494791679084301, "rewards/format_reward_func": 0.26041667349636555, "step": 4 }, { "completion_length": 484.7057456970215, "epoch": 0.0032, "grad_norm": 0.10838779091076944, "kl": 0.0003948211669921875, "learning_rate": 2.1428571428571426e-07, "loss": 0.0, "reward": 0.25520834187045693, "reward_std": 0.4062541304156184, "rewards/equation_reward_func": 0.04427083418704569, "rewards/format_reward_func": 0.2109375053551048, "step": 6 }, { "completion_length": 502.9635524749756, "epoch": 0.004266666666666667, "grad_norm": 0.11359857035268194, "kl": 0.00040793418884277344, "learning_rate": 2.857142857142857e-07, "loss": 0.0, "reward": 0.3229166753590107, "reward_std": 0.4702935107052326, "rewards/equation_reward_func": 0.07291666860692203, "rewards/format_reward_func": 0.250000006519258, "step": 8 }, { "completion_length": 472.85418128967285, "epoch": 0.005333333333333333, "grad_norm": 0.13319802291837166, "kl": 0.00041985511779785156, "learning_rate": 3.5714285714285716e-07, "loss": 0.0, "reward": 0.3359375074505806, "reward_std": 0.47765984758734703, "rewards/equation_reward_func": 0.052083334885537624, "rewards/format_reward_func": 0.28385417396202683, "step": 10 }, { "completion_length": 474.9609489440918, "epoch": 0.0064, "grad_norm": 0.1283866658242959, "kl": 0.0004889965057373047, "learning_rate": 4.285714285714285e-07, "loss": 0.0, "reward": 0.40625000512227416, "reward_std": 0.5288777491077781, "rewards/equation_reward_func": 0.0703125016298145, "rewards/format_reward_func": 0.33593750884756446, "step": 12 }, { "completion_length": 487.7526226043701, "epoch": 0.007466666666666667, "grad_norm": 0.11491878824082066, "kl": 0.0008172988891601562, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.3984375139698386, "reward_std": 0.4919305704534054, "rewards/equation_reward_func": 0.05208333395421505, "rewards/format_reward_func": 0.3463541781529784, "step": 14 }, { "completion_length": 468.70313835144043, "epoch": 0.008533333333333334, "grad_norm": 0.12246266971775394, "kl": 0.0011203289031982422, "learning_rate": 4.999740409224932e-07, "loss": 0.0, "reward": 0.5494791846722364, "reward_std": 0.5318632125854492, "rewards/equation_reward_func": 0.08854167023673654, "rewards/format_reward_func": 0.46093751303851604, "step": 16 }, { "completion_length": 454.82292556762695, "epoch": 0.0096, "grad_norm": 0.10480668657811888, "kl": 0.00298309326171875, "learning_rate": 4.998961690809627e-07, "loss": 0.0, "reward": 0.6796875204890966, "reward_std": 0.5534657873213291, "rewards/equation_reward_func": 0.06770833465270698, "rewards/format_reward_func": 0.611979192122817, "step": 18 }, { "completion_length": 453.3411560058594, "epoch": 0.010666666666666666, "grad_norm": 0.11208435254258003, "kl": 0.005069732666015625, "learning_rate": 4.997664006472578e-07, "loss": 0.0, "reward": 0.7500000186264515, "reward_std": 0.5607063695788383, "rewards/equation_reward_func": 0.0859375016298145, "rewards/format_reward_func": 0.6640625167638063, "step": 20 }, { "completion_length": 450.89063262939453, "epoch": 0.011733333333333333, "grad_norm": 0.10552116383248636, "kl": 0.005932807922363281, "learning_rate": 4.995847625707292e-07, "loss": 0.0, "reward": 0.8593750149011612, "reward_std": 0.5123661290854216, "rewards/equation_reward_func": 0.09895833511836827, "rewards/format_reward_func": 0.7604166939854622, "step": 22 }, { "completion_length": 449.1198043823242, "epoch": 0.0128, "grad_norm": 0.10482422281110657, "kl": 0.007808685302734375, "learning_rate": 4.993512925726318e-07, "loss": 0.0, "reward": 0.8958333544433117, "reward_std": 0.44584160670638084, "rewards/equation_reward_func": 0.07812500302679837, "rewards/format_reward_func": 0.8177083507180214, "step": 24 }, { "completion_length": 437.75782012939453, "epoch": 0.013866666666666666, "grad_norm": 0.08078578907154227, "kl": 0.0073490142822265625, "learning_rate": 4.990660391382923e-07, "loss": 0.0, "reward": 0.9505208656191826, "reward_std": 0.40651129884645343, "rewards/equation_reward_func": 0.09895833604969084, "rewards/format_reward_func": 0.8515625186264515, "step": 26 }, { "completion_length": 441.77345275878906, "epoch": 0.014933333333333333, "grad_norm": 0.08279347003242855, "kl": 0.00899505615234375, "learning_rate": 4.987290615070384e-07, "loss": 0.0, "reward": 0.9583333544433117, "reward_std": 0.31725937221199274, "rewards/equation_reward_func": 0.06510416814126074, "rewards/format_reward_func": 0.893229179084301, "step": 28 }, { "completion_length": 438.5364685058594, "epoch": 0.016, "grad_norm": 0.07534793657846317, "kl": 0.01221466064453125, "learning_rate": 4.983404296598978e-07, "loss": 0.0, "reward": 1.0104166828095913, "reward_std": 0.286367348395288, "rewards/equation_reward_func": 0.08072916860692203, "rewards/format_reward_func": 0.9296875186264515, "step": 30 }, { "completion_length": 446.8125114440918, "epoch": 0.017066666666666667, "grad_norm": 0.07968622664869553, "kl": 0.011959075927734375, "learning_rate": 4.979002243050646e-07, "loss": 0.0, "reward": 1.0026041977107525, "reward_std": 0.344503759406507, "rewards/equation_reward_func": 0.09635416860692203, "rewards/format_reward_func": 0.9062500223517418, "step": 32 }, { "completion_length": 427.65886306762695, "epoch": 0.018133333333333335, "grad_norm": 0.08471832672268678, "kl": 0.027118682861328125, "learning_rate": 4.974085368611381e-07, "loss": 0.0, "reward": 1.0442708693444729, "reward_std": 0.2840048740617931, "rewards/equation_reward_func": 0.1015625037252903, "rewards/format_reward_func": 0.9427083544433117, "step": 34 }, { "completion_length": 444.25261878967285, "epoch": 0.0192, "grad_norm": 0.06852883069586094, "kl": 0.01403045654296875, "learning_rate": 4.968654694381379e-07, "loss": 0.0, "reward": 0.9921875335276127, "reward_std": 0.21781930467113853, "rewards/equation_reward_func": 0.05208333511836827, "rewards/format_reward_func": 0.9401041865348816, "step": 36 }, { "completion_length": 383.59896659851074, "epoch": 0.020266666666666665, "grad_norm": 0.08057979196934888, "kl": 0.0169677734375, "learning_rate": 4.962711348162987e-07, "loss": 0.0, "reward": 1.1223958656191826, "reward_std": 0.2702699927613139, "rewards/equation_reward_func": 0.14062500395812094, "rewards/format_reward_func": 0.9817708507180214, "step": 38 }, { "completion_length": 411.5677185058594, "epoch": 0.021333333333333333, "grad_norm": 0.07850698291787955, "kl": 0.017246246337890625, "learning_rate": 4.956256564226487e-07, "loss": 0.0, "reward": 1.0989583656191826, "reward_std": 0.2887088777497411, "rewards/equation_reward_func": 0.13281250279396772, "rewards/format_reward_func": 0.9661458507180214, "step": 40 }, { "completion_length": 396.64584159851074, "epoch": 0.0224, "grad_norm": 0.07505103817031399, "kl": 0.017597198486328125, "learning_rate": 4.949291683053768e-07, "loss": 0.0, "reward": 1.0807291977107525, "reward_std": 0.2670950279571116, "rewards/equation_reward_func": 0.11458333698101342, "rewards/format_reward_func": 0.9661458507180214, "step": 42 }, { "completion_length": 387.0468864440918, "epoch": 0.023466666666666667, "grad_norm": 0.09175240895759779, "kl": 0.017871856689453125, "learning_rate": 4.941818151059955e-07, "loss": 0.0, "reward": 1.1015625335276127, "reward_std": 0.2870901683345437, "rewards/equation_reward_func": 0.1354166711680591, "rewards/format_reward_func": 0.9661458507180214, "step": 44 }, { "completion_length": 400.3645935058594, "epoch": 0.024533333333333334, "grad_norm": 0.09491357639118295, "kl": 0.019161224365234375, "learning_rate": 4.933837520293017e-07, "loss": 0.0, "reward": 1.070312537252903, "reward_std": 0.2785795754753053, "rewards/equation_reward_func": 0.10937500442378223, "rewards/format_reward_func": 0.9609375260770321, "step": 46 }, { "completion_length": 403.8671989440918, "epoch": 0.0256, "grad_norm": 0.08449768835766272, "kl": 0.01947021484375, "learning_rate": 4.925351448111454e-07, "loss": 0.0, "reward": 1.0598958767950535, "reward_std": 0.1955897193402052, "rewards/equation_reward_func": 0.08333333604969084, "rewards/format_reward_func": 0.9765625149011612, "step": 48 }, { "completion_length": 384.1770896911621, "epoch": 0.02666666666666667, "grad_norm": 0.09879170444522951, "kl": 0.02040863037109375, "learning_rate": 4.91636169684011e-07, "loss": 0.0, "reward": 1.1223958730697632, "reward_std": 0.31093722581863403, "rewards/equation_reward_func": 0.1406250037252903, "rewards/format_reward_func": 0.9817708432674408, "step": 50 }, { "completion_length": 391.669282913208, "epoch": 0.027733333333333332, "grad_norm": 0.10757568231914379, "kl": 0.0244903564453125, "learning_rate": 4.906870133404186e-07, "loss": 0.0, "reward": 1.1197916977107525, "reward_std": 0.3494974756613374, "rewards/equation_reward_func": 0.15885417140088975, "rewards/format_reward_func": 0.9609375186264515, "step": 52 }, { "completion_length": 387.16407012939453, "epoch": 0.0288, "grad_norm": 0.0916962283697697, "kl": 0.02394866943359375, "learning_rate": 4.896878728941531e-07, "loss": 0.0, "reward": 1.1067708656191826, "reward_std": 0.25607615802437067, "rewards/equation_reward_func": 0.1328125020954758, "rewards/format_reward_func": 0.9739583544433117, "step": 54 }, { "completion_length": 346.4114646911621, "epoch": 0.029866666666666666, "grad_norm": 0.09993350369732659, "kl": 0.0276031494140625, "learning_rate": 4.886389558393284e-07, "loss": 0.0, "reward": 1.1510416939854622, "reward_std": 0.2859157114289701, "rewards/equation_reward_func": 0.16145833767950535, "rewards/format_reward_func": 0.9895833432674408, "step": 56 }, { "completion_length": 361.99219512939453, "epoch": 0.030933333333333334, "grad_norm": 0.11653485215024455, "kl": 0.02984619140625, "learning_rate": 4.875404800072976e-07, "loss": 0.0, "reward": 1.1640625447034836, "reward_std": 0.3471745736896992, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.9765625149011612, "step": 58 }, { "completion_length": 367.1015739440918, "epoch": 0.032, "grad_norm": 0.07180913754511904, "kl": 0.03044891357421875, "learning_rate": 4.86392673521415e-07, "loss": 0.0, "reward": 1.0911458805203438, "reward_std": 0.1999878236092627, "rewards/equation_reward_func": 0.10416666930541396, "rewards/format_reward_func": 0.9869791753590107, "step": 60 }, { "completion_length": 366.5208435058594, "epoch": 0.03306666666666667, "grad_norm": 0.08088172620555445, "kl": 0.0330810546875, "learning_rate": 4.851957747496606e-07, "loss": 0.0, "reward": 1.1510416939854622, "reward_std": 0.28296295227482915, "rewards/equation_reward_func": 0.16927083488553762, "rewards/format_reward_func": 0.9817708469927311, "step": 62 }, { "completion_length": 357.73178482055664, "epoch": 0.034133333333333335, "grad_norm": 0.0844167380266008, "kl": 0.03631591796875, "learning_rate": 4.839500322551386e-07, "loss": 0.0, "reward": 1.1197916939854622, "reward_std": 0.2452517431229353, "rewards/equation_reward_func": 0.14843750186264515, "rewards/format_reward_func": 0.9713541753590107, "step": 64 }, { "completion_length": 353.9739685058594, "epoch": 0.0352, "grad_norm": 0.0778527671209511, "kl": 0.041229248046875, "learning_rate": 4.826557047444563e-07, "loss": 0.0, "reward": 1.1796875298023224, "reward_std": 0.30663188826292753, "rewards/equation_reward_func": 0.19791667279787362, "rewards/format_reward_func": 0.9817708469927311, "step": 66 }, { "completion_length": 348.2239685058594, "epoch": 0.03626666666666667, "grad_norm": 0.07408528500512421, "kl": 0.044708251953125, "learning_rate": 4.813130610139993e-07, "loss": 0.0, "reward": 1.0729167014360428, "reward_std": 0.17930190591141582, "rewards/equation_reward_func": 0.0885416695382446, "rewards/format_reward_func": 0.9843750149011612, "step": 68 }, { "completion_length": 318.35938835144043, "epoch": 0.037333333333333336, "grad_norm": 0.10471668022395769, "kl": 0.0505828857421875, "learning_rate": 4.799223798941089e-07, "loss": 0.0001, "reward": 1.187500037252903, "reward_std": 0.2974981819279492, "rewards/equation_reward_func": 0.2031250053551048, "rewards/format_reward_func": 0.9843750111758709, "step": 70 }, { "completion_length": 312.2213659286499, "epoch": 0.0384, "grad_norm": 0.08445574387607607, "kl": 0.058990478515625, "learning_rate": 4.78483950191177e-07, "loss": 0.0001, "reward": 1.1562500298023224, "reward_std": 0.23554043704643846, "rewards/equation_reward_func": 0.17187500651925802, "rewards/format_reward_func": 0.9843750111758709, "step": 72 }, { "completion_length": 320.13542556762695, "epoch": 0.039466666666666664, "grad_norm": 0.10154941280104149, "kl": 0.0615997314453125, "learning_rate": 4.769980706276687e-07, "loss": 0.0001, "reward": 1.1770833730697632, "reward_std": 0.26962050748988986, "rewards/equation_reward_func": 0.19270834000781178, "rewards/format_reward_func": 0.9843750111758709, "step": 74 }, { "completion_length": 334.70052909851074, "epoch": 0.04053333333333333, "grad_norm": 0.08509345877302323, "kl": 0.061676025390625, "learning_rate": 4.7546504978008595e-07, "loss": 0.0001, "reward": 1.1458333730697632, "reward_std": 0.20033816620707512, "rewards/equation_reward_func": 0.15885417233221233, "rewards/format_reward_func": 0.986979179084301, "step": 76 }, { "completion_length": 333.23438453674316, "epoch": 0.0416, "grad_norm": 0.10027144175078107, "kl": 0.065399169921875, "learning_rate": 4.738852060148848e-07, "loss": 0.0001, "reward": 1.1171875447034836, "reward_std": 0.23261011950671673, "rewards/equation_reward_func": 0.13541667070239782, "rewards/format_reward_func": 0.9817708507180214, "step": 78 }, { "completion_length": 331.69011878967285, "epoch": 0.042666666666666665, "grad_norm": 0.07507534432076213, "kl": 0.071014404296875, "learning_rate": 4.722588674223593e-07, "loss": 0.0001, "reward": 1.1276042014360428, "reward_std": 0.2506814347580075, "rewards/equation_reward_func": 0.14843750442378223, "rewards/format_reward_func": 0.9791666828095913, "step": 80 }, { "completion_length": 344.0781364440918, "epoch": 0.04373333333333333, "grad_norm": 0.09863254302808237, "kl": 0.070526123046875, "learning_rate": 4.70586371748506e-07, "loss": 0.0001, "reward": 1.2031250447034836, "reward_std": 0.2764001186005771, "rewards/equation_reward_func": 0.2109375074505806, "rewards/format_reward_func": 0.9921875074505806, "step": 82 }, { "completion_length": 330.4479274749756, "epoch": 0.0448, "grad_norm": 0.10155910053999813, "kl": 0.07550048828125, "learning_rate": 4.6886806632488363e-07, "loss": 0.0001, "reward": 1.2708333730697632, "reward_std": 0.3232872476801276, "rewards/equation_reward_func": 0.28906251140870154, "rewards/format_reward_func": 0.9817708469927311, "step": 84 }, { "completion_length": 341.1197986602783, "epoch": 0.04586666666666667, "grad_norm": 0.09455703883061281, "kl": 0.07513427734375, "learning_rate": 4.6710430799648143e-07, "loss": 0.0001, "reward": 1.1953125298023224, "reward_std": 0.3194303079508245, "rewards/equation_reward_func": 0.2291666753590107, "rewards/format_reward_func": 0.9661458507180214, "step": 86 }, { "completion_length": 341.54427909851074, "epoch": 0.046933333333333334, "grad_norm": 0.08290471243926564, "kl": 0.077423095703125, "learning_rate": 4.652954630476127e-07, "loss": 0.0001, "reward": 1.1979167014360428, "reward_std": 0.2291324818506837, "rewards/equation_reward_func": 0.2239583395421505, "rewards/format_reward_func": 0.9739583469927311, "step": 88 }, { "completion_length": 335.0989627838135, "epoch": 0.048, "grad_norm": 0.10748566516697469, "kl": 0.088287353515625, "learning_rate": 4.6344190712584713e-07, "loss": 0.0001, "reward": 1.1692708730697632, "reward_std": 0.3015799345448613, "rewards/equation_reward_func": 0.1979166700039059, "rewards/format_reward_func": 0.9713541902601719, "step": 90 }, { "completion_length": 314.3177185058594, "epoch": 0.04906666666666667, "grad_norm": 0.10758855837243832, "kl": 0.08538818359375, "learning_rate": 4.615440251639995e-07, "loss": 0.0001, "reward": 1.3151041977107525, "reward_std": 0.3682410903275013, "rewards/equation_reward_func": 0.33593750838190317, "rewards/format_reward_func": 0.9791666753590107, "step": 92 }, { "completion_length": 313.01302909851074, "epoch": 0.050133333333333335, "grad_norm": 0.124066638172858, "kl": 0.0859375, "learning_rate": 4.596022113001894e-07, "loss": 0.0001, "reward": 1.276041705161333, "reward_std": 0.30914933141320944, "rewards/equation_reward_func": 0.29427084303461015, "rewards/format_reward_func": 0.9817708432674408, "step": 94 }, { "completion_length": 334.85417556762695, "epoch": 0.0512, "grad_norm": 0.1018803932324317, "kl": 0.088104248046875, "learning_rate": 4.576168687959895e-07, "loss": 0.0001, "reward": 1.2135417014360428, "reward_std": 0.2573512555100024, "rewards/equation_reward_func": 0.2395833416376263, "rewards/format_reward_func": 0.9739583469927311, "step": 96 }, { "completion_length": 351.9817810058594, "epoch": 0.05226666666666667, "grad_norm": 0.10509374857128695, "kl": 0.098785400390625, "learning_rate": 4.555884099526793e-07, "loss": 0.0001, "reward": 1.250000037252903, "reward_std": 0.29483586829155684, "rewards/equation_reward_func": 0.27083334093913436, "rewards/format_reward_func": 0.9791666753590107, "step": 98 }, { "completion_length": 358.38021755218506, "epoch": 0.05333333333333334, "grad_norm": 0.0978516383302316, "kl": 0.08575439453125, "learning_rate": 4.5351725602562174e-07, "loss": 0.0001, "reward": 1.2942708656191826, "reward_std": 0.32903878297656775, "rewards/equation_reward_func": 0.3203125046566129, "rewards/format_reward_func": 0.9739583432674408, "step": 100 }, { "completion_length": 361.90365409851074, "epoch": 0.0544, "grad_norm": 0.07892841773395727, "kl": 0.092681884765625, "learning_rate": 4.514038371367791e-07, "loss": 0.0001, "reward": 1.2838542014360428, "reward_std": 0.23603887297213078, "rewards/equation_reward_func": 0.2942708432674408, "rewards/format_reward_func": 0.9895833395421505, "step": 102 }, { "completion_length": 370.2447986602783, "epoch": 0.055466666666666664, "grad_norm": 0.07956969957231312, "kl": 0.088226318359375, "learning_rate": 4.4924859218538936e-07, "loss": 0.0001, "reward": 1.2682292088866234, "reward_std": 0.2611841419711709, "rewards/equation_reward_func": 0.2838541760575026, "rewards/format_reward_func": 0.9843750074505806, "step": 104 }, { "completion_length": 405.403657913208, "epoch": 0.05653333333333333, "grad_norm": 0.10207984517578009, "kl": 0.0877227783203125, "learning_rate": 4.470519687568185e-07, "loss": 0.0001, "reward": 1.2786458618938923, "reward_std": 0.27022232208400965, "rewards/equation_reward_func": 0.31250000838190317, "rewards/format_reward_func": 0.9661458507180214, "step": 106 }, { "completion_length": 392.93490982055664, "epoch": 0.0576, "grad_norm": 0.08438917528245744, "kl": 0.0877685546875, "learning_rate": 4.4481442302960923e-07, "loss": 0.0001, "reward": 1.3072917014360428, "reward_std": 0.31525306357070804, "rewards/equation_reward_func": 0.34375000838190317, "rewards/format_reward_func": 0.963541679084301, "step": 108 }, { "completion_length": 399.8698024749756, "epoch": 0.058666666666666666, "grad_norm": 0.08270590545214734, "kl": 0.09637451171875, "learning_rate": 4.4253641968074505e-07, "loss": 0.0001, "reward": 1.268229190260172, "reward_std": 0.24568770825862885, "rewards/equation_reward_func": 0.3046875062864274, "rewards/format_reward_func": 0.9635416828095913, "step": 110 }, { "completion_length": 409.60417556762695, "epoch": 0.05973333333333333, "grad_norm": 0.10271913225077348, "kl": 0.0924072265625, "learning_rate": 4.402184317891501e-07, "loss": 0.0001, "reward": 1.2812500335276127, "reward_std": 0.33530174382030964, "rewards/equation_reward_func": 0.3385416748933494, "rewards/format_reward_func": 0.9427083544433117, "step": 112 }, { "completion_length": 416.4088649749756, "epoch": 0.0608, "grad_norm": 0.08166810576477633, "kl": 0.095794677734375, "learning_rate": 4.37860940737443e-07, "loss": 0.0001, "reward": 1.1770833805203438, "reward_std": 0.26351519441232085, "rewards/equation_reward_func": 0.22395834187045693, "rewards/format_reward_func": 0.9531250149011612, "step": 114 }, { "completion_length": 390.3463611602783, "epoch": 0.06186666666666667, "grad_norm": 0.09414353563065953, "kl": 0.11090087890625, "learning_rate": 4.354644361119671e-07, "loss": 0.0001, "reward": 1.398437537252903, "reward_std": 0.30470984475687146, "rewards/equation_reward_func": 0.42187501583248377, "rewards/format_reward_func": 0.9765625149011612, "step": 116 }, { "completion_length": 378.01563358306885, "epoch": 0.06293333333333333, "grad_norm": 0.07635029320541607, "kl": 0.124725341796875, "learning_rate": 4.3302941560111716e-07, "loss": 0.0001, "reward": 1.3958333730697632, "reward_std": 0.36394598754122853, "rewards/equation_reward_func": 0.4166666765231639, "rewards/format_reward_func": 0.9791666865348816, "step": 118 }, { "completion_length": 392.65625953674316, "epoch": 0.064, "grad_norm": 0.0833024147650861, "kl": 0.1026611328125, "learning_rate": 4.3055638489198236e-07, "loss": 0.0001, "reward": 1.3359375298023224, "reward_std": 0.37286510691046715, "rewards/equation_reward_func": 0.3906250102445483, "rewards/format_reward_func": 0.9453125186264515, "step": 120 }, { "completion_length": 399.5078182220459, "epoch": 0.06506666666666666, "grad_norm": 0.0892199212165042, "kl": 0.1014404296875, "learning_rate": 4.280458575653296e-07, "loss": 0.0001, "reward": 1.3307292088866234, "reward_std": 0.3504871279001236, "rewards/equation_reward_func": 0.38802084513008595, "rewards/format_reward_func": 0.9427083507180214, "step": 122 }, { "completion_length": 450.1354331970215, "epoch": 0.06613333333333334, "grad_norm": 0.06581923430481687, "kl": 0.114990234375, "learning_rate": 4.2549835498894665e-07, "loss": 0.0001, "reward": 1.2604166939854622, "reward_std": 0.3068140549585223, "rewards/equation_reward_func": 0.32552084559574723, "rewards/format_reward_func": 0.9348958544433117, "step": 124 }, { "completion_length": 390.036470413208, "epoch": 0.0672, "grad_norm": 0.07114986931726634, "kl": 0.10528564453125, "learning_rate": 4.229144062093679e-07, "loss": 0.0001, "reward": 1.3723958730697632, "reward_std": 0.29870040342211723, "rewards/equation_reward_func": 0.39843751303851604, "rewards/format_reward_func": 0.9739583469927311, "step": 126 }, { "completion_length": 392.59115505218506, "epoch": 0.06826666666666667, "grad_norm": 0.0877107079994648, "kl": 0.109405517578125, "learning_rate": 4.2029454784200675e-07, "loss": 0.0001, "reward": 1.390625037252903, "reward_std": 0.280646042432636, "rewards/equation_reward_func": 0.42447917722165585, "rewards/format_reward_func": 0.9661458507180214, "step": 128 }, { "completion_length": 421.0078191757202, "epoch": 0.06933333333333333, "grad_norm": 0.09643905280459295, "kl": 0.10009765625, "learning_rate": 4.1763932395971433e-07, "loss": 0.0001, "reward": 1.2942708693444729, "reward_std": 0.3986189612187445, "rewards/equation_reward_func": 0.3567708421032876, "rewards/format_reward_func": 0.9375000223517418, "step": 130 }, { "completion_length": 461.4791736602783, "epoch": 0.0704, "grad_norm": 0.06366382823979087, "kl": 0.101837158203125, "learning_rate": 4.1494928597979117e-07, "loss": 0.0001, "reward": 1.2760417088866234, "reward_std": 0.27500381181016564, "rewards/equation_reward_func": 0.32291667629033327, "rewards/format_reward_func": 0.9531250186264515, "step": 132 }, { "completion_length": 394.3255319595337, "epoch": 0.07146666666666666, "grad_norm": 0.0853911421540347, "kl": 0.130126953125, "learning_rate": 4.122249925494726e-07, "loss": 0.0001, "reward": 1.403645858168602, "reward_std": 0.25308565702289343, "rewards/equation_reward_func": 0.43229168001562357, "rewards/format_reward_func": 0.9713541828095913, "step": 134 }, { "completion_length": 426.466157913208, "epoch": 0.07253333333333334, "grad_norm": 0.0692987274556644, "kl": 0.11669921875, "learning_rate": 4.094670094299131e-07, "loss": 0.0001, "reward": 1.281250037252903, "reward_std": 0.316250397823751, "rewards/equation_reward_func": 0.33072917559184134, "rewards/format_reward_func": 0.9505208469927311, "step": 136 }, { "completion_length": 439.9010543823242, "epoch": 0.0736, "grad_norm": 0.08170658774133101, "kl": 0.104949951171875, "learning_rate": 4.066759093786931e-07, "loss": 0.0001, "reward": 1.2760416977107525, "reward_std": 0.35973797645419836, "rewards/equation_reward_func": 0.3411458428017795, "rewards/format_reward_func": 0.9348958618938923, "step": 138 }, { "completion_length": 416.6406354904175, "epoch": 0.07466666666666667, "grad_norm": 0.06667199771271264, "kl": 0.12353515625, "learning_rate": 4.038522720308732e-07, "loss": 0.0001, "reward": 1.3854167088866234, "reward_std": 0.21267010737210512, "rewards/equation_reward_func": 0.4088541741948575, "rewards/format_reward_func": 0.9765625149011612, "step": 140 }, { "completion_length": 440.536470413208, "epoch": 0.07573333333333333, "grad_norm": 0.0850091968151683, "kl": 0.11474609375, "learning_rate": 4.009966837786194e-07, "loss": 0.0001, "reward": 1.3255208693444729, "reward_std": 0.30754279950633645, "rewards/equation_reward_func": 0.36458334629423916, "rewards/format_reward_func": 0.9609375074505806, "step": 142 }, { "completion_length": 401.09896659851074, "epoch": 0.0768, "grad_norm": 0.0768836508261685, "kl": 0.116485595703125, "learning_rate": 3.981097376494259e-07, "loss": 0.0001, "reward": 1.4557292237877846, "reward_std": 0.31219895882532, "rewards/equation_reward_func": 0.5026041828095913, "rewards/format_reward_func": 0.9531250186264515, "step": 144 }, { "completion_length": 461.122407913208, "epoch": 0.07786666666666667, "grad_norm": 0.10600525349484782, "kl": 0.114044189453125, "learning_rate": 3.951920331829592e-07, "loss": 0.0001, "reward": 1.2890625335276127, "reward_std": 0.2976598385721445, "rewards/equation_reward_func": 0.3255208437331021, "rewards/format_reward_func": 0.963541679084301, "step": 146 }, { "completion_length": 413.9713649749756, "epoch": 0.07893333333333333, "grad_norm": 0.0723392326431143, "kl": 0.123321533203125, "learning_rate": 3.922441763065506e-07, "loss": 0.0001, "reward": 1.4088542014360428, "reward_std": 0.23494611913338304, "rewards/equation_reward_func": 0.43750000931322575, "rewards/format_reward_func": 0.9713541828095913, "step": 148 }, { "completion_length": 458.7604331970215, "epoch": 0.08, "grad_norm": 0.09048584328529992, "kl": 0.12384033203125, "learning_rate": 3.8926677920936093e-07, "loss": 0.0001, "reward": 1.2656250223517418, "reward_std": 0.3245450472459197, "rewards/equation_reward_func": 0.32031250838190317, "rewards/format_reward_func": 0.9453125111758709, "step": 150 }, { "completion_length": 378.33073902130127, "epoch": 0.08106666666666666, "grad_norm": 0.10455674533718096, "kl": 0.13720703125, "learning_rate": 3.862604602152464e-07, "loss": 0.0001, "reward": 1.4244792014360428, "reward_std": 0.26624298514798284, "rewards/equation_reward_func": 0.46093751210719347, "rewards/format_reward_func": 0.9635416828095913, "step": 152 }, { "completion_length": 421.9140796661377, "epoch": 0.08213333333333334, "grad_norm": 0.10103705731464013, "kl": 0.129638671875, "learning_rate": 3.8322584365434934e-07, "loss": 0.0001, "reward": 1.3255208879709244, "reward_std": 0.24930242728441954, "rewards/equation_reward_func": 0.3723958465270698, "rewards/format_reward_func": 0.9531250186264515, "step": 154 }, { "completion_length": 463.58334159851074, "epoch": 0.0832, "grad_norm": 0.09221432956401719, "kl": 0.127166748046875, "learning_rate": 3.8016355973344173e-07, "loss": 0.0001, "reward": 1.234375037252903, "reward_std": 0.2910663695074618, "rewards/equation_reward_func": 0.28906250605359674, "rewards/format_reward_func": 0.9453125260770321, "step": 156 }, { "completion_length": 419.51563835144043, "epoch": 0.08426666666666667, "grad_norm": 0.08138226453807305, "kl": 0.1285400390625, "learning_rate": 3.7707424440504863e-07, "loss": 0.0001, "reward": 1.3489583730697632, "reward_std": 0.23599386168643832, "rewards/equation_reward_func": 0.37500001094304025, "rewards/format_reward_func": 0.9739583544433117, "step": 158 }, { "completion_length": 361.1015729904175, "epoch": 0.08533333333333333, "grad_norm": 0.13299459818559828, "kl": 0.15423583984375, "learning_rate": 3.739585392353787e-07, "loss": 0.0002, "reward": 1.434895858168602, "reward_std": 0.28986221412196755, "rewards/equation_reward_func": 0.458333341171965, "rewards/format_reward_func": 0.9765625186264515, "step": 160 }, { "completion_length": 391.0599117279053, "epoch": 0.0864, "grad_norm": 0.10062549742509476, "kl": 0.140045166015625, "learning_rate": 3.7081709127108767e-07, "loss": 0.0001, "reward": 1.4244791939854622, "reward_std": 0.2554763099178672, "rewards/equation_reward_func": 0.4427083439659327, "rewards/format_reward_func": 0.9817708469927311, "step": 162 }, { "completion_length": 346.71094512939453, "epoch": 0.08746666666666666, "grad_norm": 0.07557865430106443, "kl": 0.165771484375, "learning_rate": 3.6765055290490513e-07, "loss": 0.0002, "reward": 1.510416716337204, "reward_std": 0.23889524163678288, "rewards/equation_reward_func": 0.5390625223517418, "rewards/format_reward_func": 0.9713541828095913, "step": 164 }, { "completion_length": 374.559907913208, "epoch": 0.08853333333333334, "grad_norm": 0.09484161296330915, "kl": 0.145050048828125, "learning_rate": 3.644595817401501e-07, "loss": 0.0001, "reward": 1.4140625596046448, "reward_std": 0.2526052575558424, "rewards/equation_reward_func": 0.43229167931713164, "rewards/format_reward_func": 0.9817708507180214, "step": 166 }, { "completion_length": 389.614595413208, "epoch": 0.0896, "grad_norm": 0.10850466477020716, "kl": 0.140869140625, "learning_rate": 3.6124484045416483e-07, "loss": 0.0001, "reward": 1.3411458730697632, "reward_std": 0.20541261043399572, "rewards/equation_reward_func": 0.3515625118743628, "rewards/format_reward_func": 0.9895833432674408, "step": 168 }, { "completion_length": 314.33594512939453, "epoch": 0.09066666666666667, "grad_norm": 0.09160402552556286, "kl": 0.159759521484375, "learning_rate": 3.580069966606949e-07, "loss": 0.0002, "reward": 1.4739583805203438, "reward_std": 0.2342346585355699, "rewards/equation_reward_func": 0.5000000114087015, "rewards/format_reward_func": 0.9739583469927311, "step": 170 }, { "completion_length": 359.8906354904175, "epoch": 0.09173333333333333, "grad_norm": 0.09610423165466968, "kl": 0.154388427734375, "learning_rate": 3.547467227712444e-07, "loss": 0.0002, "reward": 1.437500037252903, "reward_std": 0.1834291499108076, "rewards/equation_reward_func": 0.45572918467223644, "rewards/format_reward_func": 0.9817708469927311, "step": 172 }, { "completion_length": 365.7343854904175, "epoch": 0.0928, "grad_norm": 0.09889865100739882, "kl": 0.15478515625, "learning_rate": 3.5146469585543386e-07, "loss": 0.0002, "reward": 1.414062537252903, "reward_std": 0.19458062946796417, "rewards/equation_reward_func": 0.4322916797827929, "rewards/format_reward_func": 0.9817708432674408, "step": 174 }, { "completion_length": 348.29427909851074, "epoch": 0.09386666666666667, "grad_norm": 0.09367098793216834, "kl": 0.159515380859375, "learning_rate": 3.481615975003922e-07, "loss": 0.0002, "reward": 1.4739583879709244, "reward_std": 0.15797653933987021, "rewards/equation_reward_func": 0.4921875149011612, "rewards/format_reward_func": 0.9817708469927311, "step": 176 }, { "completion_length": 357.6145963668823, "epoch": 0.09493333333333333, "grad_norm": 0.07894542256229298, "kl": 0.150299072265625, "learning_rate": 3.448381136692089e-07, "loss": 0.0002, "reward": 1.4401042126119137, "reward_std": 0.2548735234886408, "rewards/equation_reward_func": 0.4765625074505806, "rewards/format_reward_func": 0.9635416828095913, "step": 178 }, { "completion_length": 353.8281354904175, "epoch": 0.096, "grad_norm": 0.10120368862706956, "kl": 0.1510009765625, "learning_rate": 3.4149493455847897e-07, "loss": 0.0002, "reward": 1.377604216337204, "reward_std": 0.17720257258042693, "rewards/equation_reward_func": 0.3932291795499623, "rewards/format_reward_func": 0.9843750074505806, "step": 180 }, { "completion_length": 337.3437547683716, "epoch": 0.09706666666666666, "grad_norm": 0.06857743257585827, "kl": 0.171661376953125, "learning_rate": 3.3813275445496766e-07, "loss": 0.0002, "reward": 1.3958333879709244, "reward_std": 0.216899492777884, "rewards/equation_reward_func": 0.4140625144354999, "rewards/format_reward_func": 0.9817708469927311, "step": 182 }, { "completion_length": 373.3619899749756, "epoch": 0.09813333333333334, "grad_norm": 0.07039499292151902, "kl": 0.185546875, "learning_rate": 3.347522715914262e-07, "loss": 0.0002, "reward": 1.2838542088866234, "reward_std": 0.14952099742367864, "rewards/equation_reward_func": 0.29947917186655104, "rewards/format_reward_func": 0.9843750037252903, "step": 184 }, { "completion_length": 343.94011783599854, "epoch": 0.0992, "grad_norm": 0.07557857715641425, "kl": 0.172119140625, "learning_rate": 3.313541880015877e-07, "loss": 0.0002, "reward": 1.3671875521540642, "reward_std": 0.15858241729438305, "rewards/equation_reward_func": 0.380208347691223, "rewards/format_reward_func": 0.986979179084301, "step": 186 }, { "completion_length": 360.5390748977661, "epoch": 0.10026666666666667, "grad_norm": 0.11214755840839478, "kl": 0.213623046875, "learning_rate": 3.279392093743747e-07, "loss": 0.0002, "reward": 1.3880208730697632, "reward_std": 0.19066602177917957, "rewards/equation_reward_func": 0.4010416786186397, "rewards/format_reward_func": 0.9869791753590107, "step": 188 }, { "completion_length": 322.72397232055664, "epoch": 0.10133333333333333, "grad_norm": 0.09281179127833081, "kl": 0.2625732421875, "learning_rate": 3.245080449073459e-07, "loss": 0.0003, "reward": 1.4557292088866234, "reward_std": 0.21278624143451452, "rewards/equation_reward_func": 0.4765625102445483, "rewards/format_reward_func": 0.9791666753590107, "step": 190 }, { "completion_length": 337.25261306762695, "epoch": 0.1024, "grad_norm": 0.12372831662094742, "kl": 0.18206787109375, "learning_rate": 3.210614071594162e-07, "loss": 0.0002, "reward": 1.4218750447034836, "reward_std": 0.21987988194450736, "rewards/equation_reward_func": 0.440104179084301, "rewards/format_reward_func": 0.9817708395421505, "step": 192 }, { "completion_length": 344.55729961395264, "epoch": 0.10346666666666667, "grad_norm": 0.06378441341807725, "kl": 0.167724609375, "learning_rate": 3.1760001190287695e-07, "loss": 0.0002, "reward": 1.351562537252903, "reward_std": 0.14025551918894053, "rewards/equation_reward_func": 0.36979167722165585, "rewards/format_reward_func": 0.9817708432674408, "step": 194 }, { "completion_length": 314.03646659851074, "epoch": 0.10453333333333334, "grad_norm": 0.09003427985578723, "kl": 0.17474365234375, "learning_rate": 3.141245779747502e-07, "loss": 0.0002, "reward": 1.4479167237877846, "reward_std": 0.2472039177082479, "rewards/equation_reward_func": 0.46875001722946763, "rewards/format_reward_func": 0.979166679084301, "step": 196 }, { "completion_length": 282.7083406448364, "epoch": 0.1056, "grad_norm": 0.1290749910466798, "kl": 0.21856689453125, "learning_rate": 3.106358271275056e-07, "loss": 0.0002, "reward": 1.476562537252903, "reward_std": 0.1649267366155982, "rewards/equation_reward_func": 0.4843750186264515, "rewards/format_reward_func": 0.9921875074505806, "step": 198 }, { "completion_length": 276.01823806762695, "epoch": 0.10666666666666667, "grad_norm": 0.08477695235189277, "kl": 0.1934814453125, "learning_rate": 3.0713448387917227e-07, "loss": 0.0002, "reward": 1.5468750298023224, "reward_std": 0.13912134431302547, "rewards/equation_reward_func": 0.552083348389715, "rewards/format_reward_func": 0.9947916716337204, "step": 200 }, { "completion_length": 317.55730152130127, "epoch": 0.10773333333333333, "grad_norm": 2.4369008488049477, "kl": 5.66986083984375, "learning_rate": 3.0362127536287636e-07, "loss": 0.0057, "reward": 1.421875037252903, "reward_std": 0.16129080019891262, "rewards/equation_reward_func": 0.4270833423361182, "rewards/format_reward_func": 0.9947916679084301, "step": 202 }, { "completion_length": 321.65625762939453, "epoch": 0.1088, "grad_norm": 0.09396431699981035, "kl": 0.18658447265625, "learning_rate": 3.0009693117583523e-07, "loss": 0.0002, "reward": 1.4348958656191826, "reward_std": 0.19856942351907492, "rewards/equation_reward_func": 0.453125013737008, "rewards/format_reward_func": 0.9817708469927311, "step": 204 }, { "completion_length": 330.83855056762695, "epoch": 0.10986666666666667, "grad_norm": 0.07687252722110068, "kl": 0.1839599609375, "learning_rate": 2.965621832278401e-07, "loss": 0.0002, "reward": 1.377604216337204, "reward_std": 0.15589443547651172, "rewards/equation_reward_func": 0.39583334885537624, "rewards/format_reward_func": 0.9817708469927311, "step": 206 }, { "completion_length": 328.23438358306885, "epoch": 0.11093333333333333, "grad_norm": 0.0810844061250071, "kl": 0.1962890625, "learning_rate": 2.9301776558925875e-07, "loss": 0.0002, "reward": 1.3697917014360428, "reward_std": 0.20208620419725776, "rewards/equation_reward_func": 0.4036458465270698, "rewards/format_reward_func": 0.9661458544433117, "step": 208 }, { "completion_length": 313.95313262939453, "epoch": 0.112, "grad_norm": 0.0851816615508796, "kl": 0.23468017578125, "learning_rate": 2.894644143385885e-07, "loss": 0.0002, "reward": 1.3958333767950535, "reward_std": 0.18581857532262802, "rewards/equation_reward_func": 0.4244791748933494, "rewards/format_reward_func": 0.971354179084301, "step": 210 }, { "completion_length": 326.42448806762695, "epoch": 0.11306666666666666, "grad_norm": 0.0786620471083819, "kl": 0.19378662109375, "learning_rate": 2.859028674095937e-07, "loss": 0.0002, "reward": 1.4010417014360428, "reward_std": 0.1981433075852692, "rewards/equation_reward_func": 0.42708334513008595, "rewards/format_reward_func": 0.9739583432674408, "step": 212 }, { "completion_length": 308.4192781448364, "epoch": 0.11413333333333334, "grad_norm": 0.08352066179848143, "kl": 0.189453125, "learning_rate": 2.823338644380566e-07, "loss": 0.0002, "reward": 1.4401042126119137, "reward_std": 0.2307603359222412, "rewards/equation_reward_func": 0.47135418048128486, "rewards/format_reward_func": 0.9687500149011612, "step": 214 }, { "completion_length": 341.86198711395264, "epoch": 0.1152, "grad_norm": 0.10334760188864624, "kl": 0.22479248046875, "learning_rate": 2.7875814660817504e-07, "loss": 0.0002, "reward": 1.3880208693444729, "reward_std": 0.2630339222960174, "rewards/equation_reward_func": 0.4244791779201478, "rewards/format_reward_func": 0.9635416828095913, "step": 216 }, { "completion_length": 390.3906364440918, "epoch": 0.11626666666666667, "grad_norm": 0.1090470945421399, "kl": 0.2252197265625, "learning_rate": 2.751764564986396e-07, "loss": 0.0002, "reward": 1.223958358168602, "reward_std": 0.23174711503088474, "rewards/equation_reward_func": 0.27864584047347307, "rewards/format_reward_func": 0.9453125223517418, "step": 218 }, { "completion_length": 329.63282585144043, "epoch": 0.11733333333333333, "grad_norm": 0.10077949546695844, "kl": 0.2005615234375, "learning_rate": 2.715895379284194e-07, "loss": 0.0002, "reward": 1.3958333730697632, "reward_std": 0.26168868225067854, "rewards/equation_reward_func": 0.4427083469927311, "rewards/format_reward_func": 0.9531250223517418, "step": 220 }, { "completion_length": 358.4375104904175, "epoch": 0.1184, "grad_norm": 0.08964140632655672, "kl": 0.21527099609375, "learning_rate": 2.6799813580229174e-07, "loss": 0.0002, "reward": 1.3593750447034836, "reward_std": 0.25906086526811123, "rewards/equation_reward_func": 0.4036458386108279, "rewards/format_reward_func": 0.9557291902601719, "step": 222 }, { "completion_length": 343.70834255218506, "epoch": 0.11946666666666667, "grad_norm": 0.07620045097589506, "kl": 0.19964599609375, "learning_rate": 2.6440299595614606e-07, "loss": 0.0002, "reward": 1.3307291939854622, "reward_std": 0.2277261232957244, "rewards/equation_reward_func": 0.3697916753590107, "rewards/format_reward_func": 0.9609375223517418, "step": 224 }, { "completion_length": 343.32032108306885, "epoch": 0.12053333333333334, "grad_norm": 0.08700892029776192, "kl": 0.2109375, "learning_rate": 2.6080486500209347e-07, "loss": 0.0002, "reward": 1.3541667014360428, "reward_std": 0.21279292972758412, "rewards/equation_reward_func": 0.40104167512618005, "rewards/format_reward_func": 0.9531250149011612, "step": 226 }, { "completion_length": 275.88542556762695, "epoch": 0.1216, "grad_norm": 0.11035562445594559, "kl": 0.202880859375, "learning_rate": 2.572044901734166e-07, "loss": 0.0002, "reward": 1.5833333730697632, "reward_std": 0.25712650874629617, "rewards/equation_reward_func": 0.6197916865348816, "rewards/format_reward_func": 0.9635416716337204, "step": 228 }, { "completion_length": 314.13542318344116, "epoch": 0.12266666666666666, "grad_norm": 0.10200349640738855, "kl": 0.21881103515625, "learning_rate": 2.536026191693893e-07, "loss": 0.0002, "reward": 1.4505208693444729, "reward_std": 0.29838538402691483, "rewards/equation_reward_func": 0.5156250093132257, "rewards/format_reward_func": 0.9348958544433117, "step": 230 }, { "completion_length": 334.46615505218506, "epoch": 0.12373333333333333, "grad_norm": 0.104610809797409, "kl": 0.20111083984375, "learning_rate": 2.5e-07, "loss": 0.0002, "reward": 1.4479166939854622, "reward_std": 0.32854113075882196, "rewards/equation_reward_func": 0.4921875128056854, "rewards/format_reward_func": 0.9557291902601719, "step": 232 }, { "completion_length": 348.3463592529297, "epoch": 0.1248, "grad_norm": 0.11970668433207705, "kl": 0.21832275390625, "learning_rate": 2.4639738083061073e-07, "loss": 0.0002, "reward": 1.2968750409781933, "reward_std": 0.28170605981722474, "rewards/equation_reward_func": 0.35937500884756446, "rewards/format_reward_func": 0.9375000149011612, "step": 234 }, { "completion_length": 353.0625104904175, "epoch": 0.12586666666666665, "grad_norm": 0.11146940462264297, "kl": 0.2242431640625, "learning_rate": 2.4279550982658345e-07, "loss": 0.0002, "reward": 1.283854205161333, "reward_std": 0.2352255848236382, "rewards/equation_reward_func": 0.3359375107102096, "rewards/format_reward_func": 0.9479166828095913, "step": 236 }, { "completion_length": 328.27865982055664, "epoch": 0.12693333333333334, "grad_norm": 0.09019157224178884, "kl": 0.2286376953125, "learning_rate": 2.3919513499790646e-07, "loss": 0.0002, "reward": 1.4036458767950535, "reward_std": 0.2419091323390603, "rewards/equation_reward_func": 0.4557291849050671, "rewards/format_reward_func": 0.947916679084301, "step": 238 }, { "completion_length": 292.41146755218506, "epoch": 0.128, "grad_norm": 0.11216015195235872, "kl": 0.20770263671875, "learning_rate": 2.3559700404385394e-07, "loss": 0.0002, "reward": 1.4218750521540642, "reward_std": 0.214123603887856, "rewards/equation_reward_func": 0.45833334303461015, "rewards/format_reward_func": 0.963541679084301, "step": 240 }, { "completion_length": 289.44011306762695, "epoch": 0.12906666666666666, "grad_norm": 0.06748907528166415, "kl": 0.21905517578125, "learning_rate": 2.3200186419770823e-07, "loss": 0.0002, "reward": 1.4973958730697632, "reward_std": 0.21762575302273035, "rewards/equation_reward_func": 0.5390625144354999, "rewards/format_reward_func": 0.9583333544433117, "step": 242 }, { "completion_length": 312.97396659851074, "epoch": 0.13013333333333332, "grad_norm": 0.08585761519803439, "kl": 0.22705078125, "learning_rate": 2.284104620715807e-07, "loss": 0.0002, "reward": 1.382812537252903, "reward_std": 0.2223974741064012, "rewards/equation_reward_func": 0.4218750149011612, "rewards/format_reward_func": 0.9609375186264515, "step": 244 }, { "completion_length": 281.0208377838135, "epoch": 0.1312, "grad_norm": 0.09243139356469632, "kl": 0.22540283203125, "learning_rate": 2.2482354350136043e-07, "loss": 0.0002, "reward": 1.4947917088866234, "reward_std": 0.23069008206948638, "rewards/equation_reward_func": 0.5208333479240537, "rewards/format_reward_func": 0.9739583469927311, "step": 246 }, { "completion_length": 301.8541736602783, "epoch": 0.13226666666666667, "grad_norm": 0.09610646803963738, "kl": 0.22479248046875, "learning_rate": 2.2124185339182496e-07, "loss": 0.0002, "reward": 1.3802083730697632, "reward_std": 0.1815217286348343, "rewards/equation_reward_func": 0.4270833432674408, "rewards/format_reward_func": 0.9531250186264515, "step": 248 }, { "completion_length": 347.1015729904175, "epoch": 0.13333333333333333, "grad_norm": 0.1746681094283612, "kl": 0.23883056640625, "learning_rate": 2.1766613556194344e-07, "loss": 0.0002, "reward": 1.2213542088866234, "reward_std": 0.22283816616982222, "rewards/equation_reward_func": 0.26041667559184134, "rewards/format_reward_func": 0.9609375149011612, "step": 250 }, { "completion_length": 293.10677909851074, "epoch": 0.1344, "grad_norm": 0.07969455343084161, "kl": 0.305419921875, "learning_rate": 2.1409713259040628e-07, "loss": 0.0003, "reward": 1.4114583730697632, "reward_std": 0.2006126595661044, "rewards/equation_reward_func": 0.432291679084301, "rewards/format_reward_func": 0.9791666753590107, "step": 252 }, { "completion_length": 295.825532913208, "epoch": 0.13546666666666668, "grad_norm": 0.11043203499359036, "kl": 0.215087890625, "learning_rate": 2.105355856614115e-07, "loss": 0.0002, "reward": 1.4114583730697632, "reward_std": 0.3007270940579474, "rewards/equation_reward_func": 0.46354168374091387, "rewards/format_reward_func": 0.9479166828095913, "step": 254 }, { "completion_length": 295.0286531448364, "epoch": 0.13653333333333334, "grad_norm": 0.10604018583177363, "kl": 0.2293701171875, "learning_rate": 2.069822344107413e-07, "loss": 0.0002, "reward": 1.4401042088866234, "reward_std": 0.16259960131719708, "rewards/equation_reward_func": 0.46875001303851604, "rewards/format_reward_func": 0.9713541828095913, "step": 256 }, { "completion_length": 312.09897232055664, "epoch": 0.1376, "grad_norm": 0.11581309250324548, "kl": 0.22454833984375, "learning_rate": 2.034378167721599e-07, "loss": 0.0002, "reward": 1.3411458618938923, "reward_std": 0.31250663055107, "rewards/equation_reward_func": 0.39322918001562357, "rewards/format_reward_func": 0.9479166939854622, "step": 258 }, { "completion_length": 301.36980152130127, "epoch": 0.13866666666666666, "grad_norm": 0.10375800085599599, "kl": 0.24493408203125, "learning_rate": 1.9990306882416485e-07, "loss": 0.0002, "reward": 1.4635416977107525, "reward_std": 0.2693312247283757, "rewards/equation_reward_func": 0.5104166809469461, "rewards/format_reward_func": 0.9531250149011612, "step": 260 }, { "completion_length": 314.0885524749756, "epoch": 0.13973333333333332, "grad_norm": 0.1087966329523751, "kl": 0.222900390625, "learning_rate": 1.9637872463712362e-07, "loss": 0.0002, "reward": 1.4062500409781933, "reward_std": 0.26262100599706173, "rewards/equation_reward_func": 0.44270834792405367, "rewards/format_reward_func": 0.963541679084301, "step": 262 }, { "completion_length": 281.7239646911621, "epoch": 0.1408, "grad_norm": 0.09695420136164315, "kl": 0.26971435546875, "learning_rate": 1.9286551612082773e-07, "loss": 0.0003, "reward": 1.4479167088866234, "reward_std": 0.2460037199780345, "rewards/equation_reward_func": 0.48697918094694614, "rewards/format_reward_func": 0.9609375149011612, "step": 264 }, { "completion_length": 315.37240505218506, "epoch": 0.14186666666666667, "grad_norm": 0.10037156483806228, "kl": 0.24798583984375, "learning_rate": 1.8936417287249446e-07, "loss": 0.0002, "reward": 1.3385417088866234, "reward_std": 0.2581388554535806, "rewards/equation_reward_func": 0.39583334466442466, "rewards/format_reward_func": 0.9427083469927311, "step": 266 }, { "completion_length": 334.924485206604, "epoch": 0.14293333333333333, "grad_norm": 0.15279355937220046, "kl": 0.26873779296875, "learning_rate": 1.8587542202524985e-07, "loss": 0.0003, "reward": 1.268229205161333, "reward_std": 0.28603212209418416, "rewards/equation_reward_func": 0.3177083428017795, "rewards/format_reward_func": 0.9505208544433117, "step": 268 }, { "completion_length": 292.8906297683716, "epoch": 0.144, "grad_norm": 0.09627939797808117, "kl": 0.25811767578125, "learning_rate": 1.82399988097123e-07, "loss": 0.0003, "reward": 1.3828125335276127, "reward_std": 0.240143911447376, "rewards/equation_reward_func": 0.4479166786186397, "rewards/format_reward_func": 0.9348958544433117, "step": 270 }, { "completion_length": 303.7291774749756, "epoch": 0.14506666666666668, "grad_norm": 0.09414307623625273, "kl": 0.25408935546875, "learning_rate": 1.7893859284058378e-07, "loss": 0.0003, "reward": 1.3671875298023224, "reward_std": 0.24746731435880065, "rewards/equation_reward_func": 0.4218750111758709, "rewards/format_reward_func": 0.9453125149011612, "step": 272 }, { "completion_length": 276.94792318344116, "epoch": 0.14613333333333334, "grad_norm": 0.1299701036522939, "kl": 0.57366943359375, "learning_rate": 1.7549195509265407e-07, "loss": 0.0006, "reward": 1.4348958730697632, "reward_std": 0.2572689475491643, "rewards/equation_reward_func": 0.4791666748933494, "rewards/format_reward_func": 0.9557291902601719, "step": 274 }, { "completion_length": 248.166672706604, "epoch": 0.1472, "grad_norm": 0.08206460484425186, "kl": 0.256103515625, "learning_rate": 1.7206079062562536e-07, "loss": 0.0003, "reward": 1.5833333656191826, "reward_std": 0.21109008882194757, "rewards/equation_reward_func": 0.6145833458285779, "rewards/format_reward_func": 0.9687500260770321, "step": 276 }, { "completion_length": 305.27865409851074, "epoch": 0.14826666666666666, "grad_norm": 0.10621644156716899, "kl": 0.2762451171875, "learning_rate": 1.6864581199841226e-07, "loss": 0.0003, "reward": 1.312500026077032, "reward_std": 0.24705103458836675, "rewards/equation_reward_func": 0.36718751094304025, "rewards/format_reward_func": 0.9453125186264515, "step": 278 }, { "completion_length": 286.6406297683716, "epoch": 0.14933333333333335, "grad_norm": 0.10751127049009096, "kl": 0.26580810546875, "learning_rate": 1.6524772840857388e-07, "loss": 0.0003, "reward": 1.3072916977107525, "reward_std": 0.2637113491073251, "rewards/equation_reward_func": 0.38281250768341124, "rewards/format_reward_func": 0.9244791865348816, "step": 280 }, { "completion_length": 275.32552909851074, "epoch": 0.1504, "grad_norm": 0.10203495847611208, "kl": 0.29620361328125, "learning_rate": 1.6186724554503237e-07, "loss": 0.0003, "reward": 1.4687500409781933, "reward_std": 0.23805115604773164, "rewards/equation_reward_func": 0.5156250128056854, "rewards/format_reward_func": 0.9531250186264515, "step": 282 }, { "completion_length": 312.76823806762695, "epoch": 0.15146666666666667, "grad_norm": 0.09010102560559675, "kl": 0.26605224609375, "learning_rate": 1.5850506544152103e-07, "loss": 0.0003, "reward": 1.2786458618938923, "reward_std": 0.27972705382853746, "rewards/equation_reward_func": 0.35156250977888703, "rewards/format_reward_func": 0.9270833432674408, "step": 284 }, { "completion_length": 263.97916984558105, "epoch": 0.15253333333333333, "grad_norm": 0.09699956880184334, "kl": 0.271728515625, "learning_rate": 1.5516188633079107e-07, "loss": 0.0003, "reward": 1.4088542088866234, "reward_std": 0.21715012891218066, "rewards/equation_reward_func": 0.432291679084301, "rewards/format_reward_func": 0.9765625186264515, "step": 286 }, { "completion_length": 284.93750762939453, "epoch": 0.1536, "grad_norm": 0.13730205530993134, "kl": 0.26202392578125, "learning_rate": 1.5183840249960784e-07, "loss": 0.0003, "reward": 1.2916666977107525, "reward_std": 0.2690475699491799, "rewards/equation_reward_func": 0.3411458432674408, "rewards/format_reward_func": 0.9505208507180214, "step": 288 }, { "completion_length": 313.8724036216736, "epoch": 0.15466666666666667, "grad_norm": 0.10982987970993405, "kl": 0.25787353515625, "learning_rate": 1.4853530414456612e-07, "loss": 0.0003, "reward": 1.3359375447034836, "reward_std": 0.28103851480409503, "rewards/equation_reward_func": 0.38020834419876337, "rewards/format_reward_func": 0.9557291865348816, "step": 290 }, { "completion_length": 280.13021659851074, "epoch": 0.15573333333333333, "grad_norm": 0.10569696273751499, "kl": 0.2752685546875, "learning_rate": 1.4525327722875568e-07, "loss": 0.0003, "reward": 1.3723958730697632, "reward_std": 0.253665282856673, "rewards/equation_reward_func": 0.4270833453629166, "rewards/format_reward_func": 0.9453125223517418, "step": 292 }, { "completion_length": 266.9010486602783, "epoch": 0.1568, "grad_norm": 0.1273947740183966, "kl": 0.2657470703125, "learning_rate": 1.4199300333930515e-07, "loss": 0.0003, "reward": 1.4635417088866234, "reward_std": 0.28517728950828314, "rewards/equation_reward_func": 0.5026041837409139, "rewards/format_reward_func": 0.9609375111758709, "step": 294 }, { "completion_length": 328.830735206604, "epoch": 0.15786666666666666, "grad_norm": 0.1699855426323704, "kl": 0.2620849609375, "learning_rate": 1.3875515954583523e-07, "loss": 0.0003, "reward": 1.2187500447034836, "reward_std": 0.3317327341064811, "rewards/equation_reward_func": 0.2942708428017795, "rewards/format_reward_func": 0.9244791902601719, "step": 296 }, { "completion_length": 329.12240982055664, "epoch": 0.15893333333333334, "grad_norm": 0.14001227147909825, "kl": 0.27099609375, "learning_rate": 1.3554041825985e-07, "loss": 0.0003, "reward": 1.1979167014360428, "reward_std": 0.2845407989807427, "rewards/equation_reward_func": 0.28125000931322575, "rewards/format_reward_func": 0.9166666865348816, "step": 298 }, { "completion_length": 283.0989661216736, "epoch": 0.16, "grad_norm": 0.10223346879835553, "kl": 0.24761962890625, "learning_rate": 1.323494470950949e-07, "loss": 0.0002, "reward": 1.429687537252903, "reward_std": 0.26960491156205535, "rewards/equation_reward_func": 0.47135418094694614, "rewards/format_reward_func": 0.9583333544433117, "step": 300 }, { "completion_length": 254.60156726837158, "epoch": 0.16106666666666666, "grad_norm": 0.08918786164304986, "kl": 0.260986328125, "learning_rate": 1.2918290872891236e-07, "loss": 0.0003, "reward": 1.4348958805203438, "reward_std": 0.15168809751048684, "rewards/equation_reward_func": 0.458333347691223, "rewards/format_reward_func": 0.9765625111758709, "step": 302 }, { "completion_length": 280.38021516799927, "epoch": 0.16213333333333332, "grad_norm": 0.10981016883182508, "kl": 0.26275634765625, "learning_rate": 1.260414607646213e-07, "loss": 0.0003, "reward": 1.3880208618938923, "reward_std": 0.2798879165202379, "rewards/equation_reward_func": 0.42708334303461015, "rewards/format_reward_func": 0.9609375111758709, "step": 304 }, { "completion_length": 236.63802528381348, "epoch": 0.1632, "grad_norm": 0.1126860308935798, "kl": 0.24639892578125, "learning_rate": 1.2292575559495143e-07, "loss": 0.0002, "reward": 1.5338541939854622, "reward_std": 0.21581484470516443, "rewards/equation_reward_func": 0.5598958488553762, "rewards/format_reward_func": 0.9739583507180214, "step": 306 }, { "completion_length": 274.51823902130127, "epoch": 0.16426666666666667, "grad_norm": 0.11433058952931557, "kl": 0.2418212890625, "learning_rate": 1.1983644026655835e-07, "loss": 0.0002, "reward": 1.3984375298023224, "reward_std": 0.2787149855867028, "rewards/equation_reward_func": 0.4505208458285779, "rewards/format_reward_func": 0.9479166828095913, "step": 308 }, { "completion_length": 269.51303005218506, "epoch": 0.16533333333333333, "grad_norm": 0.10900628538932935, "kl": 0.2515869140625, "learning_rate": 1.1677415634565066e-07, "loss": 0.0003, "reward": 1.4531250298023224, "reward_std": 0.21808092296123505, "rewards/equation_reward_func": 0.5026041760575026, "rewards/format_reward_func": 0.950520858168602, "step": 310 }, { "completion_length": 276.29948711395264, "epoch": 0.1664, "grad_norm": 0.10471445766441949, "kl": 0.24822998046875, "learning_rate": 1.1373953978475353e-07, "loss": 0.0002, "reward": 1.4088542014360428, "reward_std": 0.2563867177814245, "rewards/equation_reward_func": 0.4557291786186397, "rewards/format_reward_func": 0.9531250260770321, "step": 312 }, { "completion_length": 281.54167652130127, "epoch": 0.16746666666666668, "grad_norm": 0.11476171924959432, "kl": 0.25030517578125, "learning_rate": 1.1073322079063913e-07, "loss": 0.0003, "reward": 1.419270884245634, "reward_std": 0.2665014350786805, "rewards/equation_reward_func": 0.46354168374091387, "rewards/format_reward_func": 0.9557291865348816, "step": 314 }, { "completion_length": 282.43490409851074, "epoch": 0.16853333333333334, "grad_norm": 0.0841971248428421, "kl": 0.21875, "learning_rate": 1.0775582369344946e-07, "loss": 0.0002, "reward": 1.424479216337204, "reward_std": 0.2608643379062414, "rewards/equation_reward_func": 0.46875001303851604, "rewards/format_reward_func": 0.9557291902601719, "step": 316 }, { "completion_length": 273.4349060058594, "epoch": 0.1696, "grad_norm": 0.09515899802774246, "kl": 0.24407958984375, "learning_rate": 1.0480796681704077e-07, "loss": 0.0002, "reward": 1.4010417088866234, "reward_std": 0.2546477783471346, "rewards/equation_reward_func": 0.4453125174622983, "rewards/format_reward_func": 0.9557291865348816, "step": 318 }, { "completion_length": 316.6458435058594, "epoch": 0.17066666666666666, "grad_norm": 0.099987410497596, "kl": 0.23638916015625, "learning_rate": 1.018902623505741e-07, "loss": 0.0002, "reward": 1.2942708656191826, "reward_std": 0.29723000014200807, "rewards/equation_reward_func": 0.3671875111758709, "rewards/format_reward_func": 0.927083358168602, "step": 320 }, { "completion_length": 288.893235206604, "epoch": 0.17173333333333332, "grad_norm": 0.11246455050265577, "kl": 0.23480224609375, "learning_rate": 9.900331622138063e-08, "loss": 0.0002, "reward": 1.3723958730697632, "reward_std": 0.289981079287827, "rewards/equation_reward_func": 0.4114583421032876, "rewards/format_reward_func": 0.9609375149011612, "step": 322 }, { "completion_length": 264.9921979904175, "epoch": 0.1728, "grad_norm": 0.10025221120521255, "kl": 0.24884033203125, "learning_rate": 9.614772796912681e-08, "loss": 0.0002, "reward": 1.398437537252903, "reward_std": 0.21295037120580673, "rewards/equation_reward_func": 0.4296875102445483, "rewards/format_reward_func": 0.9687500111758709, "step": 324 }, { "completion_length": 283.9114661216736, "epoch": 0.17386666666666667, "grad_norm": 0.08393060980669469, "kl": 0.2662353515625, "learning_rate": 9.332409062130686e-08, "loss": 0.0003, "reward": 1.3046875298023224, "reward_std": 0.211736383382231, "rewards/equation_reward_func": 0.3437500102445483, "rewards/format_reward_func": 0.9609375149011612, "step": 326 }, { "completion_length": 280.97657108306885, "epoch": 0.17493333333333333, "grad_norm": 0.09266235555090595, "kl": 0.26544189453125, "learning_rate": 9.053299057008699e-08, "loss": 0.0003, "reward": 1.3619792014360428, "reward_std": 0.18739549908787012, "rewards/equation_reward_func": 0.40364584513008595, "rewards/format_reward_func": 0.9583333544433117, "step": 328 }, { "completion_length": 266.9140729904175, "epoch": 0.176, "grad_norm": 0.17475099073751835, "kl": 0.24237060546875, "learning_rate": 8.777500745052743e-08, "loss": 0.0002, "reward": 1.4192708879709244, "reward_std": 0.2251653028652072, "rewards/equation_reward_func": 0.45312501629814506, "rewards/format_reward_func": 0.9661458544433117, "step": 330 }, { "completion_length": 281.2734489440918, "epoch": 0.17706666666666668, "grad_norm": 0.11185068411943261, "kl": 0.24456787109375, "learning_rate": 8.505071402020892e-08, "loss": 0.0002, "reward": 1.393229216337204, "reward_std": 0.2644071178510785, "rewards/equation_reward_func": 0.4453125111758709, "rewards/format_reward_func": 0.947916679084301, "step": 332 }, { "completion_length": 283.9192819595337, "epoch": 0.17813333333333334, "grad_norm": 0.14116520705594282, "kl": 0.2410888671875, "learning_rate": 8.236067604028562e-08, "loss": 0.0002, "reward": 1.3723958656191826, "reward_std": 0.2818891149945557, "rewards/equation_reward_func": 0.41406250931322575, "rewards/format_reward_func": 0.9583333544433117, "step": 334 }, { "completion_length": 260.65104579925537, "epoch": 0.1792, "grad_norm": 0.1336225513443869, "kl": 0.239501953125, "learning_rate": 7.970545215799327e-08, "loss": 0.0002, "reward": 1.4869791939854622, "reward_std": 0.28690007980912924, "rewards/equation_reward_func": 0.5390625186264515, "rewards/format_reward_func": 0.9479166902601719, "step": 336 }, { "completion_length": 248.3099012374878, "epoch": 0.18026666666666666, "grad_norm": 0.09856720056681173, "kl": 0.23907470703125, "learning_rate": 7.708559379063204e-08, "loss": 0.0002, "reward": 1.4817708656191826, "reward_std": 0.23133338056504726, "rewards/equation_reward_func": 0.5260416767559946, "rewards/format_reward_func": 0.9557291902601719, "step": 338 }, { "completion_length": 290.5052146911621, "epoch": 0.18133333333333335, "grad_norm": 0.1122615481772805, "kl": 0.24444580078125, "learning_rate": 7.45016450110534e-08, "loss": 0.0002, "reward": 1.2838542014360428, "reward_std": 0.2104581743478775, "rewards/equation_reward_func": 0.32291667233221233, "rewards/format_reward_func": 0.9609375186264515, "step": 340 }, { "completion_length": 284.09896516799927, "epoch": 0.1824, "grad_norm": 0.09965326339693975, "kl": 0.2493896484375, "learning_rate": 7.195414243467029e-08, "loss": 0.0002, "reward": 1.3906250335276127, "reward_std": 0.27104497281834483, "rewards/equation_reward_func": 0.4401041779201478, "rewards/format_reward_func": 0.9505208469927311, "step": 342 }, { "completion_length": 282.057297706604, "epoch": 0.18346666666666667, "grad_norm": 0.08762325381098879, "kl": 0.32757568359375, "learning_rate": 6.944361510801763e-08, "loss": 0.0003, "reward": 1.2994792014360428, "reward_std": 0.23241478390991688, "rewards/equation_reward_func": 0.3385416711680591, "rewards/format_reward_func": 0.9609375186264515, "step": 344 }, { "completion_length": 279.9739685058594, "epoch": 0.18453333333333333, "grad_norm": 0.14683183029957406, "kl": 0.6046142578125, "learning_rate": 6.697058439888283e-08, "loss": 0.0006, "reward": 1.3697917088866234, "reward_std": 0.26870738714933395, "rewards/equation_reward_func": 0.41406251629814506, "rewards/format_reward_func": 0.955729179084301, "step": 346 }, { "completion_length": 270.08594703674316, "epoch": 0.1856, "grad_norm": 0.12276857645312758, "kl": 0.24163818359375, "learning_rate": 6.453556388803288e-08, "loss": 0.0002, "reward": 1.4062500484287739, "reward_std": 0.28894974663853645, "rewards/equation_reward_func": 0.4557291795499623, "rewards/format_reward_func": 0.9505208544433117, "step": 348 }, { "completion_length": 256.07552909851074, "epoch": 0.18666666666666668, "grad_norm": 0.1261473193256241, "kl": 0.29376220703125, "learning_rate": 6.213905926255697e-08, "loss": 0.0003, "reward": 1.4479167014360428, "reward_std": 0.250754666980356, "rewards/equation_reward_func": 0.5078125149011612, "rewards/format_reward_func": 0.9401041902601719, "step": 350 }, { "completion_length": 249.1354274749756, "epoch": 0.18773333333333334, "grad_norm": 0.08506597582252638, "kl": 0.244384765625, "learning_rate": 5.978156821084987e-08, "loss": 0.0002, "reward": 1.432291705161333, "reward_std": 0.19336163811385632, "rewards/equation_reward_func": 0.47916667349636555, "rewards/format_reward_func": 0.9531250260770321, "step": 352 }, { "completion_length": 277.8489685058594, "epoch": 0.1888, "grad_norm": 0.12037895470125451, "kl": 0.23748779296875, "learning_rate": 5.7463580319254853e-08, "loss": 0.0002, "reward": 1.3437500521540642, "reward_std": 0.2519768704660237, "rewards/equation_reward_func": 0.4114583432674408, "rewards/format_reward_func": 0.9322916865348816, "step": 354 }, { "completion_length": 252.59375858306885, "epoch": 0.18986666666666666, "grad_norm": 0.11733297431372698, "kl": 0.239013671875, "learning_rate": 5.518557697039081e-08, "loss": 0.0002, "reward": 1.4557292014360428, "reward_std": 0.2128398958593607, "rewards/equation_reward_func": 0.4921875107102096, "rewards/format_reward_func": 0.9635416828095913, "step": 356 }, { "completion_length": 252.0677137374878, "epoch": 0.19093333333333334, "grad_norm": 0.08775856965094549, "kl": 2.04522705078125, "learning_rate": 5.294803124318145e-08, "loss": 0.0021, "reward": 1.5104167088866234, "reward_std": 0.2261988613754511, "rewards/equation_reward_func": 0.5338541800156236, "rewards/format_reward_func": 0.9765625149011612, "step": 358 }, { "completion_length": 266.70313262939453, "epoch": 0.192, "grad_norm": 0.10933086508784831, "kl": 0.2430419921875, "learning_rate": 5.07514078146106e-08, "loss": 0.0002, "reward": 1.3984375409781933, "reward_std": 0.22465246403589845, "rewards/equation_reward_func": 0.4401041786186397, "rewards/format_reward_func": 0.9583333507180214, "step": 360 }, { "completion_length": 244.33073902130127, "epoch": 0.19306666666666666, "grad_norm": 0.09058401208636457, "kl": 0.2347412109375, "learning_rate": 4.859616286322094e-08, "loss": 0.0002, "reward": 1.4895833730697632, "reward_std": 0.20016511622816324, "rewards/equation_reward_func": 0.5260416809469461, "rewards/format_reward_func": 0.9635416828095913, "step": 362 }, { "completion_length": 254.1927146911621, "epoch": 0.19413333333333332, "grad_norm": 0.06760472710437652, "kl": 0.24163818359375, "learning_rate": 4.648274397437829e-08, "loss": 0.0002, "reward": 1.416666705161333, "reward_std": 0.1794181428849697, "rewards/equation_reward_func": 0.447916679084301, "rewards/format_reward_func": 0.9687500186264515, "step": 364 }, { "completion_length": 265.5052156448364, "epoch": 0.1952, "grad_norm": 0.09934227406541099, "kl": 0.24114990234375, "learning_rate": 4.4411590047320617e-08, "loss": 0.0002, "reward": 1.437500037252903, "reward_std": 0.23727863328531384, "rewards/equation_reward_func": 0.47916667722165585, "rewards/format_reward_func": 0.9583333507180214, "step": 366 }, { "completion_length": 287.24219131469727, "epoch": 0.19626666666666667, "grad_norm": 0.08869368411582416, "kl": 0.2509765625, "learning_rate": 4.2383131204010494e-08, "loss": 0.0003, "reward": 1.3229166977107525, "reward_std": 0.2696537869051099, "rewards/equation_reward_func": 0.36718750558793545, "rewards/format_reward_func": 0.9557291828095913, "step": 368 }, { "completion_length": 250.90886116027832, "epoch": 0.19733333333333333, "grad_norm": 0.11141469624967881, "kl": 0.2425537109375, "learning_rate": 4.039778869981064e-08, "loss": 0.0002, "reward": 1.408854205161333, "reward_std": 0.2594145955517888, "rewards/equation_reward_func": 0.45572917349636555, "rewards/format_reward_func": 0.9531250223517418, "step": 370 }, { "completion_length": 288.8619861602783, "epoch": 0.1984, "grad_norm": 0.09328173881518842, "kl": 0.260009765625, "learning_rate": 3.845597483600049e-08, "loss": 0.0003, "reward": 1.2708333618938923, "reward_std": 0.24974829843267798, "rewards/equation_reward_func": 0.3281250046566129, "rewards/format_reward_func": 0.942708358168602, "step": 372 }, { "completion_length": 257.04167652130127, "epoch": 0.19946666666666665, "grad_norm": 0.11987502766296552, "kl": 0.26214599609375, "learning_rate": 3.655809287415284e-08, "loss": 0.0003, "reward": 1.4140625521540642, "reward_std": 0.23185446253046393, "rewards/equation_reward_func": 0.45572918001562357, "rewards/format_reward_func": 0.9583333544433117, "step": 374 }, { "completion_length": 246.8671932220459, "epoch": 0.20053333333333334, "grad_norm": 0.07938676127449044, "kl": 0.2530517578125, "learning_rate": 3.4704536952387285e-08, "loss": 0.0003, "reward": 1.4531250298023224, "reward_std": 0.2485762145370245, "rewards/equation_reward_func": 0.4843750090803951, "rewards/format_reward_func": 0.9687500186264515, "step": 376 }, { "completion_length": 271.40625762939453, "epoch": 0.2016, "grad_norm": 0.10397425885690677, "kl": 0.253662109375, "learning_rate": 3.2895692003518575e-08, "loss": 0.0003, "reward": 1.372395884245634, "reward_std": 0.2290022149682045, "rewards/equation_reward_func": 0.419270847691223, "rewards/format_reward_func": 0.9531250149011612, "step": 378 }, { "completion_length": 277.70313262939453, "epoch": 0.20266666666666666, "grad_norm": 0.11176010513775461, "kl": 0.2552490234375, "learning_rate": 3.113193367511635e-08, "loss": 0.0003, "reward": 1.3489583693444729, "reward_std": 0.3045574314892292, "rewards/equation_reward_func": 0.4114583481568843, "rewards/format_reward_func": 0.9375000186264515, "step": 380 }, { "completion_length": 264.783860206604, "epoch": 0.20373333333333332, "grad_norm": 0.09633127157125651, "kl": 0.2548828125, "learning_rate": 2.9413628251493934e-08, "loss": 0.0003, "reward": 1.3932292014360428, "reward_std": 0.26427287235856056, "rewards/equation_reward_func": 0.4401041737291962, "rewards/format_reward_func": 0.9531250111758709, "step": 382 }, { "completion_length": 249.04167366027832, "epoch": 0.2048, "grad_norm": 0.06042361226548213, "kl": 0.25054931640625, "learning_rate": 2.774113257764066e-08, "loss": 0.0003, "reward": 1.4322917088866234, "reward_std": 0.19768574135378003, "rewards/equation_reward_func": 0.4817708507180214, "rewards/format_reward_func": 0.9505208469927311, "step": 384 }, { "completion_length": 289.47136306762695, "epoch": 0.20586666666666667, "grad_norm": 0.09636095745621918, "kl": 0.24542236328125, "learning_rate": 2.611479398511518e-08, "loss": 0.0002, "reward": 1.291666705161333, "reward_std": 0.22107936535030603, "rewards/equation_reward_func": 0.3463541760575026, "rewards/format_reward_func": 0.9453125186264515, "step": 386 }, { "completion_length": 247.9687581062317, "epoch": 0.20693333333333333, "grad_norm": 0.10748505650467376, "kl": 0.2657470703125, "learning_rate": 2.4534950219914057e-08, "loss": 0.0003, "reward": 1.494791705161333, "reward_std": 0.24816493690013885, "rewards/equation_reward_func": 0.5312500125728548, "rewards/format_reward_func": 0.9635416828095913, "step": 388 }, { "completion_length": 248.68490505218506, "epoch": 0.208, "grad_norm": 0.11757891850912854, "kl": 0.2335205078125, "learning_rate": 2.300192937233128e-08, "loss": 0.0002, "reward": 1.4505208656191826, "reward_std": 0.22064228588715196, "rewards/equation_reward_func": 0.4895833458285779, "rewards/format_reward_func": 0.9609375186264515, "step": 390 }, { "completion_length": 271.49480056762695, "epoch": 0.20906666666666668, "grad_norm": 0.07918511324806074, "kl": 0.23931884765625, "learning_rate": 2.1516049808822935e-08, "loss": 0.0002, "reward": 1.3515625409781933, "reward_std": 0.18755131447687745, "rewards/equation_reward_func": 0.38020834140479565, "rewards/format_reward_func": 0.9713541828095913, "step": 392 }, { "completion_length": 259.51563835144043, "epoch": 0.21013333333333334, "grad_norm": 0.1495234231858708, "kl": 0.241455078125, "learning_rate": 2.007762010589098e-08, "loss": 0.0002, "reward": 1.4947917088866234, "reward_std": 0.33302151458337903, "rewards/equation_reward_func": 0.5468750111758709, "rewards/format_reward_func": 0.9479166828095913, "step": 394 }, { "completion_length": 280.27344608306885, "epoch": 0.2112, "grad_norm": 0.10448152858384566, "kl": 0.25335693359375, "learning_rate": 1.8686938986000627e-08, "loss": 0.0003, "reward": 1.3593750335276127, "reward_std": 0.2329879915341735, "rewards/equation_reward_func": 0.3984375107102096, "rewards/format_reward_func": 0.9609375186264515, "step": 396 }, { "completion_length": 243.24479961395264, "epoch": 0.21226666666666666, "grad_norm": 0.10158686561243806, "kl": 0.26129150390625, "learning_rate": 1.734429525554365e-08, "loss": 0.0003, "reward": 1.5312500447034836, "reward_std": 0.26366367703303695, "rewards/equation_reward_func": 0.5677083469927311, "rewards/format_reward_func": 0.9635416828095913, "step": 398 }, { "completion_length": 262.8463611602783, "epoch": 0.21333333333333335, "grad_norm": 0.0879686678616527, "kl": 0.23956298828125, "learning_rate": 1.604996774486145e-08, "loss": 0.0002, "reward": 1.4479167088866234, "reward_std": 0.2433197470381856, "rewards/equation_reward_func": 0.4973958428017795, "rewards/format_reward_func": 0.9505208544433117, "step": 400 }, { "completion_length": 255.82292366027832, "epoch": 0.2144, "grad_norm": 0.09500435271087032, "kl": 0.23638916015625, "learning_rate": 1.4804225250339281e-08, "loss": 0.0002, "reward": 1.4192708730697632, "reward_std": 0.23259615385904908, "rewards/equation_reward_func": 0.4609375165309757, "rewards/format_reward_func": 0.9583333469927311, "step": 402 }, { "completion_length": 217.63802671432495, "epoch": 0.21546666666666667, "grad_norm": 0.10034531857683562, "kl": 0.2501220703125, "learning_rate": 1.360732647858498e-08, "loss": 0.0003, "reward": 1.5156250335276127, "reward_std": 0.17062418861314654, "rewards/equation_reward_func": 0.5442708432674408, "rewards/format_reward_func": 0.9713541865348816, "step": 404 }, { "completion_length": 234.06771516799927, "epoch": 0.21653333333333333, "grad_norm": 0.12227115923971459, "kl": 0.248779296875, "learning_rate": 1.2459519992702311e-08, "loss": 0.0002, "reward": 1.4921875298023224, "reward_std": 0.23618489829823375, "rewards/equation_reward_func": 0.5234375111758709, "rewards/format_reward_func": 0.9687500149011612, "step": 406 }, { "completion_length": 243.70052909851074, "epoch": 0.2176, "grad_norm": 0.15926056972625335, "kl": 0.27069091796875, "learning_rate": 1.1361044160671629e-08, "loss": 0.0003, "reward": 1.4505208618938923, "reward_std": 0.28682674188166857, "rewards/equation_reward_func": 0.5052083535119891, "rewards/format_reward_func": 0.9453125149011612, "step": 408 }, { "completion_length": 275.8385486602783, "epoch": 0.21866666666666668, "grad_norm": 0.12757068910568817, "kl": 0.24969482421875, "learning_rate": 1.0312127105846947e-08, "loss": 0.0002, "reward": 1.3645833730697632, "reward_std": 0.21345845330506563, "rewards/equation_reward_func": 0.40104167675599456, "rewards/format_reward_func": 0.9635416828095913, "step": 410 }, { "completion_length": 287.200532913208, "epoch": 0.21973333333333334, "grad_norm": 0.11457759488995656, "kl": 0.244384765625, "learning_rate": 9.312986659581301e-09, "loss": 0.0002, "reward": 1.3229166977107525, "reward_std": 0.21469376189634204, "rewards/equation_reward_func": 0.3593750111758709, "rewards/format_reward_func": 0.9635416865348816, "step": 412 }, { "completion_length": 265.48438358306885, "epoch": 0.2208, "grad_norm": 0.12852298602657852, "kl": 0.2640380859375, "learning_rate": 8.363830315988945e-09, "loss": 0.0003, "reward": 1.3463541828095913, "reward_std": 0.23709475807845592, "rewards/equation_reward_func": 0.39583334093913436, "rewards/format_reward_func": 0.9505208507180214, "step": 414 }, { "completion_length": 271.54688358306885, "epoch": 0.22186666666666666, "grad_norm": 0.08478231012580131, "kl": 0.28369140625, "learning_rate": 7.46485518885462e-09, "loss": 0.0003, "reward": 1.3489583693444729, "reward_std": 0.22044954542070627, "rewards/equation_reward_func": 0.3932291807141155, "rewards/format_reward_func": 0.9557291828095913, "step": 416 }, { "completion_length": 243.90625667572021, "epoch": 0.22293333333333334, "grad_norm": 0.10846557765273872, "kl": 0.24072265625, "learning_rate": 6.616247970698319e-09, "loss": 0.0002, "reward": 1.533854205161333, "reward_std": 0.2181540415622294, "rewards/equation_reward_func": 0.5598958469927311, "rewards/format_reward_func": 0.9739583469927311, "step": 418 }, { "completion_length": 264.71094608306885, "epoch": 0.224, "grad_norm": 0.1142319675311567, "kl": 0.261962890625, "learning_rate": 5.8181848940044855e-09, "loss": 0.0003, "reward": 1.4114583805203438, "reward_std": 0.22676061373203993, "rewards/equation_reward_func": 0.4635416807141155, "rewards/format_reward_func": 0.9479166828095913, "step": 420 }, { "completion_length": 229.906259059906, "epoch": 0.22506666666666666, "grad_norm": 0.09559713141008308, "kl": 0.2418212890625, "learning_rate": 5.070831694623135e-09, "loss": 0.0002, "reward": 1.531250037252903, "reward_std": 0.19829656789079309, "rewards/equation_reward_func": 0.5703125149011612, "rewards/format_reward_func": 0.9609375186264515, "step": 422 }, { "completion_length": 248.05209159851074, "epoch": 0.22613333333333333, "grad_norm": 0.11040696978140259, "kl": 0.24212646484375, "learning_rate": 4.374343577351336e-09, "loss": 0.0002, "reward": 1.4192708656191826, "reward_std": 0.27204828383401036, "rewards/equation_reward_func": 0.4531250123400241, "rewards/format_reward_func": 0.9661458544433117, "step": 424 }, { "completion_length": 257.89584159851074, "epoch": 0.2272, "grad_norm": 0.0992000332189083, "kl": 0.24200439453125, "learning_rate": 3.7288651837012745e-09, "loss": 0.0002, "reward": 1.4062500298023224, "reward_std": 0.2646353510208428, "rewards/equation_reward_func": 0.45312501094304025, "rewards/format_reward_func": 0.9531250223517418, "step": 426 }, { "completion_length": 231.69792461395264, "epoch": 0.22826666666666667, "grad_norm": 0.1407050044165881, "kl": 0.27252197265625, "learning_rate": 3.134530561862081e-09, "loss": 0.0003, "reward": 1.4869792014360428, "reward_std": 0.15594792971387506, "rewards/equation_reward_func": 0.5104166809469461, "rewards/format_reward_func": 0.9765625149011612, "step": 428 }, { "completion_length": 276.9817762374878, "epoch": 0.22933333333333333, "grad_norm": 0.1167791204621414, "kl": 0.24884033203125, "learning_rate": 2.5914631388619103e-09, "loss": 0.0002, "reward": 1.3463542088866234, "reward_std": 0.21618649549782276, "rewards/equation_reward_func": 0.3906250107102096, "rewards/format_reward_func": 0.9557291902601719, "step": 430 }, { "completion_length": 249.96094417572021, "epoch": 0.2304, "grad_norm": 0.10179382560252617, "kl": 0.24853515625, "learning_rate": 2.0997756949353297e-09, "loss": 0.0002, "reward": 1.4817708656191826, "reward_std": 0.20142082124948502, "rewards/equation_reward_func": 0.513020845130086, "rewards/format_reward_func": 0.9687500111758709, "step": 432 }, { "completion_length": 304.00261211395264, "epoch": 0.23146666666666665, "grad_norm": 0.12633771333357205, "kl": 0.2705078125, "learning_rate": 1.6595703401020844e-09, "loss": 0.0003, "reward": 1.2734375409781933, "reward_std": 0.2817671154625714, "rewards/equation_reward_func": 0.3307291779201478, "rewards/format_reward_func": 0.9427083507180214, "step": 434 }, { "completion_length": 244.33594417572021, "epoch": 0.23253333333333334, "grad_norm": 0.12174371002417166, "kl": 0.24542236328125, "learning_rate": 1.2709384929615596e-09, "loss": 0.0002, "reward": 1.4817708805203438, "reward_std": 0.24246670864522457, "rewards/equation_reward_func": 0.5208333441987634, "rewards/format_reward_func": 0.9609375186264515, "step": 436 }, { "completion_length": 274.6692781448364, "epoch": 0.2336, "grad_norm": 0.08153629624949502, "kl": 0.2364501953125, "learning_rate": 9.339608617077165e-10, "loss": 0.0002, "reward": 1.3958333730697632, "reward_std": 0.18359084147959948, "rewards/equation_reward_func": 0.44531250884756446, "rewards/format_reward_func": 0.9505208544433117, "step": 438 }, { "completion_length": 271.5208406448364, "epoch": 0.23466666666666666, "grad_norm": 0.10062195336090982, "kl": 0.25860595703125, "learning_rate": 6.487074273681114e-10, "loss": 0.0003, "reward": 1.3567708730697632, "reward_std": 0.2888470063917339, "rewards/equation_reward_func": 0.4062500102445483, "rewards/format_reward_func": 0.950520858168602, "step": 440 }, { "completion_length": 279.31511306762695, "epoch": 0.23573333333333332, "grad_norm": 0.11347953554392516, "kl": 0.27435302734375, "learning_rate": 4.152374292708538e-10, "loss": 0.0003, "reward": 1.328125037252903, "reward_std": 0.2600484313443303, "rewards/equation_reward_func": 0.3723958460614085, "rewards/format_reward_func": 0.9557291902601719, "step": 442 }, { "completion_length": 229.1927137374878, "epoch": 0.2368, "grad_norm": 0.08761777334438094, "kl": 0.23480224609375, "learning_rate": 2.3359935274214204e-10, "loss": 0.0002, "reward": 1.5416667088866234, "reward_std": 0.19063151394948363, "rewards/equation_reward_func": 0.5651041842065752, "rewards/format_reward_func": 0.9765625149011612, "step": 444 }, { "completion_length": 269.6927146911621, "epoch": 0.23786666666666667, "grad_norm": 0.08325007668726372, "kl": 0.24908447265625, "learning_rate": 1.0383091903720665e-10, "loss": 0.0002, "reward": 1.3802083730697632, "reward_std": 0.19487999146804214, "rewards/equation_reward_func": 0.41927084675990045, "rewards/format_reward_func": 0.9609375223517418, "step": 446 }, { "completion_length": 252.57813453674316, "epoch": 0.23893333333333333, "grad_norm": 0.07984790038875238, "kl": 0.24322509765625, "learning_rate": 2.595907750671533e-11, "loss": 0.0002, "reward": 1.4505208730697632, "reward_std": 0.1807808456942439, "rewards/equation_reward_func": 0.47656251257285476, "rewards/format_reward_func": 0.9739583469927311, "step": 448 }, { "completion_length": 277.79948806762695, "epoch": 0.24, "grad_norm": 0.11593052361546653, "kl": 0.26580810546875, "learning_rate": 0.0, "loss": 0.0003, "reward": 1.3541667088866234, "reward_std": 0.2557070981711149, "rewards/equation_reward_func": 0.40364584303461015, "rewards/format_reward_func": 0.9505208469927311, "step": 450 }, { "epoch": 0.24, "step": 450, "total_flos": 0.0, "train_loss": 0.00020930594997387746, "train_runtime": 20107.4364, "train_samples_per_second": 0.537, "train_steps_per_second": 0.022 } ], "logging_steps": 2, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }