{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5599626691553896, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 502.7198944091797, "epoch": 0.002488722974023954, "grad_norm": 0.10407081592214139, "kl": 0.0, "learning_rate": 7.142857142857142e-08, "loss": -0.0, "reward": 0.30915180034935474, "reward_std": 0.45076136849820614, "rewards/equation_reward_func": 0.06473214633297175, "rewards/format_reward_func": 0.24441965389996767, "step": 2 }, { "completion_length": 493.8493537902832, "epoch": 0.004977445948047908, "grad_norm": 0.09927009002301451, "kl": 0.00039756298065185547, "learning_rate": 1.4285714285714285e-07, "loss": 0.0, "reward": 0.3638393059372902, "reward_std": 0.49667514115571976, "rewards/equation_reward_func": 0.06696428824216127, "rewards/format_reward_func": 0.2968750149011612, "step": 4 }, { "completion_length": 497.8616313934326, "epoch": 0.007466168922071862, "grad_norm": 0.093370113106332, "kl": 0.0003921985626220703, "learning_rate": 2.1428571428571426e-07, "loss": 0.0, "reward": 0.3147321604192257, "reward_std": 0.45942891016602516, "rewards/equation_reward_func": 0.06138393213041127, "rewards/format_reward_func": 0.25334822479635477, "step": 6 }, { "completion_length": 498.6730136871338, "epoch": 0.009954891896095816, "grad_norm": 0.087715911930183, "kl": 0.00040519237518310547, "learning_rate": 2.857142857142857e-07, "loss": 0.0, "reward": 0.3671875223517418, "reward_std": 0.510881919413805, "rewards/equation_reward_func": 0.07589286006987095, "rewards/format_reward_func": 0.2912946566939354, "step": 8 }, { "completion_length": 506.512300491333, "epoch": 0.01244361487011977, "grad_norm": 0.1032540463580205, "kl": 0.00044214725494384766, "learning_rate": 3.5714285714285716e-07, "loss": 0.0, "reward": 0.3303571604192257, "reward_std": 0.48546610213816166, "rewards/equation_reward_func": 0.06026785948779434, "rewards/format_reward_func": 0.2700892947614193, "step": 10 }, { "completion_length": 501.3951072692871, "epoch": 0.014932337844143724, "grad_norm": 0.10817803852285492, "kl": 0.0004922151565551758, "learning_rate": 4.285714285714285e-07, "loss": 0.0, "reward": 0.33258930407464504, "reward_std": 0.4802868850529194, "rewards/equation_reward_func": 0.03906250197906047, "rewards/format_reward_func": 0.2935267984867096, "step": 12 }, { "completion_length": 471.93529510498047, "epoch": 0.017421060818167678, "grad_norm": 0.09644558381581954, "kl": 0.0010251998901367188, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.510044664144516, "reward_std": 0.5347170457243919, "rewards/equation_reward_func": 0.061383931897580624, "rewards/format_reward_func": 0.4486607387661934, "step": 14 }, { "completion_length": 457.7031478881836, "epoch": 0.019909783792191632, "grad_norm": 0.09293320344371607, "kl": 0.0017347335815429688, "learning_rate": 4.999740409224932e-07, "loss": 0.0, "reward": 0.5535714514553547, "reward_std": 0.5403263326734304, "rewards/equation_reward_func": 0.05357143096625805, "rewards/format_reward_func": 0.5000000223517418, "step": 16 }, { "completion_length": 449.1908645629883, "epoch": 0.022398506766215586, "grad_norm": 0.07268783989009871, "kl": 0.0044155120849609375, "learning_rate": 4.998961690809627e-07, "loss": 0.0, "reward": 0.7812500447034836, "reward_std": 0.5079520158469677, "rewards/equation_reward_func": 0.08258928905706853, "rewards/format_reward_func": 0.6986607424914837, "step": 18 }, { "completion_length": 439.35828018188477, "epoch": 0.02488722974023954, "grad_norm": 0.07030613994163097, "kl": 0.007472991943359375, "learning_rate": 4.997664006472578e-07, "loss": 0.0, "reward": 0.8560268208384514, "reward_std": 0.48480591736733913, "rewards/equation_reward_func": 0.09933036286383867, "rewards/format_reward_func": 0.7566964626312256, "step": 20 }, { "completion_length": 448.36609268188477, "epoch": 0.027375952714263494, "grad_norm": 0.0688012680174, "kl": 0.0070819854736328125, "learning_rate": 4.995847625707292e-07, "loss": 0.0, "reward": 0.856026828289032, "reward_std": 0.45433524437248707, "rewards/equation_reward_func": 0.07477678905706853, "rewards/format_reward_func": 0.7812500298023224, "step": 22 }, { "completion_length": 449.35046577453613, "epoch": 0.029864675688287448, "grad_norm": 0.06658434704593806, "kl": 0.007633209228515625, "learning_rate": 4.993512925726318e-07, "loss": 0.0, "reward": 0.9084821790456772, "reward_std": 0.42067001201212406, "rewards/equation_reward_func": 0.0792410756694153, "rewards/format_reward_func": 0.8292411155998707, "step": 24 }, { "completion_length": 451.9955539703369, "epoch": 0.0323533986623114, "grad_norm": 0.059102019385286024, "kl": 0.008182525634765625, "learning_rate": 4.990660391382923e-07, "loss": 0.0, "reward": 0.9698661118745804, "reward_std": 0.4051512759178877, "rewards/equation_reward_func": 0.10714286286383867, "rewards/format_reward_func": 0.8627232685685158, "step": 26 }, { "completion_length": 436.1049270629883, "epoch": 0.034842121636335356, "grad_norm": 0.0505975332867349, "kl": 0.010929107666015625, "learning_rate": 4.987290615070384e-07, "loss": 0.0, "reward": 0.9866071864962578, "reward_std": 0.30416737496852875, "rewards/equation_reward_func": 0.07366071757860482, "rewards/format_reward_func": 0.9129464700818062, "step": 28 }, { "completion_length": 415.63952827453613, "epoch": 0.03733084461035931, "grad_norm": 0.05617076097244933, "kl": 0.01288604736328125, "learning_rate": 4.983404296598978e-07, "loss": 0.0, "reward": 1.0814732611179352, "reward_std": 0.3300146460533142, "rewards/equation_reward_func": 0.1395089360885322, "rewards/format_reward_func": 0.9419643245637417, "step": 30 }, { "completion_length": 406.3560447692871, "epoch": 0.039819567584383264, "grad_norm": 0.04537480362751575, "kl": 0.015163421630859375, "learning_rate": 4.979002243050646e-07, "loss": 0.0, "reward": 1.0691964663565159, "reward_std": 0.25317980349063873, "rewards/equation_reward_func": 0.10602679057046771, "rewards/format_reward_func": 0.9631696753203869, "step": 32 }, { "completion_length": 420.8080539703369, "epoch": 0.042308290558407215, "grad_norm": 0.05671737906093819, "kl": 0.015956878662109375, "learning_rate": 4.974085368611381e-07, "loss": 0.0, "reward": 1.0703125521540642, "reward_std": 0.28108513820916414, "rewards/equation_reward_func": 0.1138392926659435, "rewards/format_reward_func": 0.956473246216774, "step": 34 }, { "completion_length": 390.3638553619385, "epoch": 0.04479701353243117, "grad_norm": 0.057276721518591346, "kl": 0.0186004638671875, "learning_rate": 4.968654694381379e-07, "loss": 0.0, "reward": 1.1171875521540642, "reward_std": 0.2574427053332329, "rewards/equation_reward_func": 0.13616072107106447, "rewards/format_reward_func": 0.9810268059372902, "step": 36 }, { "completion_length": 388.2288131713867, "epoch": 0.04728573650645512, "grad_norm": 0.05608729427224838, "kl": 0.025173187255859375, "learning_rate": 4.962711348162987e-07, "loss": 0.0, "reward": 1.0770089775323868, "reward_std": 0.2866173293441534, "rewards/equation_reward_func": 0.112723219092004, "rewards/format_reward_func": 0.9642857387661934, "step": 38 }, { "completion_length": 373.26229095458984, "epoch": 0.04977445948047908, "grad_norm": 0.05668420494869164, "kl": 0.02100372314453125, "learning_rate": 4.956256564226487e-07, "loss": 0.0, "reward": 1.1049107648432255, "reward_std": 0.254839526489377, "rewards/equation_reward_func": 0.1272321477299556, "rewards/format_reward_func": 0.9776786081492901, "step": 40 }, { "completion_length": 381.3013553619385, "epoch": 0.05226318245450303, "grad_norm": 0.05384466374487655, "kl": 0.02074432373046875, "learning_rate": 4.949291683053768e-07, "loss": 0.0, "reward": 1.0825893394649029, "reward_std": 0.23148810863494873, "rewards/equation_reward_func": 0.10714286239817739, "rewards/format_reward_func": 0.9754464626312256, "step": 42 }, { "completion_length": 374.8694362640381, "epoch": 0.05475190542852699, "grad_norm": 0.05693779183820393, "kl": 0.0219879150390625, "learning_rate": 4.941818151059955e-07, "loss": 0.0, "reward": 1.0959821939468384, "reward_std": 0.23808866180479527, "rewards/equation_reward_func": 0.11607143329456449, "rewards/format_reward_func": 0.9799107499420643, "step": 44 }, { "completion_length": 357.81028175354004, "epoch": 0.05724062840255094, "grad_norm": 0.050061664583350464, "kl": 0.025909423828125, "learning_rate": 4.933837520293017e-07, "loss": 0.0, "reward": 1.1026786416769028, "reward_std": 0.22216346580535173, "rewards/equation_reward_func": 0.1160714344587177, "rewards/format_reward_func": 0.9866071753203869, "step": 46 }, { "completion_length": 376.34823417663574, "epoch": 0.059729351376574896, "grad_norm": 0.05730255508966384, "kl": 0.0270843505859375, "learning_rate": 4.925351448111454e-07, "loss": 0.0, "reward": 1.1517857685685158, "reward_std": 0.2740681590512395, "rewards/equation_reward_func": 0.17299108253791928, "rewards/format_reward_func": 0.9787946753203869, "step": 48 }, { "completion_length": 355.76564025878906, "epoch": 0.06221807435059885, "grad_norm": 0.052885138884175203, "kl": 0.03263092041015625, "learning_rate": 4.91636169684011e-07, "loss": 0.0, "reward": 1.13058041036129, "reward_std": 0.2741837967187166, "rewards/equation_reward_func": 0.1551339344587177, "rewards/format_reward_func": 0.9754464514553547, "step": 50 }, { "completion_length": 333.06139945983887, "epoch": 0.0647067973246228, "grad_norm": 0.06133859386576639, "kl": 0.0342864990234375, "learning_rate": 4.906870133404186e-07, "loss": 0.0, "reward": 1.14620541036129, "reward_std": 0.26373843336477876, "rewards/equation_reward_func": 0.15959822433069348, "rewards/format_reward_func": 0.9866071678698063, "step": 52 }, { "completion_length": 359.7198848724365, "epoch": 0.06719552029864675, "grad_norm": 0.05999061818317411, "kl": 0.035614013671875, "learning_rate": 4.896878728941531e-07, "loss": 0.0, "reward": 1.13058041036129, "reward_std": 0.30478311236947775, "rewards/equation_reward_func": 0.16741072200238705, "rewards/format_reward_func": 0.9631696715950966, "step": 54 }, { "completion_length": 339.29354095458984, "epoch": 0.06968424327267071, "grad_norm": 0.05830073958166394, "kl": 0.0386199951171875, "learning_rate": 4.886389558393284e-07, "loss": 0.0, "reward": 1.1930803954601288, "reward_std": 0.289231700822711, "rewards/equation_reward_func": 0.20982143562287092, "rewards/format_reward_func": 0.9832589514553547, "step": 56 }, { "completion_length": 356.0680980682373, "epoch": 0.07217296624669467, "grad_norm": 0.0544928686410873, "kl": 0.041656494140625, "learning_rate": 4.875404800072976e-07, "loss": 0.0, "reward": 1.1406250596046448, "reward_std": 0.22303624730557203, "rewards/equation_reward_func": 0.15625000721774995, "rewards/format_reward_func": 0.984375037252903, "step": 58 }, { "completion_length": 340.20537185668945, "epoch": 0.07466168922071861, "grad_norm": 0.05744987372930731, "kl": 0.0499420166015625, "learning_rate": 4.86392673521415e-07, "loss": 0.0, "reward": 1.1741071939468384, "reward_std": 0.27861948776990175, "rewards/equation_reward_func": 0.184151794295758, "rewards/format_reward_func": 0.9899553768336773, "step": 60 }, { "completion_length": 340.42635345458984, "epoch": 0.07715041219474257, "grad_norm": 0.061800785648734306, "kl": 0.04815673828125, "learning_rate": 4.851957747496606e-07, "loss": 0.0, "reward": 1.2120536342263222, "reward_std": 0.2723410092294216, "rewards/equation_reward_func": 0.22656251164153218, "rewards/format_reward_func": 0.9854911006987095, "step": 62 }, { "completion_length": 360.4308204650879, "epoch": 0.07963913516876653, "grad_norm": 0.054162346491789554, "kl": 0.05035400390625, "learning_rate": 4.839500322551386e-07, "loss": 0.0001, "reward": 1.1752232611179352, "reward_std": 0.2869079066440463, "rewards/equation_reward_func": 0.19419643748551607, "rewards/format_reward_func": 0.9810268133878708, "step": 64 }, { "completion_length": 350.62166595458984, "epoch": 0.08212785814279049, "grad_norm": 0.06519334959182187, "kl": 0.0565338134765625, "learning_rate": 4.826557047444563e-07, "loss": 0.0001, "reward": 1.1953125670552254, "reward_std": 0.30847367085516453, "rewards/equation_reward_func": 0.21763393934816122, "rewards/format_reward_func": 0.9776786006987095, "step": 66 }, { "completion_length": 353.1428756713867, "epoch": 0.08461658111681443, "grad_norm": 0.06183291246364765, "kl": 0.0570526123046875, "learning_rate": 4.813130610139993e-07, "loss": 0.0001, "reward": 1.2187500447034836, "reward_std": 0.31583766732364893, "rewards/equation_reward_func": 0.23549107927829027, "rewards/format_reward_func": 0.9832589626312256, "step": 68 }, { "completion_length": 374.2712211608887, "epoch": 0.08710530409083839, "grad_norm": 0.057224664305537236, "kl": 0.0590667724609375, "learning_rate": 4.799223798941089e-07, "loss": 0.0001, "reward": 1.188616119325161, "reward_std": 0.3013247288763523, "rewards/equation_reward_func": 0.21875001047737896, "rewards/format_reward_func": 0.9698660969734192, "step": 70 }, { "completion_length": 371.8750114440918, "epoch": 0.08959402706486234, "grad_norm": 0.058154083828801245, "kl": 0.064361572265625, "learning_rate": 4.78483950191177e-07, "loss": 0.0001, "reward": 1.2187500447034836, "reward_std": 0.32332514226436615, "rewards/equation_reward_func": 0.2433035826543346, "rewards/format_reward_func": 0.9754464589059353, "step": 72 }, { "completion_length": 365.0033664703369, "epoch": 0.0920827500388863, "grad_norm": 0.05964598037277607, "kl": 0.0686798095703125, "learning_rate": 4.769980706276687e-07, "loss": 0.0001, "reward": 1.243303619325161, "reward_std": 0.3098542420193553, "rewards/equation_reward_func": 0.2622767958091572, "rewards/format_reward_func": 0.9810268133878708, "step": 74 }, { "completion_length": 369.5089473724365, "epoch": 0.09457147301291025, "grad_norm": 0.051790769083910046, "kl": 0.074310302734375, "learning_rate": 4.7546504978008595e-07, "loss": 0.0001, "reward": 1.2645089775323868, "reward_std": 0.30733966641128063, "rewards/equation_reward_func": 0.2890625153668225, "rewards/format_reward_func": 0.9754464700818062, "step": 76 }, { "completion_length": 384.2756881713867, "epoch": 0.0970601959869342, "grad_norm": 0.05484455622025143, "kl": 0.07171630859375, "learning_rate": 4.738852060148848e-07, "loss": 0.0001, "reward": 1.250000063329935, "reward_std": 0.3137952769175172, "rewards/equation_reward_func": 0.2756696529686451, "rewards/format_reward_func": 0.9743303991854191, "step": 78 }, { "completion_length": 408.2500171661377, "epoch": 0.09954891896095816, "grad_norm": 0.0574310858584145, "kl": 0.075714111328125, "learning_rate": 4.722588674223593e-07, "loss": 0.0001, "reward": 1.2343750521540642, "reward_std": 0.3266843259334564, "rewards/equation_reward_func": 0.2801339440047741, "rewards/format_reward_func": 0.9542410969734192, "step": 80 }, { "completion_length": 418.1774711608887, "epoch": 0.10203764193498212, "grad_norm": 0.04959040686750888, "kl": 0.07958984375, "learning_rate": 4.70586371748506e-07, "loss": 0.0001, "reward": 1.2377232685685158, "reward_std": 0.3039839044213295, "rewards/equation_reward_func": 0.27455358440056443, "rewards/format_reward_func": 0.9631696678698063, "step": 82 }, { "completion_length": 392.39064598083496, "epoch": 0.10452636490900606, "grad_norm": 0.05828296572641528, "kl": 0.086578369140625, "learning_rate": 4.6886806632488363e-07, "loss": 0.0001, "reward": 1.2935268506407738, "reward_std": 0.3257301030680537, "rewards/equation_reward_func": 0.32254465483129025, "rewards/format_reward_func": 0.9709821790456772, "step": 84 }, { "completion_length": 393.9118461608887, "epoch": 0.10701508788303002, "grad_norm": 0.05583356462661088, "kl": 0.100616455078125, "learning_rate": 4.6710430799648143e-07, "loss": 0.0001, "reward": 1.2979911342263222, "reward_std": 0.2977573899552226, "rewards/equation_reward_func": 0.3370535937137902, "rewards/format_reward_func": 0.9609375298023224, "step": 86 }, { "completion_length": 418.97546768188477, "epoch": 0.10950381085705398, "grad_norm": 0.05984739996252054, "kl": 0.091064453125, "learning_rate": 4.652954630476127e-07, "loss": 0.0001, "reward": 1.2444196939468384, "reward_std": 0.347733149304986, "rewards/equation_reward_func": 0.29241072945296764, "rewards/format_reward_func": 0.9520089663565159, "step": 88 }, { "completion_length": 412.0435428619385, "epoch": 0.11199253383107793, "grad_norm": 0.05993124016275902, "kl": 0.096588134765625, "learning_rate": 4.6344190712584713e-07, "loss": 0.0001, "reward": 1.3292411416769028, "reward_std": 0.36256127804517746, "rewards/equation_reward_func": 0.37388394493609667, "rewards/format_reward_func": 0.9553571864962578, "step": 90 }, { "completion_length": 417.2634086608887, "epoch": 0.11448125680510188, "grad_norm": 0.05382564496764054, "kl": 0.09246826171875, "learning_rate": 4.615440251639995e-07, "loss": 0.0001, "reward": 1.290178619325161, "reward_std": 0.30793262645602226, "rewards/equation_reward_func": 0.33147323224693537, "rewards/format_reward_func": 0.9587053991854191, "step": 92 }, { "completion_length": 443.2835006713867, "epoch": 0.11696997977912583, "grad_norm": 0.06169358639422184, "kl": 0.100860595703125, "learning_rate": 4.596022113001894e-07, "loss": 0.0001, "reward": 1.2935268580913544, "reward_std": 0.35633409582078457, "rewards/equation_reward_func": 0.34821430686861277, "rewards/format_reward_func": 0.9453125447034836, "step": 94 }, { "completion_length": 408.9777011871338, "epoch": 0.11945870275314979, "grad_norm": 0.06289641800144408, "kl": 0.10186767578125, "learning_rate": 4.576168687959895e-07, "loss": 0.0001, "reward": 1.3582589849829674, "reward_std": 0.3258579559624195, "rewards/equation_reward_func": 0.40178573317825794, "rewards/format_reward_func": 0.9564732536673546, "step": 96 }, { "completion_length": 431.306941986084, "epoch": 0.12194742572717375, "grad_norm": 0.05214216522023378, "kl": 0.101165771484375, "learning_rate": 4.555884099526793e-07, "loss": 0.0001, "reward": 1.3091518506407738, "reward_std": 0.28289179550483823, "rewards/equation_reward_func": 0.3560268050059676, "rewards/format_reward_func": 0.9531250260770321, "step": 98 }, { "completion_length": 427.6640796661377, "epoch": 0.1244361487011977, "grad_norm": 0.06593974374565598, "kl": 0.106964111328125, "learning_rate": 4.5351725602562174e-07, "loss": 0.0001, "reward": 1.311383992433548, "reward_std": 0.3198657017201185, "rewards/equation_reward_func": 0.36272323317825794, "rewards/format_reward_func": 0.9486607573926449, "step": 100 }, { "completion_length": 412.4486827850342, "epoch": 0.12692487167522165, "grad_norm": 0.057534299379812, "kl": 0.104766845703125, "learning_rate": 4.514038371367791e-07, "loss": 0.0001, "reward": 1.4162947088479996, "reward_std": 0.2974701370112598, "rewards/equation_reward_func": 0.44531251955777407, "rewards/format_reward_func": 0.9709821678698063, "step": 102 }, { "completion_length": 412.8136329650879, "epoch": 0.1294135946492456, "grad_norm": 0.047305658451965514, "kl": 0.11578369140625, "learning_rate": 4.4924859218538936e-07, "loss": 0.0001, "reward": 1.358258992433548, "reward_std": 0.31219158321619034, "rewards/equation_reward_func": 0.40513394866138697, "rewards/format_reward_func": 0.9531250409781933, "step": 104 }, { "completion_length": 414.66296195983887, "epoch": 0.13190231762326957, "grad_norm": 0.05814046051575021, "kl": 0.118560791015625, "learning_rate": 4.470519687568185e-07, "loss": 0.0001, "reward": 1.3627232760190964, "reward_std": 0.28798825573176146, "rewards/equation_reward_func": 0.3973214505240321, "rewards/format_reward_func": 0.9654018133878708, "step": 106 }, { "completion_length": 408.1540355682373, "epoch": 0.1343910405972935, "grad_norm": 0.056119658206168294, "kl": 0.114013671875, "learning_rate": 4.4481442302960923e-07, "loss": 0.0001, "reward": 1.3459822088479996, "reward_std": 0.27404902782291174, "rewards/equation_reward_func": 0.3761160969734192, "rewards/format_reward_func": 0.9698660969734192, "step": 108 }, { "completion_length": 389.2522506713867, "epoch": 0.13687976357131748, "grad_norm": 0.06337568584346356, "kl": 0.118377685546875, "learning_rate": 4.4253641968074505e-07, "loss": 0.0001, "reward": 1.4229911267757416, "reward_std": 0.2757099084556103, "rewards/equation_reward_func": 0.4609375149011612, "rewards/format_reward_func": 0.9620536044239998, "step": 110 }, { "completion_length": 378.68305015563965, "epoch": 0.13936848654534142, "grad_norm": 0.055648006379385793, "kl": 0.134857177734375, "learning_rate": 4.402184317891501e-07, "loss": 0.0001, "reward": 1.439732201397419, "reward_std": 0.2977797780185938, "rewards/equation_reward_func": 0.46651787497103214, "rewards/format_reward_func": 0.9732143171131611, "step": 112 }, { "completion_length": 407.8962211608887, "epoch": 0.14185720951936537, "grad_norm": 0.0591540848415508, "kl": 0.126220703125, "learning_rate": 4.37860940737443e-07, "loss": 0.0001, "reward": 1.3571429178118706, "reward_std": 0.24285848252475262, "rewards/equation_reward_func": 0.39174109138548374, "rewards/format_reward_func": 0.9654018133878708, "step": 114 }, { "completion_length": 360.5067090988159, "epoch": 0.14434593249338934, "grad_norm": 0.05663015266712956, "kl": 0.133392333984375, "learning_rate": 4.354644361119671e-07, "loss": 0.0001, "reward": 1.4698661342263222, "reward_std": 0.21007388643920422, "rewards/equation_reward_func": 0.49330359790474176, "rewards/format_reward_func": 0.9765625298023224, "step": 116 }, { "completion_length": 376.5457820892334, "epoch": 0.14683465546741328, "grad_norm": 0.05339975098270581, "kl": 0.1224365234375, "learning_rate": 4.3302941560111716e-07, "loss": 0.0001, "reward": 1.434151865541935, "reward_std": 0.21489905659109354, "rewards/equation_reward_func": 0.45647323597222567, "rewards/format_reward_func": 0.9776785932481289, "step": 118 }, { "completion_length": 365.3828287124634, "epoch": 0.14932337844143723, "grad_norm": 0.05910197740570998, "kl": 0.1470947265625, "learning_rate": 4.3055638489198236e-07, "loss": 0.0001, "reward": 1.3984375670552254, "reward_std": 0.2542656445875764, "rewards/equation_reward_func": 0.42857144493609667, "rewards/format_reward_func": 0.9698660969734192, "step": 120 }, { "completion_length": 352.23662662506104, "epoch": 0.1518121014154612, "grad_norm": 0.05864149302937991, "kl": 0.146240234375, "learning_rate": 4.280458575653296e-07, "loss": 0.0001, "reward": 1.456473283469677, "reward_std": 0.18524617236107588, "rewards/equation_reward_func": 0.4754464505240321, "rewards/format_reward_func": 0.9810268171131611, "step": 122 }, { "completion_length": 348.39287090301514, "epoch": 0.15430082438948514, "grad_norm": 0.0663797306790776, "kl": 0.15472412109375, "learning_rate": 4.2549835498894665e-07, "loss": 0.0002, "reward": 1.4453125670552254, "reward_std": 0.23717239312827587, "rewards/equation_reward_func": 0.4732143087312579, "rewards/format_reward_func": 0.972098246216774, "step": 124 }, { "completion_length": 356.01117610931396, "epoch": 0.1567895473635091, "grad_norm": 0.05497099225448875, "kl": 0.13714599609375, "learning_rate": 4.229144062093679e-07, "loss": 0.0001, "reward": 1.4397322051227093, "reward_std": 0.19848236767575145, "rewards/equation_reward_func": 0.46540180779993534, "rewards/format_reward_func": 0.9743303880095482, "step": 126 }, { "completion_length": 381.28573417663574, "epoch": 0.15927827033753306, "grad_norm": 0.05653433573745518, "kl": 0.13873291015625, "learning_rate": 4.2029454784200675e-07, "loss": 0.0001, "reward": 1.3180804327130318, "reward_std": 0.18374477326869965, "rewards/equation_reward_func": 0.34598215762525797, "rewards/format_reward_func": 0.9720982387661934, "step": 128 }, { "completion_length": 343.4129581451416, "epoch": 0.161766993311557, "grad_norm": 0.06565562774179554, "kl": 0.165069580078125, "learning_rate": 4.1763932395971433e-07, "loss": 0.0002, "reward": 1.4308036342263222, "reward_std": 0.16549686901271343, "rewards/equation_reward_func": 0.4441964477300644, "rewards/format_reward_func": 0.9866071678698063, "step": 130 }, { "completion_length": 348.75224781036377, "epoch": 0.16425571628558097, "grad_norm": 0.06462157522635509, "kl": 0.1668701171875, "learning_rate": 4.1494928597979117e-07, "loss": 0.0002, "reward": 1.4352679252624512, "reward_std": 0.1786380740813911, "rewards/equation_reward_func": 0.45424109883606434, "rewards/format_reward_func": 0.9810268096625805, "step": 132 }, { "completion_length": 308.1205520629883, "epoch": 0.16674443925960492, "grad_norm": 0.05800701976146753, "kl": 0.151275634765625, "learning_rate": 4.122249925494726e-07, "loss": 0.0002, "reward": 1.4765625670552254, "reward_std": 0.17193927755579352, "rewards/equation_reward_func": 0.48214288242161274, "rewards/format_reward_func": 0.994419664144516, "step": 134 }, { "completion_length": 346.0546998977661, "epoch": 0.16923316223362886, "grad_norm": 0.04492148649071812, "kl": 0.15283203125, "learning_rate": 4.094670094299131e-07, "loss": 0.0002, "reward": 1.3660714849829674, "reward_std": 0.161542855668813, "rewards/equation_reward_func": 0.38058037986047566, "rewards/format_reward_func": 0.9854911044239998, "step": 136 }, { "completion_length": 331.13618087768555, "epoch": 0.17172188520765283, "grad_norm": 0.2212276658068583, "kl": 0.537384033203125, "learning_rate": 4.066759093786931e-07, "loss": 0.0005, "reward": 1.41964291036129, "reward_std": 0.19885259168222547, "rewards/equation_reward_func": 0.43861609138548374, "rewards/format_reward_func": 0.9810268059372902, "step": 138 }, { "completion_length": 344.59264945983887, "epoch": 0.17421060818167677, "grad_norm": 0.05665698464447795, "kl": 0.151947021484375, "learning_rate": 4.038522720308732e-07, "loss": 0.0002, "reward": 1.3805804178118706, "reward_std": 0.18276562774553895, "rewards/equation_reward_func": 0.39508930686861277, "rewards/format_reward_func": 0.9854910969734192, "step": 140 }, { "completion_length": 328.3872928619385, "epoch": 0.17669933115570072, "grad_norm": 0.05858138418747856, "kl": 0.16424560546875, "learning_rate": 4.009966837786194e-07, "loss": 0.0002, "reward": 1.4631697237491608, "reward_std": 0.21255239751189947, "rewards/equation_reward_func": 0.47879466973245144, "rewards/format_reward_func": 0.9843750335276127, "step": 142 }, { "completion_length": 322.76898860931396, "epoch": 0.1791880541297247, "grad_norm": 0.051814888933446955, "kl": 0.157470703125, "learning_rate": 3.981097376494259e-07, "loss": 0.0002, "reward": 1.4575893506407738, "reward_std": 0.19401998538523912, "rewards/equation_reward_func": 0.4743303768336773, "rewards/format_reward_func": 0.9832589514553547, "step": 144 }, { "completion_length": 333.7712211608887, "epoch": 0.18167677710374863, "grad_norm": 0.05624207982164332, "kl": 0.160125732421875, "learning_rate": 3.951920331829592e-07, "loss": 0.0002, "reward": 1.4207589775323868, "reward_std": 0.15926225204020739, "rewards/equation_reward_func": 0.43861609138548374, "rewards/format_reward_func": 0.9821428842842579, "step": 146 }, { "completion_length": 319.8225564956665, "epoch": 0.1841655000777726, "grad_norm": 0.057600263729418885, "kl": 0.154571533203125, "learning_rate": 3.922441763065506e-07, "loss": 0.0002, "reward": 1.4151786416769028, "reward_std": 0.1929843365214765, "rewards/equation_reward_func": 0.43191966600716114, "rewards/format_reward_func": 0.9832589589059353, "step": 148 }, { "completion_length": 341.7154140472412, "epoch": 0.18665422305179655, "grad_norm": 0.05008851715635548, "kl": 0.17266845703125, "learning_rate": 3.8926677920936093e-07, "loss": 0.0002, "reward": 1.390625074505806, "reward_std": 0.1591361197642982, "rewards/equation_reward_func": 0.40290180686861277, "rewards/format_reward_func": 0.9877232424914837, "step": 150 }, { "completion_length": 308.49889850616455, "epoch": 0.1891429460258205, "grad_norm": 0.04651438810073453, "kl": 0.1688232421875, "learning_rate": 3.862604602152464e-07, "loss": 0.0002, "reward": 1.472098283469677, "reward_std": 0.17588552320376039, "rewards/equation_reward_func": 0.4854910932481289, "rewards/format_reward_func": 0.9866071790456772, "step": 152 }, { "completion_length": 329.714298248291, "epoch": 0.19163166899984446, "grad_norm": 0.05395346273397737, "kl": 0.1552734375, "learning_rate": 3.8322584365434934e-07, "loss": 0.0002, "reward": 1.409598283469677, "reward_std": 0.1818417957983911, "rewards/equation_reward_func": 0.43080359511077404, "rewards/format_reward_func": 0.9787946678698063, "step": 154 }, { "completion_length": 297.5982275009155, "epoch": 0.1941203919738684, "grad_norm": 0.04614157256005193, "kl": 0.17120361328125, "learning_rate": 3.8016355973344173e-07, "loss": 0.0002, "reward": 1.5055804252624512, "reward_std": 0.17592015489935875, "rewards/equation_reward_func": 0.5178571604192257, "rewards/format_reward_func": 0.9877232424914837, "step": 156 }, { "completion_length": 305.8538112640381, "epoch": 0.19660911494789235, "grad_norm": 0.0694599272801652, "kl": 0.2060546875, "learning_rate": 3.7707424440504863e-07, "loss": 0.0002, "reward": 1.4587054252624512, "reward_std": 0.19024762557819486, "rewards/equation_reward_func": 0.47098217345774174, "rewards/format_reward_func": 0.9877232350409031, "step": 158 }, { "completion_length": 326.53796005249023, "epoch": 0.19909783792191632, "grad_norm": 0.06216275817115459, "kl": 0.17303466796875, "learning_rate": 3.739585392353787e-07, "loss": 0.0002, "reward": 1.3604911342263222, "reward_std": 0.16597195249050856, "rewards/equation_reward_func": 0.37946430314332247, "rewards/format_reward_func": 0.9810268208384514, "step": 160 }, { "completion_length": 289.9274663925171, "epoch": 0.20158656089594026, "grad_norm": 0.07236670443181108, "kl": 0.18756103515625, "learning_rate": 3.7081709127108767e-07, "loss": 0.0002, "reward": 1.4866072162985802, "reward_std": 0.2302361116744578, "rewards/equation_reward_func": 0.5033482387661934, "rewards/format_reward_func": 0.9832589589059353, "step": 162 }, { "completion_length": 323.6227798461914, "epoch": 0.20407528386996424, "grad_norm": 0.048051970688120026, "kl": 0.172607421875, "learning_rate": 3.6765055290490513e-07, "loss": 0.0002, "reward": 1.3683036342263222, "reward_std": 0.17599576339125633, "rewards/equation_reward_func": 0.38504466286394745, "rewards/format_reward_func": 0.9832589589059353, "step": 164 }, { "completion_length": 287.5881814956665, "epoch": 0.20656400684398818, "grad_norm": 0.05486464300817734, "kl": 0.18212890625, "learning_rate": 3.644595817401501e-07, "loss": 0.0002, "reward": 1.4843750670552254, "reward_std": 0.17731093661859632, "rewards/equation_reward_func": 0.5011160932481289, "rewards/format_reward_func": 0.9832589626312256, "step": 166 }, { "completion_length": 284.36050605773926, "epoch": 0.20905272981801212, "grad_norm": 0.06808427865251532, "kl": 0.195556640625, "learning_rate": 3.6124484045416483e-07, "loss": 0.0002, "reward": 1.4665179327130318, "reward_std": 0.20187203446403146, "rewards/equation_reward_func": 0.48660716600716114, "rewards/format_reward_func": 0.9799107611179352, "step": 168 }, { "completion_length": 291.9732255935669, "epoch": 0.2115414527920361, "grad_norm": 0.04519403336857609, "kl": 0.18017578125, "learning_rate": 3.580069966606949e-07, "loss": 0.0002, "reward": 1.4084821939468384, "reward_std": 0.19656919362023473, "rewards/equation_reward_func": 0.44308037497103214, "rewards/format_reward_func": 0.9654018208384514, "step": 170 }, { "completion_length": 318.1685400009155, "epoch": 0.21403017576606004, "grad_norm": 0.061705947760315766, "kl": 0.1832275390625, "learning_rate": 3.547467227712444e-07, "loss": 0.0002, "reward": 1.3526786342263222, "reward_std": 0.19870673725381494, "rewards/equation_reward_func": 0.37834823690354824, "rewards/format_reward_func": 0.9743303954601288, "step": 172 }, { "completion_length": 301.315860748291, "epoch": 0.21651889874008398, "grad_norm": 0.054334606936889365, "kl": 0.1953125, "learning_rate": 3.5146469585543386e-07, "loss": 0.0002, "reward": 1.4017857760190964, "reward_std": 0.20476062037050724, "rewards/equation_reward_func": 0.42075894959270954, "rewards/format_reward_func": 0.9810268096625805, "step": 174 }, { "completion_length": 304.6763563156128, "epoch": 0.21900762171410795, "grad_norm": 0.05052301150918478, "kl": 0.1910400390625, "learning_rate": 3.481615975003922e-07, "loss": 0.0002, "reward": 1.3560268431901932, "reward_std": 0.19252017885446548, "rewards/equation_reward_func": 0.38169644493609667, "rewards/format_reward_func": 0.9743303842842579, "step": 176 }, { "completion_length": 269.1328239440918, "epoch": 0.2214963446881319, "grad_norm": 0.05172823495274905, "kl": 0.2069091796875, "learning_rate": 3.448381136692089e-07, "loss": 0.0002, "reward": 1.4609375596046448, "reward_std": 0.16476588556542993, "rewards/equation_reward_func": 0.48883930779993534, "rewards/format_reward_func": 0.972098246216774, "step": 178 }, { "completion_length": 308.0904140472412, "epoch": 0.22398506766215587, "grad_norm": 0.0613229880527607, "kl": 0.19024658203125, "learning_rate": 3.4149493455847897e-07, "loss": 0.0002, "reward": 1.3571429178118706, "reward_std": 0.22263891389593482, "rewards/equation_reward_func": 0.393973232829012, "rewards/format_reward_func": 0.963169664144516, "step": 180 }, { "completion_length": 265.180814743042, "epoch": 0.2264737906361798, "grad_norm": 0.05586824711527208, "kl": 0.20489501953125, "learning_rate": 3.3813275445496766e-07, "loss": 0.0002, "reward": 1.520089354366064, "reward_std": 0.18474067840725183, "rewards/equation_reward_func": 0.5479911003494635, "rewards/format_reward_func": 0.972098246216774, "step": 182 }, { "completion_length": 303.05358600616455, "epoch": 0.22896251361020376, "grad_norm": 0.04781670355873072, "kl": 0.19744873046875, "learning_rate": 3.347522715914262e-07, "loss": 0.0002, "reward": 1.379464365541935, "reward_std": 0.21752706728875637, "rewards/equation_reward_func": 0.4174107350409031, "rewards/format_reward_func": 0.9620536006987095, "step": 184 }, { "completion_length": 266.5502338409424, "epoch": 0.23145123658422773, "grad_norm": 0.07219267256948605, "kl": 0.21258544921875, "learning_rate": 3.313541880015877e-07, "loss": 0.0002, "reward": 1.476562574505806, "reward_std": 0.1959572951309383, "rewards/equation_reward_func": 0.5033482350409031, "rewards/format_reward_func": 0.9732143245637417, "step": 186 }, { "completion_length": 285.2198791503906, "epoch": 0.23393995955825167, "grad_norm": 0.1000308934340073, "kl": 0.215087890625, "learning_rate": 3.279392093743747e-07, "loss": 0.0002, "reward": 1.3883929252624512, "reward_std": 0.21638350887224078, "rewards/equation_reward_func": 0.4252232303842902, "rewards/format_reward_func": 0.9631696715950966, "step": 188 }, { "completion_length": 275.9531412124634, "epoch": 0.2364286825322756, "grad_norm": 0.063450820634565, "kl": 0.2064208984375, "learning_rate": 3.245080449073459e-07, "loss": 0.0002, "reward": 1.4296875670552254, "reward_std": 0.18975705141201615, "rewards/equation_reward_func": 0.46093752048909664, "rewards/format_reward_func": 0.9687500409781933, "step": 190 }, { "completion_length": 281.63282680511475, "epoch": 0.23891740550629958, "grad_norm": 0.07927927952984541, "kl": 0.23663330078125, "learning_rate": 3.210614071594162e-07, "loss": 0.0002, "reward": 1.4776786491274834, "reward_std": 0.2009006328880787, "rewards/equation_reward_func": 0.5212053842842579, "rewards/format_reward_func": 0.9564732536673546, "step": 192 }, { "completion_length": 306.393985748291, "epoch": 0.24140612848032353, "grad_norm": 0.06500632509866108, "kl": 0.2120361328125, "learning_rate": 3.1760001190287695e-07, "loss": 0.0002, "reward": 1.3917411416769028, "reward_std": 0.23199980147182941, "rewards/equation_reward_func": 0.44084823317825794, "rewards/format_reward_func": 0.9508928880095482, "step": 194 }, { "completion_length": 313.911847114563, "epoch": 0.2438948514543475, "grad_norm": 0.07817216779042299, "kl": 0.22210693359375, "learning_rate": 3.141245779747502e-07, "loss": 0.0002, "reward": 1.3649554178118706, "reward_std": 0.21487555792555213, "rewards/equation_reward_func": 0.4207589514553547, "rewards/format_reward_func": 0.9441964663565159, "step": 196 }, { "completion_length": 304.01787090301514, "epoch": 0.24638357442837144, "grad_norm": 0.059524282155233844, "kl": 0.23980712890625, "learning_rate": 3.106358271275056e-07, "loss": 0.0002, "reward": 1.3984375596046448, "reward_std": 0.21076519135385752, "rewards/equation_reward_func": 0.44196430314332247, "rewards/format_reward_func": 0.9564732424914837, "step": 198 }, { "completion_length": 292.20983505249023, "epoch": 0.2488722974023954, "grad_norm": 0.05679318708182419, "kl": 0.21978759765625, "learning_rate": 3.0713448387917227e-07, "loss": 0.0002, "reward": 1.4107143580913544, "reward_std": 0.1913211834616959, "rewards/equation_reward_func": 0.4508928805589676, "rewards/format_reward_func": 0.9598214589059353, "step": 200 }, { "completion_length": 279.8493461608887, "epoch": 0.25136102037641933, "grad_norm": 0.05632547780170071, "kl": 0.2144775390625, "learning_rate": 3.0362127536287636e-07, "loss": 0.0002, "reward": 1.4609375670552254, "reward_std": 0.19915124960243702, "rewards/equation_reward_func": 0.49665181152522564, "rewards/format_reward_func": 0.964285746216774, "step": 202 }, { "completion_length": 305.8973331451416, "epoch": 0.2538497433504433, "grad_norm": 0.06737671533384239, "kl": 0.23370361328125, "learning_rate": 3.0009693117583523e-07, "loss": 0.0002, "reward": 1.3426339998841286, "reward_std": 0.23496287874877453, "rewards/equation_reward_func": 0.3895089477300644, "rewards/format_reward_func": 0.9531250335276127, "step": 204 }, { "completion_length": 306.7031373977661, "epoch": 0.2563384663244673, "grad_norm": 0.055756372882820274, "kl": 0.2335205078125, "learning_rate": 2.965621832278401e-07, "loss": 0.0002, "reward": 1.3973214998841286, "reward_std": 0.21409290935844183, "rewards/equation_reward_func": 0.435267879627645, "rewards/format_reward_func": 0.9620536081492901, "step": 206 }, { "completion_length": 301.6216621398926, "epoch": 0.2588271892984912, "grad_norm": 0.057990490708177245, "kl": 0.210205078125, "learning_rate": 2.9301776558925875e-07, "loss": 0.0002, "reward": 1.3593750596046448, "reward_std": 0.18663123482838273, "rewards/equation_reward_func": 0.39397323317825794, "rewards/format_reward_func": 0.9654018208384514, "step": 208 }, { "completion_length": 302.0312662124634, "epoch": 0.26131591227251516, "grad_norm": 0.05591978437529843, "kl": 0.19818115234375, "learning_rate": 2.894644143385885e-07, "loss": 0.0002, "reward": 1.3549107983708382, "reward_std": 0.181817049626261, "rewards/equation_reward_func": 0.38839287823066115, "rewards/format_reward_func": 0.9665178917348385, "step": 210 }, { "completion_length": 295.438627243042, "epoch": 0.26380463524653913, "grad_norm": 0.05880963721504364, "kl": 0.20281982421875, "learning_rate": 2.859028674095937e-07, "loss": 0.0002, "reward": 1.3861607611179352, "reward_std": 0.1711200401186943, "rewards/equation_reward_func": 0.415178588591516, "rewards/format_reward_func": 0.9709821790456772, "step": 212 }, { "completion_length": 280.2243413925171, "epoch": 0.26629335822056305, "grad_norm": 0.05541782868549402, "kl": 0.2681884765625, "learning_rate": 2.823338644380566e-07, "loss": 0.0003, "reward": 1.4341518506407738, "reward_std": 0.1762135154567659, "rewards/equation_reward_func": 0.45758930779993534, "rewards/format_reward_func": 0.9765625186264515, "step": 214 }, { "completion_length": 262.88952255249023, "epoch": 0.268782081194587, "grad_norm": 0.06876739528559901, "kl": 0.1982421875, "learning_rate": 2.7875814660817504e-07, "loss": 0.0002, "reward": 1.4810268580913544, "reward_std": 0.19189182948321104, "rewards/equation_reward_func": 0.5044643133878708, "rewards/format_reward_func": 0.976562537252903, "step": 216 }, { "completion_length": 268.20090198516846, "epoch": 0.271270804168611, "grad_norm": 0.06507593507837049, "kl": 0.1932373046875, "learning_rate": 2.751764564986396e-07, "loss": 0.0002, "reward": 1.4252232760190964, "reward_std": 0.1531409532763064, "rewards/equation_reward_func": 0.4464285960420966, "rewards/format_reward_func": 0.9787946864962578, "step": 218 }, { "completion_length": 248.7288064956665, "epoch": 0.27375952714263496, "grad_norm": 0.04947068793737493, "kl": 0.22991943359375, "learning_rate": 2.715895379284194e-07, "loss": 0.0002, "reward": 1.4598215073347092, "reward_std": 0.17193997697904706, "rewards/equation_reward_func": 0.4776785857975483, "rewards/format_reward_func": 0.9821428768336773, "step": 220 }, { "completion_length": 250.73104000091553, "epoch": 0.2762482501166589, "grad_norm": 0.05521345272079342, "kl": 0.218994140625, "learning_rate": 2.6799813580229174e-07, "loss": 0.0002, "reward": 1.464285783469677, "reward_std": 0.14387341123074293, "rewards/equation_reward_func": 0.47991073690354824, "rewards/format_reward_func": 0.9843750186264515, "step": 222 }, { "completion_length": 250.93973922729492, "epoch": 0.27873697309068285, "grad_norm": 0.0483398884521067, "kl": 0.1998291015625, "learning_rate": 2.6440299595614606e-07, "loss": 0.0002, "reward": 1.4575893431901932, "reward_std": 0.1358907464891672, "rewards/equation_reward_func": 0.4720982280559838, "rewards/format_reward_func": 0.9854911118745804, "step": 224 }, { "completion_length": 255.92411518096924, "epoch": 0.2812256960647068, "grad_norm": 0.05969943458746344, "kl": 0.20379638671875, "learning_rate": 2.6080486500209347e-07, "loss": 0.0002, "reward": 1.4352679252624512, "reward_std": 0.15126124769449234, "rewards/equation_reward_func": 0.4542410960420966, "rewards/format_reward_func": 0.9810268133878708, "step": 226 }, { "completion_length": 242.58260345458984, "epoch": 0.28371441903873074, "grad_norm": 0.07222088082221652, "kl": 0.2523193359375, "learning_rate": 2.572044901734166e-07, "loss": 0.0003, "reward": 1.4486607760190964, "reward_std": 0.1779097393155098, "rewards/equation_reward_func": 0.4642857387661934, "rewards/format_reward_func": 0.9843750260770321, "step": 228 }, { "completion_length": 241.76117038726807, "epoch": 0.2862031420127547, "grad_norm": 0.06362399984743052, "kl": 0.2340087890625, "learning_rate": 2.536026191693893e-07, "loss": 0.0002, "reward": 1.4531250670552254, "reward_std": 0.16605611657723784, "rewards/equation_reward_func": 0.4743303768336773, "rewards/format_reward_func": 0.9787946715950966, "step": 230 }, { "completion_length": 232.6540288925171, "epoch": 0.2886918649867787, "grad_norm": 0.08974695425968056, "kl": 0.20977783203125, "learning_rate": 2.5e-07, "loss": 0.0002, "reward": 1.4899554252624512, "reward_std": 0.14485393464565277, "rewards/equation_reward_func": 0.5100446660071611, "rewards/format_reward_func": 0.9799107611179352, "step": 232 }, { "completion_length": 238.89398288726807, "epoch": 0.2911805879608026, "grad_norm": 0.056325369308057746, "kl": 0.2130126953125, "learning_rate": 2.4639738083061073e-07, "loss": 0.0002, "reward": 1.492187574505806, "reward_std": 0.1786569063551724, "rewards/equation_reward_func": 0.5200893115252256, "rewards/format_reward_func": 0.9720982536673546, "step": 234 }, { "completion_length": 275.3326053619385, "epoch": 0.29366931093482657, "grad_norm": 0.055885259957339305, "kl": 0.19952392578125, "learning_rate": 2.4279550982658345e-07, "loss": 0.0002, "reward": 1.3493304178118706, "reward_std": 0.19797838106751442, "rewards/equation_reward_func": 0.3794643059372902, "rewards/format_reward_func": 0.9698661006987095, "step": 236 }, { "completion_length": 248.7734498977661, "epoch": 0.29615803390885054, "grad_norm": 0.053249216235646074, "kl": 0.24993896484375, "learning_rate": 2.3919513499790646e-07, "loss": 0.0002, "reward": 1.4843750521540642, "reward_std": 0.20823450107127428, "rewards/equation_reward_func": 0.5189732387661934, "rewards/format_reward_func": 0.9654018133878708, "step": 238 }, { "completion_length": 274.2745695114136, "epoch": 0.29864675688287445, "grad_norm": 0.05354241002117662, "kl": 0.23797607421875, "learning_rate": 2.3559700404385394e-07, "loss": 0.0002, "reward": 1.3593750596046448, "reward_std": 0.18844150891527534, "rewards/equation_reward_func": 0.39732144959270954, "rewards/format_reward_func": 0.9620536006987095, "step": 240 }, { "completion_length": 278.19643783569336, "epoch": 0.3011354798568984, "grad_norm": 0.07271477039344029, "kl": 0.29461669921875, "learning_rate": 2.3200186419770823e-07, "loss": 0.0003, "reward": 1.3883929252624512, "reward_std": 0.2401469461619854, "rewards/equation_reward_func": 0.43526787869632244, "rewards/format_reward_func": 0.9531250335276127, "step": 242 }, { "completion_length": 240.8593873977661, "epoch": 0.3036242028309224, "grad_norm": 0.04915715622456297, "kl": 0.24176025390625, "learning_rate": 2.284104620715807e-07, "loss": 0.0002, "reward": 1.4765625670552254, "reward_std": 0.14837059983983636, "rewards/equation_reward_func": 0.5033482387661934, "rewards/format_reward_func": 0.9732143133878708, "step": 244 }, { "completion_length": 270.9040307998657, "epoch": 0.3061129258049463, "grad_norm": 0.07553072814564832, "kl": 0.22186279296875, "learning_rate": 2.2482354350136043e-07, "loss": 0.0002, "reward": 1.3917411267757416, "reward_std": 0.24061778374016285, "rewards/equation_reward_func": 0.4386160895228386, "rewards/format_reward_func": 0.9531250335276127, "step": 246 }, { "completion_length": 259.24108505249023, "epoch": 0.3086016487789703, "grad_norm": 0.06131285978491122, "kl": 0.22003173828125, "learning_rate": 2.2124185339182496e-07, "loss": 0.0002, "reward": 1.4140625596046448, "reward_std": 0.19331888854503632, "rewards/equation_reward_func": 0.44754466973245144, "rewards/format_reward_func": 0.9665178842842579, "step": 248 }, { "completion_length": 266.12836265563965, "epoch": 0.31109037175299425, "grad_norm": 0.057251447829503296, "kl": 0.211181640625, "learning_rate": 2.1766613556194344e-07, "loss": 0.0002, "reward": 1.431919701397419, "reward_std": 0.19034441327676177, "rewards/equation_reward_func": 0.4698660969734192, "rewards/format_reward_func": 0.9620536044239998, "step": 250 }, { "completion_length": 263.9140787124634, "epoch": 0.3135790947270182, "grad_norm": 0.06634536235536961, "kl": 0.22381591796875, "learning_rate": 2.1409713259040628e-07, "loss": 0.0002, "reward": 1.4631696939468384, "reward_std": 0.20181358978152275, "rewards/equation_reward_func": 0.49776787031441927, "rewards/format_reward_func": 0.9654018357396126, "step": 252 }, { "completion_length": 270.25447845458984, "epoch": 0.31606781770104214, "grad_norm": 0.08857117161748256, "kl": 0.21771240234375, "learning_rate": 2.105355856614115e-07, "loss": 0.0002, "reward": 1.408482201397419, "reward_std": 0.20398310013115406, "rewards/equation_reward_func": 0.4520089514553547, "rewards/format_reward_func": 0.9564732536673546, "step": 254 }, { "completion_length": 248.61050510406494, "epoch": 0.3185565406750661, "grad_norm": 0.05771718831012432, "kl": 0.21124267578125, "learning_rate": 2.069822344107413e-07, "loss": 0.0002, "reward": 1.4832590073347092, "reward_std": 0.2012131493538618, "rewards/equation_reward_func": 0.5267857331782579, "rewards/format_reward_func": 0.9564732499420643, "step": 256 }, { "completion_length": 265.1105041503906, "epoch": 0.3210452636490901, "grad_norm": 0.07137640163322823, "kl": 0.22540283203125, "learning_rate": 2.034378167721599e-07, "loss": 0.0002, "reward": 1.4263393580913544, "reward_std": 0.22233180236071348, "rewards/equation_reward_func": 0.47656252421438694, "rewards/format_reward_func": 0.9497768171131611, "step": 258 }, { "completion_length": 274.2232265472412, "epoch": 0.323533986623114, "grad_norm": 0.048682994811717506, "kl": 0.2178955078125, "learning_rate": 1.9990306882416485e-07, "loss": 0.0002, "reward": 1.3839286342263222, "reward_std": 0.15760588087141514, "rewards/equation_reward_func": 0.4285714477300644, "rewards/format_reward_func": 0.9553571715950966, "step": 260 }, { "completion_length": 264.10491943359375, "epoch": 0.32602270959713797, "grad_norm": 0.059864933831037394, "kl": 0.20758056640625, "learning_rate": 1.9637872463712362e-07, "loss": 0.0002, "reward": 1.4531250670552254, "reward_std": 0.2065706574358046, "rewards/equation_reward_func": 0.48883931152522564, "rewards/format_reward_func": 0.9642857536673546, "step": 262 }, { "completion_length": 242.52344799041748, "epoch": 0.32851143257116194, "grad_norm": 0.05470521183712939, "kl": 0.20501708984375, "learning_rate": 1.9286551612082773e-07, "loss": 0.0002, "reward": 1.5312500819563866, "reward_std": 0.14761846419423819, "rewards/equation_reward_func": 0.5636160988360643, "rewards/format_reward_func": 0.9676339514553547, "step": 264 }, { "completion_length": 257.9397439956665, "epoch": 0.33100015554518586, "grad_norm": 0.05856528610006056, "kl": 0.19775390625, "learning_rate": 1.8936417287249446e-07, "loss": 0.0002, "reward": 1.4542411416769028, "reward_std": 0.19231480779126287, "rewards/equation_reward_func": 0.4921875223517418, "rewards/format_reward_func": 0.9620536081492901, "step": 266 }, { "completion_length": 260.13952255249023, "epoch": 0.33348887851920983, "grad_norm": 0.06519402114882171, "kl": 0.20098876953125, "learning_rate": 1.8587542202524985e-07, "loss": 0.0002, "reward": 1.433035783469677, "reward_std": 0.1624811594374478, "rewards/equation_reward_func": 0.46093752421438694, "rewards/format_reward_func": 0.9720982499420643, "step": 268 }, { "completion_length": 241.5424222946167, "epoch": 0.3359776014932338, "grad_norm": 0.06174357121788231, "kl": 0.2017822265625, "learning_rate": 1.82399988097123e-07, "loss": 0.0002, "reward": 1.5156250670552254, "reward_std": 0.1582701876759529, "rewards/equation_reward_func": 0.5513393171131611, "rewards/format_reward_func": 0.9642857387661934, "step": 270 }, { "completion_length": 256.6517972946167, "epoch": 0.3384663244672577, "grad_norm": 0.038421184229451566, "kl": 0.20526123046875, "learning_rate": 1.7893859284058378e-07, "loss": 0.0002, "reward": 1.4754464998841286, "reward_std": 0.1418059840798378, "rewards/equation_reward_func": 0.5033482369035482, "rewards/format_reward_func": 0.9720982387661934, "step": 272 }, { "completion_length": 261.31139755249023, "epoch": 0.3409550474412817, "grad_norm": 0.06938416420126632, "kl": 0.19525146484375, "learning_rate": 1.7549195509265407e-07, "loss": 0.0002, "reward": 1.437500074505806, "reward_std": 0.15865246811881661, "rewards/equation_reward_func": 0.45758930779993534, "rewards/format_reward_func": 0.9799107275903225, "step": 274 }, { "completion_length": 237.3839406967163, "epoch": 0.34344377041530566, "grad_norm": 0.05279986019801015, "kl": 0.19287109375, "learning_rate": 1.7206079062562536e-07, "loss": 0.0002, "reward": 1.5078125670552254, "reward_std": 0.14753996301442385, "rewards/equation_reward_func": 0.5323660974390805, "rewards/format_reward_func": 0.9754464589059353, "step": 276 }, { "completion_length": 243.16295623779297, "epoch": 0.3459324933893296, "grad_norm": 0.07742881946994733, "kl": 0.19915771484375, "learning_rate": 1.6864581199841226e-07, "loss": 0.0002, "reward": 1.512276865541935, "reward_std": 0.18381929537281394, "rewards/equation_reward_func": 0.5345982387661934, "rewards/format_reward_func": 0.9776785969734192, "step": 278 }, { "completion_length": 267.8125104904175, "epoch": 0.34842121636335355, "grad_norm": 0.06365543898835445, "kl": 0.1842041015625, "learning_rate": 1.6524772840857388e-07, "loss": 0.0002, "reward": 1.381696492433548, "reward_std": 0.1500142039731145, "rewards/equation_reward_func": 0.40513394586741924, "rewards/format_reward_func": 0.9765625409781933, "step": 280 }, { "completion_length": 260.2701005935669, "epoch": 0.3509099393373775, "grad_norm": 0.06454931261369504, "kl": 0.19287109375, "learning_rate": 1.6186724554503237e-07, "loss": 0.0002, "reward": 1.4531250596046448, "reward_std": 0.16177314054220915, "rewards/equation_reward_func": 0.47433037776499987, "rewards/format_reward_func": 0.9787946753203869, "step": 282 }, { "completion_length": 264.14621353149414, "epoch": 0.35339866231140143, "grad_norm": 0.06827460094739726, "kl": 0.19805908203125, "learning_rate": 1.5850506544152103e-07, "loss": 0.0002, "reward": 1.444196492433548, "reward_std": 0.169928221963346, "rewards/equation_reward_func": 0.47098216973245144, "rewards/format_reward_func": 0.9732143208384514, "step": 284 }, { "completion_length": 267.815860748291, "epoch": 0.3558873852854254, "grad_norm": 0.07748655693200346, "kl": 0.196533203125, "learning_rate": 1.5516188633079107e-07, "loss": 0.0002, "reward": 1.4006696864962578, "reward_std": 0.18472299817949533, "rewards/equation_reward_func": 0.42968752048909664, "rewards/format_reward_func": 0.9709821790456772, "step": 286 }, { "completion_length": 252.82367324829102, "epoch": 0.3583761082594494, "grad_norm": 0.06854165504600304, "kl": 0.20758056640625, "learning_rate": 1.5183840249960784e-07, "loss": 0.0002, "reward": 1.5178572162985802, "reward_std": 0.19481780100613832, "rewards/equation_reward_func": 0.5491071660071611, "rewards/format_reward_func": 0.9687500298023224, "step": 288 }, { "completion_length": 247.53460693359375, "epoch": 0.36086483123347335, "grad_norm": 0.057981171534578976, "kl": 0.20849609375, "learning_rate": 1.4853530414456612e-07, "loss": 0.0002, "reward": 1.4799107685685158, "reward_std": 0.1738822301849723, "rewards/equation_reward_func": 0.506696448661387, "rewards/format_reward_func": 0.9732143208384514, "step": 290 }, { "completion_length": 228.368314743042, "epoch": 0.36335355420749726, "grad_norm": 0.049516342449076566, "kl": 0.2069091796875, "learning_rate": 1.4525327722875568e-07, "loss": 0.0002, "reward": 1.4899554327130318, "reward_std": 0.13066139770671725, "rewards/equation_reward_func": 0.5133928805589676, "rewards/format_reward_func": 0.976562537252903, "step": 292 }, { "completion_length": 251.85492134094238, "epoch": 0.36584227718152124, "grad_norm": 0.061436014992367165, "kl": 0.1915283203125, "learning_rate": 1.4199300333930515e-07, "loss": 0.0002, "reward": 1.4508929178118706, "reward_std": 0.13965235324576497, "rewards/equation_reward_func": 0.4665178805589676, "rewards/format_reward_func": 0.9843750447034836, "step": 294 }, { "completion_length": 237.25224208831787, "epoch": 0.3683310001555452, "grad_norm": 0.061911720113752254, "kl": 0.21185302734375, "learning_rate": 1.3875515954583523e-07, "loss": 0.0002, "reward": 1.5156250670552254, "reward_std": 0.17848694045096636, "rewards/equation_reward_func": 0.5401785969734192, "rewards/format_reward_func": 0.9754464663565159, "step": 296 }, { "completion_length": 260.84599208831787, "epoch": 0.3708197231295691, "grad_norm": 0.06594279646081479, "kl": 0.19842529296875, "learning_rate": 1.3554041825985e-07, "loss": 0.0002, "reward": 1.4151786342263222, "reward_std": 0.20021545700728893, "rewards/equation_reward_func": 0.4464285932481289, "rewards/format_reward_func": 0.9687500186264515, "step": 298 }, { "completion_length": 260.6317090988159, "epoch": 0.3733084461035931, "grad_norm": 0.06036144065480427, "kl": 0.19525146484375, "learning_rate": 1.323494470950949e-07, "loss": 0.0002, "reward": 1.4419643506407738, "reward_std": 0.1797226625494659, "rewards/equation_reward_func": 0.47098216600716114, "rewards/format_reward_func": 0.9709821790456772, "step": 300 }, { "completion_length": 241.9788055419922, "epoch": 0.37579716907761707, "grad_norm": 0.05826454315321918, "kl": 0.19757080078125, "learning_rate": 1.2918290872891236e-07, "loss": 0.0002, "reward": 1.4810268506407738, "reward_std": 0.13545973878353834, "rewards/equation_reward_func": 0.5011160913854837, "rewards/format_reward_func": 0.9799107499420643, "step": 302 }, { "completion_length": 257.9218854904175, "epoch": 0.378285892051641, "grad_norm": 0.08618938040431284, "kl": 0.20965576171875, "learning_rate": 1.260414607646213e-07, "loss": 0.0002, "reward": 1.4888393506407738, "reward_std": 0.17534363875165582, "rewards/equation_reward_func": 0.5122768077999353, "rewards/format_reward_func": 0.9765625223517418, "step": 304 }, { "completion_length": 252.98550128936768, "epoch": 0.38077461502566495, "grad_norm": 0.06784584776225837, "kl": 0.2144775390625, "learning_rate": 1.2292575559495143e-07, "loss": 0.0002, "reward": 1.4765625670552254, "reward_std": 0.18450129078701138, "rewards/equation_reward_func": 0.5066964533179998, "rewards/format_reward_func": 0.9698661006987095, "step": 306 }, { "completion_length": 271.24443435668945, "epoch": 0.3832633379996889, "grad_norm": 0.036854119619471624, "kl": 0.19024658203125, "learning_rate": 1.1983644026655835e-07, "loss": 0.0002, "reward": 1.4185268580913544, "reward_std": 0.123422771692276, "rewards/equation_reward_func": 0.439732164144516, "rewards/format_reward_func": 0.9787946715950966, "step": 308 }, { "completion_length": 263.0301446914673, "epoch": 0.38575206097371284, "grad_norm": 0.06348479835149756, "kl": 0.1917724609375, "learning_rate": 1.1677415634565066e-07, "loss": 0.0002, "reward": 1.4821429252624512, "reward_std": 0.17120425822213292, "rewards/equation_reward_func": 0.5011160876601934, "rewards/format_reward_func": 0.9810268245637417, "step": 310 }, { "completion_length": 271.0122890472412, "epoch": 0.3882407839477368, "grad_norm": 0.06288609866959954, "kl": 0.25067138671875, "learning_rate": 1.1373953978475353e-07, "loss": 0.0003, "reward": 1.4196429252624512, "reward_std": 0.1885003875941038, "rewards/equation_reward_func": 0.4575893102446571, "rewards/format_reward_func": 0.9620535932481289, "step": 312 }, { "completion_length": 270.0948781967163, "epoch": 0.3907295069217608, "grad_norm": 0.05814773664760414, "kl": 0.18487548828125, "learning_rate": 1.1073322079063913e-07, "loss": 0.0002, "reward": 1.4341518431901932, "reward_std": 0.17814994137734175, "rewards/equation_reward_func": 0.45424109511077404, "rewards/format_reward_func": 0.9799107536673546, "step": 314 }, { "completion_length": 248.2522430419922, "epoch": 0.3932182298957847, "grad_norm": 0.0693174506427365, "kl": 0.18951416015625, "learning_rate": 1.0775582369344946e-07, "loss": 0.0002, "reward": 1.4531250596046448, "reward_std": 0.18309945985674858, "rewards/equation_reward_func": 0.48214287776499987, "rewards/format_reward_func": 0.9709821790456772, "step": 316 }, { "completion_length": 241.57367038726807, "epoch": 0.39570695286980867, "grad_norm": 0.07528066976062558, "kl": 0.2047119140625, "learning_rate": 1.0480796681704077e-07, "loss": 0.0002, "reward": 1.4877232685685158, "reward_std": 0.15524994302541018, "rewards/equation_reward_func": 0.5122768115252256, "rewards/format_reward_func": 0.9754464663565159, "step": 318 }, { "completion_length": 239.8069314956665, "epoch": 0.39819567584383264, "grad_norm": 0.05012947107458731, "kl": 0.20562744140625, "learning_rate": 1.018902623505741e-07, "loss": 0.0002, "reward": 1.4888393580913544, "reward_std": 0.12663634540513158, "rewards/equation_reward_func": 0.5100446697324514, "rewards/format_reward_func": 0.9787946827709675, "step": 320 }, { "completion_length": 239.58148384094238, "epoch": 0.4006843988178566, "grad_norm": 0.0628656210330094, "kl": 0.213134765625, "learning_rate": 9.900331622138063e-08, "loss": 0.0002, "reward": 1.5000000596046448, "reward_std": 0.1728154099546373, "rewards/equation_reward_func": 0.525669670663774, "rewards/format_reward_func": 0.9743303954601288, "step": 322 }, { "completion_length": 245.7187614440918, "epoch": 0.40317312179188053, "grad_norm": 0.06489486125976161, "kl": 0.20806884765625, "learning_rate": 9.614772796912681e-08, "loss": 0.0002, "reward": 1.428571492433548, "reward_std": 0.1652527879923582, "rewards/equation_reward_func": 0.4531250214204192, "rewards/format_reward_func": 0.9754464589059353, "step": 324 }, { "completion_length": 224.60268878936768, "epoch": 0.4056618447659045, "grad_norm": 0.0573415983961628, "kl": 0.2230224609375, "learning_rate": 9.332409062130686e-08, "loss": 0.0002, "reward": 1.564732201397419, "reward_std": 0.14763088943436742, "rewards/equation_reward_func": 0.5792410932481289, "rewards/format_reward_func": 0.9854911044239998, "step": 326 }, { "completion_length": 240.5245656967163, "epoch": 0.40815056773992847, "grad_norm": 0.06956094753946698, "kl": 0.1995849609375, "learning_rate": 9.053299057008699e-08, "loss": 0.0002, "reward": 1.4330357909202576, "reward_std": 0.15731613663956523, "rewards/equation_reward_func": 0.4520089477300644, "rewards/format_reward_func": 0.9810268096625805, "step": 328 }, { "completion_length": 270.4955463409424, "epoch": 0.4106392907139524, "grad_norm": 0.07509531278447272, "kl": 0.19549560546875, "learning_rate": 8.777500745052743e-08, "loss": 0.0002, "reward": 1.3649554327130318, "reward_std": 0.17564317397773266, "rewards/equation_reward_func": 0.3984375214204192, "rewards/format_reward_func": 0.9665178805589676, "step": 330 }, { "completion_length": 266.61942863464355, "epoch": 0.41312801368797636, "grad_norm": 0.058100347807015534, "kl": 0.2001953125, "learning_rate": 8.505071402020892e-08, "loss": 0.0002, "reward": 1.350446492433548, "reward_std": 0.17841344932094216, "rewards/equation_reward_func": 0.38504466228187084, "rewards/format_reward_func": 0.965401828289032, "step": 332 }, { "completion_length": 248.6127347946167, "epoch": 0.41561673666200033, "grad_norm": 0.05834481650049284, "kl": 0.198974609375, "learning_rate": 8.236067604028562e-08, "loss": 0.0002, "reward": 1.4609375670552254, "reward_std": 0.17589116655290127, "rewards/equation_reward_func": 0.5022321678698063, "rewards/format_reward_func": 0.9587053917348385, "step": 334 }, { "completion_length": 247.92746686935425, "epoch": 0.41810545963602425, "grad_norm": 0.0591341402478618, "kl": 0.21661376953125, "learning_rate": 7.970545215799327e-08, "loss": 0.0002, "reward": 1.4944197162985802, "reward_std": 0.17136807506904006, "rewards/equation_reward_func": 0.5212053768336773, "rewards/format_reward_func": 0.9732143208384514, "step": 336 }, { "completion_length": 275.84488105773926, "epoch": 0.4205941826100482, "grad_norm": 0.04884352625738446, "kl": 0.20855712890625, "learning_rate": 7.708559379063204e-08, "loss": 0.0002, "reward": 1.3694196976721287, "reward_std": 0.17837184108793736, "rewards/equation_reward_func": 0.4107143059372902, "rewards/format_reward_func": 0.9587053880095482, "step": 338 }, { "completion_length": 243.42411851882935, "epoch": 0.4230829055840722, "grad_norm": 0.06899085908963953, "kl": 48.954345703125, "learning_rate": 7.45016450110534e-08, "loss": 0.0489, "reward": 1.4654018580913544, "reward_std": 0.1875977530144155, "rewards/equation_reward_func": 0.4933036006987095, "rewards/format_reward_func": 0.9720982499420643, "step": 340 }, { "completion_length": 265.886173248291, "epoch": 0.4255716285580961, "grad_norm": 0.05980414020403841, "kl": 0.19793701171875, "learning_rate": 7.195414243467029e-08, "loss": 0.0002, "reward": 1.3984375596046448, "reward_std": 0.175481291487813, "rewards/equation_reward_func": 0.43750001955777407, "rewards/format_reward_func": 0.9609375335276127, "step": 342 }, { "completion_length": 255.4665298461914, "epoch": 0.4280603515321201, "grad_norm": 0.05974337860010266, "kl": 0.197998046875, "learning_rate": 6.944361510801763e-08, "loss": 0.0002, "reward": 1.4419643431901932, "reward_std": 0.1873517008498311, "rewards/equation_reward_func": 0.4787946669384837, "rewards/format_reward_func": 0.9631696790456772, "step": 344 }, { "completion_length": 237.57590103149414, "epoch": 0.43054907450614405, "grad_norm": 0.04126838575736974, "kl": 0.19622802734375, "learning_rate": 6.697058439888283e-08, "loss": 0.0002, "reward": 1.4720982760190964, "reward_std": 0.14282790152356029, "rewards/equation_reward_func": 0.4866071632131934, "rewards/format_reward_func": 0.9854910857975483, "step": 346 }, { "completion_length": 245.25224494934082, "epoch": 0.43303779748016796, "grad_norm": 0.08845105411407385, "kl": 0.1953125, "learning_rate": 6.453556388803288e-08, "loss": 0.0002, "reward": 1.4330357685685158, "reward_std": 0.17873004684224725, "rewards/equation_reward_func": 0.45535716973245144, "rewards/format_reward_func": 0.9776786044239998, "step": 348 }, { "completion_length": 259.90849781036377, "epoch": 0.43552652045419193, "grad_norm": 0.06619651517208977, "kl": 0.20709228515625, "learning_rate": 6.213905926255697e-08, "loss": 0.0002, "reward": 1.38839291036129, "reward_std": 0.17326576448976994, "rewards/equation_reward_func": 0.4196428721770644, "rewards/format_reward_func": 0.9687500298023224, "step": 350 }, { "completion_length": 246.68862628936768, "epoch": 0.4380152434282159, "grad_norm": 0.07466580623927803, "kl": 0.21270751953125, "learning_rate": 5.978156821084987e-08, "loss": 0.0002, "reward": 1.4709822162985802, "reward_std": 0.17257908964529634, "rewards/equation_reward_func": 0.49776787869632244, "rewards/format_reward_func": 0.973214328289032, "step": 352 }, { "completion_length": 238.16072463989258, "epoch": 0.4405039664022399, "grad_norm": 0.05555433456578972, "kl": 0.20025634765625, "learning_rate": 5.7463580319254853e-08, "loss": 0.0002, "reward": 1.464285783469677, "reward_std": 0.18330077826976776, "rewards/equation_reward_func": 0.4821428805589676, "rewards/format_reward_func": 0.9821428917348385, "step": 354 }, { "completion_length": 238.5982265472412, "epoch": 0.4429926893762638, "grad_norm": 0.10010877276744827, "kl": 0.20880126953125, "learning_rate": 5.518557697039081e-08, "loss": 0.0002, "reward": 1.5022322162985802, "reward_std": 0.15463243331760168, "rewards/equation_reward_func": 0.5234375204890966, "rewards/format_reward_func": 0.9787946604192257, "step": 356 }, { "completion_length": 235.23773384094238, "epoch": 0.44548141235028776, "grad_norm": 0.05922980250758421, "kl": 0.208984375, "learning_rate": 5.294803124318145e-08, "loss": 0.0002, "reward": 1.512276865541935, "reward_std": 0.15547437546774745, "rewards/equation_reward_func": 0.5323661006987095, "rewards/format_reward_func": 0.9799107387661934, "step": 358 }, { "completion_length": 247.35046005249023, "epoch": 0.44797013532431174, "grad_norm": 0.040689373027519675, "kl": 0.19696044921875, "learning_rate": 5.07514078146106e-08, "loss": 0.0002, "reward": 1.4263393357396126, "reward_std": 0.1481092283502221, "rewards/equation_reward_func": 0.454241088591516, "rewards/format_reward_func": 0.9720982536673546, "step": 360 }, { "completion_length": 242.27567958831787, "epoch": 0.45045885829833565, "grad_norm": 0.07393799743280928, "kl": 0.20159912109375, "learning_rate": 4.859616286322094e-08, "loss": 0.0002, "reward": 1.4620536342263222, "reward_std": 0.1555815995670855, "rewards/equation_reward_func": 0.47656251676380634, "rewards/format_reward_func": 0.9854910932481289, "step": 362 }, { "completion_length": 248.84710788726807, "epoch": 0.4529475812723596, "grad_norm": 0.04906536376122032, "kl": 0.20123291015625, "learning_rate": 4.648274397437829e-08, "loss": 0.0002, "reward": 1.4654018506407738, "reward_std": 0.1682790988124907, "rewards/equation_reward_func": 0.49218752793967724, "rewards/format_reward_func": 0.9732143133878708, "step": 364 }, { "completion_length": 245.94532299041748, "epoch": 0.4554363042463836, "grad_norm": 0.05802144393054711, "kl": 0.19244384765625, "learning_rate": 4.4411590047320617e-08, "loss": 0.0002, "reward": 1.4464286342263222, "reward_std": 0.15317056560888886, "rewards/equation_reward_func": 0.4676339514553547, "rewards/format_reward_func": 0.9787946604192257, "step": 366 }, { "completion_length": 246.37612628936768, "epoch": 0.4579250272204075, "grad_norm": 0.060239468744741904, "kl": 0.20068359375, "learning_rate": 4.2383131204010494e-08, "loss": 0.0002, "reward": 1.422991156578064, "reward_std": 0.15117083815857768, "rewards/equation_reward_func": 0.45424109790474176, "rewards/format_reward_func": 0.9687500335276127, "step": 368 }, { "completion_length": 242.1529130935669, "epoch": 0.4604137501944315, "grad_norm": 0.04267260387428845, "kl": 0.19769287109375, "learning_rate": 4.039778869981064e-08, "loss": 0.0002, "reward": 1.4732143506407738, "reward_std": 0.12463709712028503, "rewards/equation_reward_func": 0.49107145331799984, "rewards/format_reward_func": 0.9821428842842579, "step": 370 }, { "completion_length": 260.5067090988159, "epoch": 0.46290247316845545, "grad_norm": 0.0544495720289963, "kl": 0.199462890625, "learning_rate": 3.845597483600049e-08, "loss": 0.0002, "reward": 1.4017857909202576, "reward_std": 0.1715238355100155, "rewards/equation_reward_func": 0.4386160932481289, "rewards/format_reward_func": 0.9631696753203869, "step": 372 }, { "completion_length": 253.75782299041748, "epoch": 0.46539119614247937, "grad_norm": 0.05723699687752985, "kl": 0.2158203125, "learning_rate": 3.655809287415284e-08, "loss": 0.0002, "reward": 1.3939732871949673, "reward_std": 0.17196187004446983, "rewards/equation_reward_func": 0.4207589514553547, "rewards/format_reward_func": 0.9732143245637417, "step": 374 }, { "completion_length": 259.69532108306885, "epoch": 0.46787991911650334, "grad_norm": 0.05101025382090558, "kl": 0.1917724609375, "learning_rate": 3.4704536952387285e-08, "loss": 0.0002, "reward": 1.3772322088479996, "reward_std": 0.17750998865813017, "rewards/equation_reward_func": 0.40513394703157246, "rewards/format_reward_func": 0.9720982387661934, "step": 376 }, { "completion_length": 236.41072368621826, "epoch": 0.4703686420905273, "grad_norm": 0.07195347219746893, "kl": 0.1944580078125, "learning_rate": 3.2895692003518575e-08, "loss": 0.0002, "reward": 1.4944197162985802, "reward_std": 0.17748092440888286, "rewards/equation_reward_func": 0.5200893096625805, "rewards/format_reward_func": 0.9743303954601288, "step": 378 }, { "completion_length": 259.48438835144043, "epoch": 0.4728573650645512, "grad_norm": 0.07172484472660048, "kl": 0.19696044921875, "learning_rate": 3.113193367511635e-08, "loss": 0.0002, "reward": 1.3895090073347092, "reward_std": 0.1647741007618606, "rewards/equation_reward_func": 0.41852680779993534, "rewards/format_reward_func": 0.9709821753203869, "step": 380 }, { "completion_length": 245.80804824829102, "epoch": 0.4753460880385752, "grad_norm": 0.04427418304284531, "kl": 0.20074462890625, "learning_rate": 2.9413628251493934e-08, "loss": 0.0002, "reward": 1.4587054327130318, "reward_std": 0.15418589767068624, "rewards/equation_reward_func": 0.4888393096625805, "rewards/format_reward_func": 0.9698661081492901, "step": 382 }, { "completion_length": 241.82032108306885, "epoch": 0.47783481101259917, "grad_norm": 0.05413458692040878, "kl": 0.19281005859375, "learning_rate": 2.774113257764066e-08, "loss": 0.0002, "reward": 1.4654018580913544, "reward_std": 0.1285329912789166, "rewards/equation_reward_func": 0.48995537497103214, "rewards/format_reward_func": 0.975446455180645, "step": 384 }, { "completion_length": 235.2399663925171, "epoch": 0.48032353398662314, "grad_norm": 0.07013851232759652, "kl": 0.20245361328125, "learning_rate": 2.611479398511518e-08, "loss": 0.0002, "reward": 1.439732201397419, "reward_std": 0.18071282655000687, "rewards/equation_reward_func": 0.4687500251457095, "rewards/format_reward_func": 0.9709821790456772, "step": 386 }, { "completion_length": 253.3303689956665, "epoch": 0.48281225696064706, "grad_norm": 0.05817044023141003, "kl": 0.19342041015625, "learning_rate": 2.4534950219914057e-08, "loss": 0.0002, "reward": 1.3939732909202576, "reward_std": 0.1655187914147973, "rewards/equation_reward_func": 0.4185268059372902, "rewards/format_reward_func": 0.9754464738070965, "step": 388 }, { "completion_length": 220.24219703674316, "epoch": 0.48530097993467103, "grad_norm": 0.06057036471445671, "kl": 0.2003173828125, "learning_rate": 2.300192937233128e-08, "loss": 0.0002, "reward": 1.4854911267757416, "reward_std": 0.1361873117275536, "rewards/equation_reward_func": 0.49888395704329014, "rewards/format_reward_func": 0.9866071678698063, "step": 390 }, { "completion_length": 238.94085693359375, "epoch": 0.487789702908695, "grad_norm": 0.059472704194779645, "kl": 0.229736328125, "learning_rate": 2.1516049808822935e-08, "loss": 0.0002, "reward": 1.4229911342263222, "reward_std": 0.19178591016680002, "rewards/equation_reward_func": 0.4486607378348708, "rewards/format_reward_func": 0.9743303880095482, "step": 392 }, { "completion_length": 214.02791023254395, "epoch": 0.4902784258827189, "grad_norm": 0.04800855566257019, "kl": 0.21405029296875, "learning_rate": 2.007762010589098e-08, "loss": 0.0002, "reward": 1.506696492433548, "reward_std": 0.09478258900344372, "rewards/equation_reward_func": 0.5245536025613546, "rewards/format_reward_func": 0.9821428768336773, "step": 394 }, { "completion_length": 247.6830472946167, "epoch": 0.4927671488567429, "grad_norm": 0.07541466310359685, "kl": 0.2535400390625, "learning_rate": 1.8686938986000627e-08, "loss": 0.0003, "reward": 1.3705357685685158, "reward_std": 0.1484149764291942, "rewards/equation_reward_func": 0.39174109196756035, "rewards/format_reward_func": 0.9787946715950966, "step": 396 }, { "completion_length": 239.485502243042, "epoch": 0.49525587183076686, "grad_norm": 0.05438411307581503, "kl": 0.208740234375, "learning_rate": 1.734429525554365e-08, "loss": 0.0002, "reward": 1.4174107760190964, "reward_std": 0.15924348449334502, "rewards/equation_reward_func": 0.4397321632131934, "rewards/format_reward_func": 0.9776785932481289, "step": 398 }, { "completion_length": 240.2142972946167, "epoch": 0.4977445948047908, "grad_norm": 0.05085143905737969, "kl": 0.2021484375, "learning_rate": 1.604996774486145e-08, "loss": 0.0002, "reward": 1.389508992433548, "reward_std": 0.1306094191968441, "rewards/equation_reward_func": 0.407366088591516, "rewards/format_reward_func": 0.9821428805589676, "step": 400 }, { "completion_length": 243.95648193359375, "epoch": 0.5002333177788147, "grad_norm": 0.05035017375480962, "kl": 0.2027587890625, "learning_rate": 1.4804225250339281e-08, "loss": 0.0002, "reward": 1.4475447162985802, "reward_std": 0.16129027446731925, "rewards/equation_reward_func": 0.478794664144516, "rewards/format_reward_func": 0.9687500335276127, "step": 402 }, { "completion_length": 237.8091640472412, "epoch": 0.5027220407528387, "grad_norm": 0.054861170642348474, "kl": 0.1966552734375, "learning_rate": 1.360732647858498e-08, "loss": 0.0002, "reward": 1.4263393431901932, "reward_std": 0.1515321801416576, "rewards/equation_reward_func": 0.44754466880112886, "rewards/format_reward_func": 0.9787946715950966, "step": 404 }, { "completion_length": 219.90625953674316, "epoch": 0.5052107637268627, "grad_norm": 0.07477628715165045, "kl": 0.21026611328125, "learning_rate": 1.2459519992702311e-08, "loss": 0.0002, "reward": 1.5011161491274834, "reward_std": 0.1352310930378735, "rewards/equation_reward_func": 0.5145089495927095, "rewards/format_reward_func": 0.9866071715950966, "step": 406 }, { "completion_length": 255.5993413925171, "epoch": 0.5076994867008866, "grad_norm": 0.05849730835576962, "kl": 0.1883544921875, "learning_rate": 1.1361044160671629e-08, "loss": 0.0002, "reward": 1.3504464849829674, "reward_std": 0.14734927052631974, "rewards/equation_reward_func": 0.37834823317825794, "rewards/format_reward_func": 0.972098246216774, "step": 408 }, { "completion_length": 223.27344799041748, "epoch": 0.5101882096749105, "grad_norm": 0.06665066337341637, "kl": 0.20648193359375, "learning_rate": 1.0312127105846947e-08, "loss": 0.0002, "reward": 1.4966518431901932, "reward_std": 0.15878778649494052, "rewards/equation_reward_func": 0.5122768119908869, "rewards/format_reward_func": 0.9843750409781933, "step": 410 }, { "completion_length": 234.1026906967163, "epoch": 0.5126769326489345, "grad_norm": 0.05933115040400525, "kl": 0.204345703125, "learning_rate": 9.312986659581301e-09, "loss": 0.0002, "reward": 1.4542411416769028, "reward_std": 0.17036408884450793, "rewards/equation_reward_func": 0.47321429941803217, "rewards/format_reward_func": 0.9810268096625805, "step": 412 }, { "completion_length": 231.8727788925171, "epoch": 0.5151656556229585, "grad_norm": 0.06260514018048406, "kl": 0.2193603515625, "learning_rate": 8.363830315988945e-09, "loss": 0.0002, "reward": 1.4832590073347092, "reward_std": 0.1522468039765954, "rewards/equation_reward_func": 0.4988839514553547, "rewards/format_reward_func": 0.9843750335276127, "step": 414 }, { "completion_length": 240.4453239440918, "epoch": 0.5176543785969824, "grad_norm": 0.034718644688255154, "kl": 0.20037841796875, "learning_rate": 7.46485518885462e-09, "loss": 0.0002, "reward": 1.4151786491274834, "reward_std": 0.13582705985754728, "rewards/equation_reward_func": 0.43750002048909664, "rewards/format_reward_func": 0.9776785969734192, "step": 416 }, { "completion_length": 227.1093864440918, "epoch": 0.5201431015710064, "grad_norm": 0.05669666354887382, "kl": 0.234375, "learning_rate": 6.616247970698319e-09, "loss": 0.0002, "reward": 1.4988840073347092, "reward_std": 0.13698529778048396, "rewards/equation_reward_func": 0.5189732406288385, "rewards/format_reward_func": 0.9799107424914837, "step": 418 }, { "completion_length": 262.71653270721436, "epoch": 0.5226318245450303, "grad_norm": 0.054401514129177465, "kl": 0.18310546875, "learning_rate": 5.8181848940044855e-09, "loss": 0.0002, "reward": 1.3281250670552254, "reward_std": 0.1366122462786734, "rewards/equation_reward_func": 0.34375001303851604, "rewards/format_reward_func": 0.9843750298023224, "step": 420 }, { "completion_length": 235.43639755249023, "epoch": 0.5251205475190542, "grad_norm": 0.07834852740210009, "kl": 0.21807861328125, "learning_rate": 5.070831694623135e-09, "loss": 0.0002, "reward": 1.470982201397419, "reward_std": 0.14967900328338146, "rewards/equation_reward_func": 0.48995538521558046, "rewards/format_reward_func": 0.9810268096625805, "step": 422 }, { "completion_length": 250.70983409881592, "epoch": 0.5276092704930783, "grad_norm": 0.06584760953247815, "kl": 0.19866943359375, "learning_rate": 4.374343577351336e-09, "loss": 0.0002, "reward": 1.416294701397419, "reward_std": 0.18149724043905735, "rewards/equation_reward_func": 0.44866074062883854, "rewards/format_reward_func": 0.9676339700818062, "step": 424 }, { "completion_length": 242.2321538925171, "epoch": 0.5300979934671022, "grad_norm": 0.05239977644165029, "kl": 0.2098388671875, "learning_rate": 3.7288651837012745e-09, "loss": 0.0002, "reward": 1.4330357685685158, "reward_std": 0.16506521217525005, "rewards/equation_reward_func": 0.45312501955777407, "rewards/format_reward_func": 0.979910746216774, "step": 426 }, { "completion_length": 215.3493413925171, "epoch": 0.5325867164411261, "grad_norm": 0.08270584148740566, "kl": 0.202392578125, "learning_rate": 3.134530561862081e-09, "loss": 0.0002, "reward": 1.5446429327130318, "reward_std": 0.14123681280761957, "rewards/equation_reward_func": 0.5558035969734192, "rewards/format_reward_func": 0.9888393133878708, "step": 428 }, { "completion_length": 241.829252243042, "epoch": 0.5350754394151501, "grad_norm": 0.054858889512319424, "kl": 0.1895751953125, "learning_rate": 2.5914631388619103e-09, "loss": 0.0002, "reward": 1.4241072088479996, "reward_std": 0.10743419965729117, "rewards/equation_reward_func": 0.43750002048909664, "rewards/format_reward_func": 0.9866071715950966, "step": 430 }, { "completion_length": 227.63059043884277, "epoch": 0.537564162389174, "grad_norm": 0.06298373988036782, "kl": 0.2347412109375, "learning_rate": 2.0997756949353297e-09, "loss": 0.0002, "reward": 1.4654018580913544, "reward_std": 0.13623460568487644, "rewards/equation_reward_func": 0.482142879627645, "rewards/format_reward_func": 0.983258955180645, "step": 432 }, { "completion_length": 231.58482933044434, "epoch": 0.540052885363198, "grad_norm": 0.05735572859329674, "kl": 0.241455078125, "learning_rate": 1.6595703401020844e-09, "loss": 0.0002, "reward": 1.487723283469677, "reward_std": 0.1775469919666648, "rewards/equation_reward_func": 0.5156250279396772, "rewards/format_reward_func": 0.9720982536673546, "step": 434 }, { "completion_length": 226.0413064956665, "epoch": 0.542541608337222, "grad_norm": 0.07098124510409774, "kl": 0.3662109375, "learning_rate": 1.2709384929615596e-09, "loss": 0.0004, "reward": 1.4821429252624512, "reward_std": 0.14848907012492418, "rewards/equation_reward_func": 0.49888394959270954, "rewards/format_reward_func": 0.983258955180645, "step": 436 }, { "completion_length": 233.2154140472412, "epoch": 0.5450303313112459, "grad_norm": 0.0583032288090882, "kl": 0.234619140625, "learning_rate": 9.339608617077165e-10, "loss": 0.0002, "reward": 1.4352679178118706, "reward_std": 0.12030374212190509, "rewards/equation_reward_func": 0.4564732341095805, "rewards/format_reward_func": 0.978794664144516, "step": 438 }, { "completion_length": 231.55581378936768, "epoch": 0.5475190542852699, "grad_norm": 0.05202154689567301, "kl": 0.19830322265625, "learning_rate": 6.487074273681114e-10, "loss": 0.0002, "reward": 1.475446492433548, "reward_std": 0.16154285473749042, "rewards/equation_reward_func": 0.49107144586741924, "rewards/format_reward_func": 0.9843750298023224, "step": 440 }, { "completion_length": 226.89844608306885, "epoch": 0.5500077772592938, "grad_norm": 0.06559019258824787, "kl": 0.212890625, "learning_rate": 4.152374292708538e-10, "loss": 0.0002, "reward": 1.455357201397419, "reward_std": 0.10218051401898265, "rewards/equation_reward_func": 0.4654018096625805, "rewards/format_reward_func": 0.9899553880095482, "step": 442 }, { "completion_length": 220.54799938201904, "epoch": 0.5524965002333178, "grad_norm": 0.05408063170536412, "kl": 0.2149658203125, "learning_rate": 2.3359935274214204e-10, "loss": 0.0002, "reward": 1.5022322237491608, "reward_std": 0.14629516191780567, "rewards/equation_reward_func": 0.5167410969734192, "rewards/format_reward_func": 0.9854911044239998, "step": 444 }, { "completion_length": 228.62389469146729, "epoch": 0.5549852232073418, "grad_norm": 0.04759889084008438, "kl": 0.21331787109375, "learning_rate": 1.0383091903720665e-10, "loss": 0.0002, "reward": 1.507812574505806, "reward_std": 0.13805820420384407, "rewards/equation_reward_func": 0.5245535988360643, "rewards/format_reward_func": 0.9832589700818062, "step": 446 }, { "completion_length": 239.22768783569336, "epoch": 0.5574739461813657, "grad_norm": 0.06951974665374468, "kl": 0.20892333984375, "learning_rate": 2.595907750671533e-11, "loss": 0.0002, "reward": 1.4174107685685158, "reward_std": 0.1395715670660138, "rewards/equation_reward_func": 0.43638394959270954, "rewards/format_reward_func": 0.981026828289032, "step": 448 }, { "completion_length": 230.02009963989258, "epoch": 0.5599626691553896, "grad_norm": 0.06979530747254646, "kl": 0.21221923828125, "learning_rate": 0.0, "loss": 0.0002, "reward": 1.4553571939468384, "reward_std": 0.12265546387061477, "rewards/equation_reward_func": 0.4776785969734192, "rewards/format_reward_func": 0.9776786081492901, "step": 450 }, { "epoch": 0.5599626691553896, "step": 450, "total_flos": 0.0, "train_loss": 0.0003818092903999215, "train_runtime": 35419.6415, "train_samples_per_second": 0.711, "train_steps_per_second": 0.013 } ], "logging_steps": 2, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }