qwen-2.5-3b-r1-countdown / trainer_state.json
mgaimm's picture
Model save
b315b7b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.24,
"eval_steps": 500,
"global_step": 450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 505.9479351043701,
"epoch": 0.0010666666666666667,
"grad_norm": 0.1341146091763646,
"kl": 0.0,
"learning_rate": 7.142857142857142e-08,
"loss": -0.0,
"reward": 0.3046875079162419,
"reward_std": 0.4547263579443097,
"rewards/equation_reward_func": 0.03645833441987634,
"rewards/format_reward_func": 0.26822917466051877,
"step": 2
},
{
"completion_length": 511.1562671661377,
"epoch": 0.0021333333333333334,
"grad_norm": 0.11909890519385766,
"kl": 0.0004194974899291992,
"learning_rate": 1.4285714285714285e-07,
"loss": 0.0,
"reward": 0.3098958386108279,
"reward_std": 0.4707766156643629,
"rewards/equation_reward_func": 0.0494791679084301,
"rewards/format_reward_func": 0.26041667349636555,
"step": 4
},
{
"completion_length": 484.7057456970215,
"epoch": 0.0032,
"grad_norm": 0.10838779091076944,
"kl": 0.0003948211669921875,
"learning_rate": 2.1428571428571426e-07,
"loss": 0.0,
"reward": 0.25520834187045693,
"reward_std": 0.4062541304156184,
"rewards/equation_reward_func": 0.04427083418704569,
"rewards/format_reward_func": 0.2109375053551048,
"step": 6
},
{
"completion_length": 502.9635524749756,
"epoch": 0.004266666666666667,
"grad_norm": 0.11359857035268194,
"kl": 0.00040793418884277344,
"learning_rate": 2.857142857142857e-07,
"loss": 0.0,
"reward": 0.3229166753590107,
"reward_std": 0.4702935107052326,
"rewards/equation_reward_func": 0.07291666860692203,
"rewards/format_reward_func": 0.250000006519258,
"step": 8
},
{
"completion_length": 472.85418128967285,
"epoch": 0.005333333333333333,
"grad_norm": 0.13319802291837166,
"kl": 0.00041985511779785156,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.0,
"reward": 0.3359375074505806,
"reward_std": 0.47765984758734703,
"rewards/equation_reward_func": 0.052083334885537624,
"rewards/format_reward_func": 0.28385417396202683,
"step": 10
},
{
"completion_length": 474.9609489440918,
"epoch": 0.0064,
"grad_norm": 0.1283866658242959,
"kl": 0.0004889965057373047,
"learning_rate": 4.285714285714285e-07,
"loss": 0.0,
"reward": 0.40625000512227416,
"reward_std": 0.5288777491077781,
"rewards/equation_reward_func": 0.0703125016298145,
"rewards/format_reward_func": 0.33593750884756446,
"step": 12
},
{
"completion_length": 487.7526226043701,
"epoch": 0.007466666666666667,
"grad_norm": 0.11491878824082066,
"kl": 0.0008172988891601562,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 0.3984375139698386,
"reward_std": 0.4919305704534054,
"rewards/equation_reward_func": 0.05208333395421505,
"rewards/format_reward_func": 0.3463541781529784,
"step": 14
},
{
"completion_length": 468.70313835144043,
"epoch": 0.008533333333333334,
"grad_norm": 0.12246266971775394,
"kl": 0.0011203289031982422,
"learning_rate": 4.999740409224932e-07,
"loss": 0.0,
"reward": 0.5494791846722364,
"reward_std": 0.5318632125854492,
"rewards/equation_reward_func": 0.08854167023673654,
"rewards/format_reward_func": 0.46093751303851604,
"step": 16
},
{
"completion_length": 454.82292556762695,
"epoch": 0.0096,
"grad_norm": 0.10480668657811888,
"kl": 0.00298309326171875,
"learning_rate": 4.998961690809627e-07,
"loss": 0.0,
"reward": 0.6796875204890966,
"reward_std": 0.5534657873213291,
"rewards/equation_reward_func": 0.06770833465270698,
"rewards/format_reward_func": 0.611979192122817,
"step": 18
},
{
"completion_length": 453.3411560058594,
"epoch": 0.010666666666666666,
"grad_norm": 0.11208435254258003,
"kl": 0.005069732666015625,
"learning_rate": 4.997664006472578e-07,
"loss": 0.0,
"reward": 0.7500000186264515,
"reward_std": 0.5607063695788383,
"rewards/equation_reward_func": 0.0859375016298145,
"rewards/format_reward_func": 0.6640625167638063,
"step": 20
},
{
"completion_length": 450.89063262939453,
"epoch": 0.011733333333333333,
"grad_norm": 0.10552116383248636,
"kl": 0.005932807922363281,
"learning_rate": 4.995847625707292e-07,
"loss": 0.0,
"reward": 0.8593750149011612,
"reward_std": 0.5123661290854216,
"rewards/equation_reward_func": 0.09895833511836827,
"rewards/format_reward_func": 0.7604166939854622,
"step": 22
},
{
"completion_length": 449.1198043823242,
"epoch": 0.0128,
"grad_norm": 0.10482422281110657,
"kl": 0.007808685302734375,
"learning_rate": 4.993512925726318e-07,
"loss": 0.0,
"reward": 0.8958333544433117,
"reward_std": 0.44584160670638084,
"rewards/equation_reward_func": 0.07812500302679837,
"rewards/format_reward_func": 0.8177083507180214,
"step": 24
},
{
"completion_length": 437.75782012939453,
"epoch": 0.013866666666666666,
"grad_norm": 0.08078578907154227,
"kl": 0.0073490142822265625,
"learning_rate": 4.990660391382923e-07,
"loss": 0.0,
"reward": 0.9505208656191826,
"reward_std": 0.40651129884645343,
"rewards/equation_reward_func": 0.09895833604969084,
"rewards/format_reward_func": 0.8515625186264515,
"step": 26
},
{
"completion_length": 441.77345275878906,
"epoch": 0.014933333333333333,
"grad_norm": 0.08279347003242855,
"kl": 0.00899505615234375,
"learning_rate": 4.987290615070384e-07,
"loss": 0.0,
"reward": 0.9583333544433117,
"reward_std": 0.31725937221199274,
"rewards/equation_reward_func": 0.06510416814126074,
"rewards/format_reward_func": 0.893229179084301,
"step": 28
},
{
"completion_length": 438.5364685058594,
"epoch": 0.016,
"grad_norm": 0.07534793657846317,
"kl": 0.01221466064453125,
"learning_rate": 4.983404296598978e-07,
"loss": 0.0,
"reward": 1.0104166828095913,
"reward_std": 0.286367348395288,
"rewards/equation_reward_func": 0.08072916860692203,
"rewards/format_reward_func": 0.9296875186264515,
"step": 30
},
{
"completion_length": 446.8125114440918,
"epoch": 0.017066666666666667,
"grad_norm": 0.07968622664869553,
"kl": 0.011959075927734375,
"learning_rate": 4.979002243050646e-07,
"loss": 0.0,
"reward": 1.0026041977107525,
"reward_std": 0.344503759406507,
"rewards/equation_reward_func": 0.09635416860692203,
"rewards/format_reward_func": 0.9062500223517418,
"step": 32
},
{
"completion_length": 427.65886306762695,
"epoch": 0.018133333333333335,
"grad_norm": 0.08471832672268678,
"kl": 0.027118682861328125,
"learning_rate": 4.974085368611381e-07,
"loss": 0.0,
"reward": 1.0442708693444729,
"reward_std": 0.2840048740617931,
"rewards/equation_reward_func": 0.1015625037252903,
"rewards/format_reward_func": 0.9427083544433117,
"step": 34
},
{
"completion_length": 444.25261878967285,
"epoch": 0.0192,
"grad_norm": 0.06852883069586094,
"kl": 0.01403045654296875,
"learning_rate": 4.968654694381379e-07,
"loss": 0.0,
"reward": 0.9921875335276127,
"reward_std": 0.21781930467113853,
"rewards/equation_reward_func": 0.05208333511836827,
"rewards/format_reward_func": 0.9401041865348816,
"step": 36
},
{
"completion_length": 383.59896659851074,
"epoch": 0.020266666666666665,
"grad_norm": 0.08057979196934888,
"kl": 0.0169677734375,
"learning_rate": 4.962711348162987e-07,
"loss": 0.0,
"reward": 1.1223958656191826,
"reward_std": 0.2702699927613139,
"rewards/equation_reward_func": 0.14062500395812094,
"rewards/format_reward_func": 0.9817708507180214,
"step": 38
},
{
"completion_length": 411.5677185058594,
"epoch": 0.021333333333333333,
"grad_norm": 0.07850698291787955,
"kl": 0.017246246337890625,
"learning_rate": 4.956256564226487e-07,
"loss": 0.0,
"reward": 1.0989583656191826,
"reward_std": 0.2887088777497411,
"rewards/equation_reward_func": 0.13281250279396772,
"rewards/format_reward_func": 0.9661458507180214,
"step": 40
},
{
"completion_length": 396.64584159851074,
"epoch": 0.0224,
"grad_norm": 0.07505103817031399,
"kl": 0.017597198486328125,
"learning_rate": 4.949291683053768e-07,
"loss": 0.0,
"reward": 1.0807291977107525,
"reward_std": 0.2670950279571116,
"rewards/equation_reward_func": 0.11458333698101342,
"rewards/format_reward_func": 0.9661458507180214,
"step": 42
},
{
"completion_length": 387.0468864440918,
"epoch": 0.023466666666666667,
"grad_norm": 0.09175240895759779,
"kl": 0.017871856689453125,
"learning_rate": 4.941818151059955e-07,
"loss": 0.0,
"reward": 1.1015625335276127,
"reward_std": 0.2870901683345437,
"rewards/equation_reward_func": 0.1354166711680591,
"rewards/format_reward_func": 0.9661458507180214,
"step": 44
},
{
"completion_length": 400.3645935058594,
"epoch": 0.024533333333333334,
"grad_norm": 0.09491357639118295,
"kl": 0.019161224365234375,
"learning_rate": 4.933837520293017e-07,
"loss": 0.0,
"reward": 1.070312537252903,
"reward_std": 0.2785795754753053,
"rewards/equation_reward_func": 0.10937500442378223,
"rewards/format_reward_func": 0.9609375260770321,
"step": 46
},
{
"completion_length": 403.8671989440918,
"epoch": 0.0256,
"grad_norm": 0.08449768835766272,
"kl": 0.01947021484375,
"learning_rate": 4.925351448111454e-07,
"loss": 0.0,
"reward": 1.0598958767950535,
"reward_std": 0.1955897193402052,
"rewards/equation_reward_func": 0.08333333604969084,
"rewards/format_reward_func": 0.9765625149011612,
"step": 48
},
{
"completion_length": 384.1770896911621,
"epoch": 0.02666666666666667,
"grad_norm": 0.09879170444522951,
"kl": 0.02040863037109375,
"learning_rate": 4.91636169684011e-07,
"loss": 0.0,
"reward": 1.1223958730697632,
"reward_std": 0.31093722581863403,
"rewards/equation_reward_func": 0.1406250037252903,
"rewards/format_reward_func": 0.9817708432674408,
"step": 50
},
{
"completion_length": 391.669282913208,
"epoch": 0.027733333333333332,
"grad_norm": 0.10757568231914379,
"kl": 0.0244903564453125,
"learning_rate": 4.906870133404186e-07,
"loss": 0.0,
"reward": 1.1197916977107525,
"reward_std": 0.3494974756613374,
"rewards/equation_reward_func": 0.15885417140088975,
"rewards/format_reward_func": 0.9609375186264515,
"step": 52
},
{
"completion_length": 387.16407012939453,
"epoch": 0.0288,
"grad_norm": 0.0916962283697697,
"kl": 0.02394866943359375,
"learning_rate": 4.896878728941531e-07,
"loss": 0.0,
"reward": 1.1067708656191826,
"reward_std": 0.25607615802437067,
"rewards/equation_reward_func": 0.1328125020954758,
"rewards/format_reward_func": 0.9739583544433117,
"step": 54
},
{
"completion_length": 346.4114646911621,
"epoch": 0.029866666666666666,
"grad_norm": 0.09993350369732659,
"kl": 0.0276031494140625,
"learning_rate": 4.886389558393284e-07,
"loss": 0.0,
"reward": 1.1510416939854622,
"reward_std": 0.2859157114289701,
"rewards/equation_reward_func": 0.16145833767950535,
"rewards/format_reward_func": 0.9895833432674408,
"step": 56
},
{
"completion_length": 361.99219512939453,
"epoch": 0.030933333333333334,
"grad_norm": 0.11653485215024455,
"kl": 0.02984619140625,
"learning_rate": 4.875404800072976e-07,
"loss": 0.0,
"reward": 1.1640625447034836,
"reward_std": 0.3471745736896992,
"rewards/equation_reward_func": 0.18750000558793545,
"rewards/format_reward_func": 0.9765625149011612,
"step": 58
},
{
"completion_length": 367.1015739440918,
"epoch": 0.032,
"grad_norm": 0.07180913754511904,
"kl": 0.03044891357421875,
"learning_rate": 4.86392673521415e-07,
"loss": 0.0,
"reward": 1.0911458805203438,
"reward_std": 0.1999878236092627,
"rewards/equation_reward_func": 0.10416666930541396,
"rewards/format_reward_func": 0.9869791753590107,
"step": 60
},
{
"completion_length": 366.5208435058594,
"epoch": 0.03306666666666667,
"grad_norm": 0.08088172620555445,
"kl": 0.0330810546875,
"learning_rate": 4.851957747496606e-07,
"loss": 0.0,
"reward": 1.1510416939854622,
"reward_std": 0.28296295227482915,
"rewards/equation_reward_func": 0.16927083488553762,
"rewards/format_reward_func": 0.9817708469927311,
"step": 62
},
{
"completion_length": 357.73178482055664,
"epoch": 0.034133333333333335,
"grad_norm": 0.0844167380266008,
"kl": 0.03631591796875,
"learning_rate": 4.839500322551386e-07,
"loss": 0.0,
"reward": 1.1197916939854622,
"reward_std": 0.2452517431229353,
"rewards/equation_reward_func": 0.14843750186264515,
"rewards/format_reward_func": 0.9713541753590107,
"step": 64
},
{
"completion_length": 353.9739685058594,
"epoch": 0.0352,
"grad_norm": 0.0778527671209511,
"kl": 0.041229248046875,
"learning_rate": 4.826557047444563e-07,
"loss": 0.0,
"reward": 1.1796875298023224,
"reward_std": 0.30663188826292753,
"rewards/equation_reward_func": 0.19791667279787362,
"rewards/format_reward_func": 0.9817708469927311,
"step": 66
},
{
"completion_length": 348.2239685058594,
"epoch": 0.03626666666666667,
"grad_norm": 0.07408528500512421,
"kl": 0.044708251953125,
"learning_rate": 4.813130610139993e-07,
"loss": 0.0,
"reward": 1.0729167014360428,
"reward_std": 0.17930190591141582,
"rewards/equation_reward_func": 0.0885416695382446,
"rewards/format_reward_func": 0.9843750149011612,
"step": 68
},
{
"completion_length": 318.35938835144043,
"epoch": 0.037333333333333336,
"grad_norm": 0.10471668022395769,
"kl": 0.0505828857421875,
"learning_rate": 4.799223798941089e-07,
"loss": 0.0001,
"reward": 1.187500037252903,
"reward_std": 0.2974981819279492,
"rewards/equation_reward_func": 0.2031250053551048,
"rewards/format_reward_func": 0.9843750111758709,
"step": 70
},
{
"completion_length": 312.2213659286499,
"epoch": 0.0384,
"grad_norm": 0.08445574387607607,
"kl": 0.058990478515625,
"learning_rate": 4.78483950191177e-07,
"loss": 0.0001,
"reward": 1.1562500298023224,
"reward_std": 0.23554043704643846,
"rewards/equation_reward_func": 0.17187500651925802,
"rewards/format_reward_func": 0.9843750111758709,
"step": 72
},
{
"completion_length": 320.13542556762695,
"epoch": 0.039466666666666664,
"grad_norm": 0.10154941280104149,
"kl": 0.0615997314453125,
"learning_rate": 4.769980706276687e-07,
"loss": 0.0001,
"reward": 1.1770833730697632,
"reward_std": 0.26962050748988986,
"rewards/equation_reward_func": 0.19270834000781178,
"rewards/format_reward_func": 0.9843750111758709,
"step": 74
},
{
"completion_length": 334.70052909851074,
"epoch": 0.04053333333333333,
"grad_norm": 0.08509345877302323,
"kl": 0.061676025390625,
"learning_rate": 4.7546504978008595e-07,
"loss": 0.0001,
"reward": 1.1458333730697632,
"reward_std": 0.20033816620707512,
"rewards/equation_reward_func": 0.15885417233221233,
"rewards/format_reward_func": 0.986979179084301,
"step": 76
},
{
"completion_length": 333.23438453674316,
"epoch": 0.0416,
"grad_norm": 0.10027144175078107,
"kl": 0.065399169921875,
"learning_rate": 4.738852060148848e-07,
"loss": 0.0001,
"reward": 1.1171875447034836,
"reward_std": 0.23261011950671673,
"rewards/equation_reward_func": 0.13541667070239782,
"rewards/format_reward_func": 0.9817708507180214,
"step": 78
},
{
"completion_length": 331.69011878967285,
"epoch": 0.042666666666666665,
"grad_norm": 0.07507534432076213,
"kl": 0.071014404296875,
"learning_rate": 4.722588674223593e-07,
"loss": 0.0001,
"reward": 1.1276042014360428,
"reward_std": 0.2506814347580075,
"rewards/equation_reward_func": 0.14843750442378223,
"rewards/format_reward_func": 0.9791666828095913,
"step": 80
},
{
"completion_length": 344.0781364440918,
"epoch": 0.04373333333333333,
"grad_norm": 0.09863254302808237,
"kl": 0.070526123046875,
"learning_rate": 4.70586371748506e-07,
"loss": 0.0001,
"reward": 1.2031250447034836,
"reward_std": 0.2764001186005771,
"rewards/equation_reward_func": 0.2109375074505806,
"rewards/format_reward_func": 0.9921875074505806,
"step": 82
},
{
"completion_length": 330.4479274749756,
"epoch": 0.0448,
"grad_norm": 0.10155910053999813,
"kl": 0.07550048828125,
"learning_rate": 4.6886806632488363e-07,
"loss": 0.0001,
"reward": 1.2708333730697632,
"reward_std": 0.3232872476801276,
"rewards/equation_reward_func": 0.28906251140870154,
"rewards/format_reward_func": 0.9817708469927311,
"step": 84
},
{
"completion_length": 341.1197986602783,
"epoch": 0.04586666666666667,
"grad_norm": 0.09455703883061281,
"kl": 0.07513427734375,
"learning_rate": 4.6710430799648143e-07,
"loss": 0.0001,
"reward": 1.1953125298023224,
"reward_std": 0.3194303079508245,
"rewards/equation_reward_func": 0.2291666753590107,
"rewards/format_reward_func": 0.9661458507180214,
"step": 86
},
{
"completion_length": 341.54427909851074,
"epoch": 0.046933333333333334,
"grad_norm": 0.08290471243926564,
"kl": 0.077423095703125,
"learning_rate": 4.652954630476127e-07,
"loss": 0.0001,
"reward": 1.1979167014360428,
"reward_std": 0.2291324818506837,
"rewards/equation_reward_func": 0.2239583395421505,
"rewards/format_reward_func": 0.9739583469927311,
"step": 88
},
{
"completion_length": 335.0989627838135,
"epoch": 0.048,
"grad_norm": 0.10748566516697469,
"kl": 0.088287353515625,
"learning_rate": 4.6344190712584713e-07,
"loss": 0.0001,
"reward": 1.1692708730697632,
"reward_std": 0.3015799345448613,
"rewards/equation_reward_func": 0.1979166700039059,
"rewards/format_reward_func": 0.9713541902601719,
"step": 90
},
{
"completion_length": 314.3177185058594,
"epoch": 0.04906666666666667,
"grad_norm": 0.10758855837243832,
"kl": 0.08538818359375,
"learning_rate": 4.615440251639995e-07,
"loss": 0.0001,
"reward": 1.3151041977107525,
"reward_std": 0.3682410903275013,
"rewards/equation_reward_func": 0.33593750838190317,
"rewards/format_reward_func": 0.9791666753590107,
"step": 92
},
{
"completion_length": 313.01302909851074,
"epoch": 0.050133333333333335,
"grad_norm": 0.124066638172858,
"kl": 0.0859375,
"learning_rate": 4.596022113001894e-07,
"loss": 0.0001,
"reward": 1.276041705161333,
"reward_std": 0.30914933141320944,
"rewards/equation_reward_func": 0.29427084303461015,
"rewards/format_reward_func": 0.9817708432674408,
"step": 94
},
{
"completion_length": 334.85417556762695,
"epoch": 0.0512,
"grad_norm": 0.1018803932324317,
"kl": 0.088104248046875,
"learning_rate": 4.576168687959895e-07,
"loss": 0.0001,
"reward": 1.2135417014360428,
"reward_std": 0.2573512555100024,
"rewards/equation_reward_func": 0.2395833416376263,
"rewards/format_reward_func": 0.9739583469927311,
"step": 96
},
{
"completion_length": 351.9817810058594,
"epoch": 0.05226666666666667,
"grad_norm": 0.10509374857128695,
"kl": 0.098785400390625,
"learning_rate": 4.555884099526793e-07,
"loss": 0.0001,
"reward": 1.250000037252903,
"reward_std": 0.29483586829155684,
"rewards/equation_reward_func": 0.27083334093913436,
"rewards/format_reward_func": 0.9791666753590107,
"step": 98
},
{
"completion_length": 358.38021755218506,
"epoch": 0.05333333333333334,
"grad_norm": 0.0978516383302316,
"kl": 0.08575439453125,
"learning_rate": 4.5351725602562174e-07,
"loss": 0.0001,
"reward": 1.2942708656191826,
"reward_std": 0.32903878297656775,
"rewards/equation_reward_func": 0.3203125046566129,
"rewards/format_reward_func": 0.9739583432674408,
"step": 100
},
{
"completion_length": 361.90365409851074,
"epoch": 0.0544,
"grad_norm": 0.07892841773395727,
"kl": 0.092681884765625,
"learning_rate": 4.514038371367791e-07,
"loss": 0.0001,
"reward": 1.2838542014360428,
"reward_std": 0.23603887297213078,
"rewards/equation_reward_func": 0.2942708432674408,
"rewards/format_reward_func": 0.9895833395421505,
"step": 102
},
{
"completion_length": 370.2447986602783,
"epoch": 0.055466666666666664,
"grad_norm": 0.07956969957231312,
"kl": 0.088226318359375,
"learning_rate": 4.4924859218538936e-07,
"loss": 0.0001,
"reward": 1.2682292088866234,
"reward_std": 0.2611841419711709,
"rewards/equation_reward_func": 0.2838541760575026,
"rewards/format_reward_func": 0.9843750074505806,
"step": 104
},
{
"completion_length": 405.403657913208,
"epoch": 0.05653333333333333,
"grad_norm": 0.10207984517578009,
"kl": 0.0877227783203125,
"learning_rate": 4.470519687568185e-07,
"loss": 0.0001,
"reward": 1.2786458618938923,
"reward_std": 0.27022232208400965,
"rewards/equation_reward_func": 0.31250000838190317,
"rewards/format_reward_func": 0.9661458507180214,
"step": 106
},
{
"completion_length": 392.93490982055664,
"epoch": 0.0576,
"grad_norm": 0.08438917528245744,
"kl": 0.0877685546875,
"learning_rate": 4.4481442302960923e-07,
"loss": 0.0001,
"reward": 1.3072917014360428,
"reward_std": 0.31525306357070804,
"rewards/equation_reward_func": 0.34375000838190317,
"rewards/format_reward_func": 0.963541679084301,
"step": 108
},
{
"completion_length": 399.8698024749756,
"epoch": 0.058666666666666666,
"grad_norm": 0.08270590545214734,
"kl": 0.09637451171875,
"learning_rate": 4.4253641968074505e-07,
"loss": 0.0001,
"reward": 1.268229190260172,
"reward_std": 0.24568770825862885,
"rewards/equation_reward_func": 0.3046875062864274,
"rewards/format_reward_func": 0.9635416828095913,
"step": 110
},
{
"completion_length": 409.60417556762695,
"epoch": 0.05973333333333333,
"grad_norm": 0.10271913225077348,
"kl": 0.0924072265625,
"learning_rate": 4.402184317891501e-07,
"loss": 0.0001,
"reward": 1.2812500335276127,
"reward_std": 0.33530174382030964,
"rewards/equation_reward_func": 0.3385416748933494,
"rewards/format_reward_func": 0.9427083544433117,
"step": 112
},
{
"completion_length": 416.4088649749756,
"epoch": 0.0608,
"grad_norm": 0.08166810576477633,
"kl": 0.095794677734375,
"learning_rate": 4.37860940737443e-07,
"loss": 0.0001,
"reward": 1.1770833805203438,
"reward_std": 0.26351519441232085,
"rewards/equation_reward_func": 0.22395834187045693,
"rewards/format_reward_func": 0.9531250149011612,
"step": 114
},
{
"completion_length": 390.3463611602783,
"epoch": 0.06186666666666667,
"grad_norm": 0.09414353563065953,
"kl": 0.11090087890625,
"learning_rate": 4.354644361119671e-07,
"loss": 0.0001,
"reward": 1.398437537252903,
"reward_std": 0.30470984475687146,
"rewards/equation_reward_func": 0.42187501583248377,
"rewards/format_reward_func": 0.9765625149011612,
"step": 116
},
{
"completion_length": 378.01563358306885,
"epoch": 0.06293333333333333,
"grad_norm": 0.07635029320541607,
"kl": 0.124725341796875,
"learning_rate": 4.3302941560111716e-07,
"loss": 0.0001,
"reward": 1.3958333730697632,
"reward_std": 0.36394598754122853,
"rewards/equation_reward_func": 0.4166666765231639,
"rewards/format_reward_func": 0.9791666865348816,
"step": 118
},
{
"completion_length": 392.65625953674316,
"epoch": 0.064,
"grad_norm": 0.0833024147650861,
"kl": 0.1026611328125,
"learning_rate": 4.3055638489198236e-07,
"loss": 0.0001,
"reward": 1.3359375298023224,
"reward_std": 0.37286510691046715,
"rewards/equation_reward_func": 0.3906250102445483,
"rewards/format_reward_func": 0.9453125186264515,
"step": 120
},
{
"completion_length": 399.5078182220459,
"epoch": 0.06506666666666666,
"grad_norm": 0.0892199212165042,
"kl": 0.1014404296875,
"learning_rate": 4.280458575653296e-07,
"loss": 0.0001,
"reward": 1.3307292088866234,
"reward_std": 0.3504871279001236,
"rewards/equation_reward_func": 0.38802084513008595,
"rewards/format_reward_func": 0.9427083507180214,
"step": 122
},
{
"completion_length": 450.1354331970215,
"epoch": 0.06613333333333334,
"grad_norm": 0.06581923430481687,
"kl": 0.114990234375,
"learning_rate": 4.2549835498894665e-07,
"loss": 0.0001,
"reward": 1.2604166939854622,
"reward_std": 0.3068140549585223,
"rewards/equation_reward_func": 0.32552084559574723,
"rewards/format_reward_func": 0.9348958544433117,
"step": 124
},
{
"completion_length": 390.036470413208,
"epoch": 0.0672,
"grad_norm": 0.07114986931726634,
"kl": 0.10528564453125,
"learning_rate": 4.229144062093679e-07,
"loss": 0.0001,
"reward": 1.3723958730697632,
"reward_std": 0.29870040342211723,
"rewards/equation_reward_func": 0.39843751303851604,
"rewards/format_reward_func": 0.9739583469927311,
"step": 126
},
{
"completion_length": 392.59115505218506,
"epoch": 0.06826666666666667,
"grad_norm": 0.0877107079994648,
"kl": 0.109405517578125,
"learning_rate": 4.2029454784200675e-07,
"loss": 0.0001,
"reward": 1.390625037252903,
"reward_std": 0.280646042432636,
"rewards/equation_reward_func": 0.42447917722165585,
"rewards/format_reward_func": 0.9661458507180214,
"step": 128
},
{
"completion_length": 421.0078191757202,
"epoch": 0.06933333333333333,
"grad_norm": 0.09643905280459295,
"kl": 0.10009765625,
"learning_rate": 4.1763932395971433e-07,
"loss": 0.0001,
"reward": 1.2942708693444729,
"reward_std": 0.3986189612187445,
"rewards/equation_reward_func": 0.3567708421032876,
"rewards/format_reward_func": 0.9375000223517418,
"step": 130
},
{
"completion_length": 461.4791736602783,
"epoch": 0.0704,
"grad_norm": 0.06366382823979087,
"kl": 0.101837158203125,
"learning_rate": 4.1494928597979117e-07,
"loss": 0.0001,
"reward": 1.2760417088866234,
"reward_std": 0.27500381181016564,
"rewards/equation_reward_func": 0.32291667629033327,
"rewards/format_reward_func": 0.9531250186264515,
"step": 132
},
{
"completion_length": 394.3255319595337,
"epoch": 0.07146666666666666,
"grad_norm": 0.0853911421540347,
"kl": 0.130126953125,
"learning_rate": 4.122249925494726e-07,
"loss": 0.0001,
"reward": 1.403645858168602,
"reward_std": 0.25308565702289343,
"rewards/equation_reward_func": 0.43229168001562357,
"rewards/format_reward_func": 0.9713541828095913,
"step": 134
},
{
"completion_length": 426.466157913208,
"epoch": 0.07253333333333334,
"grad_norm": 0.0692987274556644,
"kl": 0.11669921875,
"learning_rate": 4.094670094299131e-07,
"loss": 0.0001,
"reward": 1.281250037252903,
"reward_std": 0.316250397823751,
"rewards/equation_reward_func": 0.33072917559184134,
"rewards/format_reward_func": 0.9505208469927311,
"step": 136
},
{
"completion_length": 439.9010543823242,
"epoch": 0.0736,
"grad_norm": 0.08170658774133101,
"kl": 0.104949951171875,
"learning_rate": 4.066759093786931e-07,
"loss": 0.0001,
"reward": 1.2760416977107525,
"reward_std": 0.35973797645419836,
"rewards/equation_reward_func": 0.3411458428017795,
"rewards/format_reward_func": 0.9348958618938923,
"step": 138
},
{
"completion_length": 416.6406354904175,
"epoch": 0.07466666666666667,
"grad_norm": 0.06667199771271264,
"kl": 0.12353515625,
"learning_rate": 4.038522720308732e-07,
"loss": 0.0001,
"reward": 1.3854167088866234,
"reward_std": 0.21267010737210512,
"rewards/equation_reward_func": 0.4088541741948575,
"rewards/format_reward_func": 0.9765625149011612,
"step": 140
},
{
"completion_length": 440.536470413208,
"epoch": 0.07573333333333333,
"grad_norm": 0.0850091968151683,
"kl": 0.11474609375,
"learning_rate": 4.009966837786194e-07,
"loss": 0.0001,
"reward": 1.3255208693444729,
"reward_std": 0.30754279950633645,
"rewards/equation_reward_func": 0.36458334629423916,
"rewards/format_reward_func": 0.9609375074505806,
"step": 142
},
{
"completion_length": 401.09896659851074,
"epoch": 0.0768,
"grad_norm": 0.0768836508261685,
"kl": 0.116485595703125,
"learning_rate": 3.981097376494259e-07,
"loss": 0.0001,
"reward": 1.4557292237877846,
"reward_std": 0.31219895882532,
"rewards/equation_reward_func": 0.5026041828095913,
"rewards/format_reward_func": 0.9531250186264515,
"step": 144
},
{
"completion_length": 461.122407913208,
"epoch": 0.07786666666666667,
"grad_norm": 0.10600525349484782,
"kl": 0.114044189453125,
"learning_rate": 3.951920331829592e-07,
"loss": 0.0001,
"reward": 1.2890625335276127,
"reward_std": 0.2976598385721445,
"rewards/equation_reward_func": 0.3255208437331021,
"rewards/format_reward_func": 0.963541679084301,
"step": 146
},
{
"completion_length": 413.9713649749756,
"epoch": 0.07893333333333333,
"grad_norm": 0.0723392326431143,
"kl": 0.123321533203125,
"learning_rate": 3.922441763065506e-07,
"loss": 0.0001,
"reward": 1.4088542014360428,
"reward_std": 0.23494611913338304,
"rewards/equation_reward_func": 0.43750000931322575,
"rewards/format_reward_func": 0.9713541828095913,
"step": 148
},
{
"completion_length": 458.7604331970215,
"epoch": 0.08,
"grad_norm": 0.09048584328529992,
"kl": 0.12384033203125,
"learning_rate": 3.8926677920936093e-07,
"loss": 0.0001,
"reward": 1.2656250223517418,
"reward_std": 0.3245450472459197,
"rewards/equation_reward_func": 0.32031250838190317,
"rewards/format_reward_func": 0.9453125111758709,
"step": 150
},
{
"completion_length": 378.33073902130127,
"epoch": 0.08106666666666666,
"grad_norm": 0.10455674533718096,
"kl": 0.13720703125,
"learning_rate": 3.862604602152464e-07,
"loss": 0.0001,
"reward": 1.4244792014360428,
"reward_std": 0.26624298514798284,
"rewards/equation_reward_func": 0.46093751210719347,
"rewards/format_reward_func": 0.9635416828095913,
"step": 152
},
{
"completion_length": 421.9140796661377,
"epoch": 0.08213333333333334,
"grad_norm": 0.10103705731464013,
"kl": 0.129638671875,
"learning_rate": 3.8322584365434934e-07,
"loss": 0.0001,
"reward": 1.3255208879709244,
"reward_std": 0.24930242728441954,
"rewards/equation_reward_func": 0.3723958465270698,
"rewards/format_reward_func": 0.9531250186264515,
"step": 154
},
{
"completion_length": 463.58334159851074,
"epoch": 0.0832,
"grad_norm": 0.09221432956401719,
"kl": 0.127166748046875,
"learning_rate": 3.8016355973344173e-07,
"loss": 0.0001,
"reward": 1.234375037252903,
"reward_std": 0.2910663695074618,
"rewards/equation_reward_func": 0.28906250605359674,
"rewards/format_reward_func": 0.9453125260770321,
"step": 156
},
{
"completion_length": 419.51563835144043,
"epoch": 0.08426666666666667,
"grad_norm": 0.08138226453807305,
"kl": 0.1285400390625,
"learning_rate": 3.7707424440504863e-07,
"loss": 0.0001,
"reward": 1.3489583730697632,
"reward_std": 0.23599386168643832,
"rewards/equation_reward_func": 0.37500001094304025,
"rewards/format_reward_func": 0.9739583544433117,
"step": 158
},
{
"completion_length": 361.1015729904175,
"epoch": 0.08533333333333333,
"grad_norm": 0.13299459818559828,
"kl": 0.15423583984375,
"learning_rate": 3.739585392353787e-07,
"loss": 0.0002,
"reward": 1.434895858168602,
"reward_std": 0.28986221412196755,
"rewards/equation_reward_func": 0.458333341171965,
"rewards/format_reward_func": 0.9765625186264515,
"step": 160
},
{
"completion_length": 391.0599117279053,
"epoch": 0.0864,
"grad_norm": 0.10062549742509476,
"kl": 0.140045166015625,
"learning_rate": 3.7081709127108767e-07,
"loss": 0.0001,
"reward": 1.4244791939854622,
"reward_std": 0.2554763099178672,
"rewards/equation_reward_func": 0.4427083439659327,
"rewards/format_reward_func": 0.9817708469927311,
"step": 162
},
{
"completion_length": 346.71094512939453,
"epoch": 0.08746666666666666,
"grad_norm": 0.07557865430106443,
"kl": 0.165771484375,
"learning_rate": 3.6765055290490513e-07,
"loss": 0.0002,
"reward": 1.510416716337204,
"reward_std": 0.23889524163678288,
"rewards/equation_reward_func": 0.5390625223517418,
"rewards/format_reward_func": 0.9713541828095913,
"step": 164
},
{
"completion_length": 374.559907913208,
"epoch": 0.08853333333333334,
"grad_norm": 0.09484161296330915,
"kl": 0.145050048828125,
"learning_rate": 3.644595817401501e-07,
"loss": 0.0001,
"reward": 1.4140625596046448,
"reward_std": 0.2526052575558424,
"rewards/equation_reward_func": 0.43229167931713164,
"rewards/format_reward_func": 0.9817708507180214,
"step": 166
},
{
"completion_length": 389.614595413208,
"epoch": 0.0896,
"grad_norm": 0.10850466477020716,
"kl": 0.140869140625,
"learning_rate": 3.6124484045416483e-07,
"loss": 0.0001,
"reward": 1.3411458730697632,
"reward_std": 0.20541261043399572,
"rewards/equation_reward_func": 0.3515625118743628,
"rewards/format_reward_func": 0.9895833432674408,
"step": 168
},
{
"completion_length": 314.33594512939453,
"epoch": 0.09066666666666667,
"grad_norm": 0.09160402552556286,
"kl": 0.159759521484375,
"learning_rate": 3.580069966606949e-07,
"loss": 0.0002,
"reward": 1.4739583805203438,
"reward_std": 0.2342346585355699,
"rewards/equation_reward_func": 0.5000000114087015,
"rewards/format_reward_func": 0.9739583469927311,
"step": 170
},
{
"completion_length": 359.8906354904175,
"epoch": 0.09173333333333333,
"grad_norm": 0.09610423165466968,
"kl": 0.154388427734375,
"learning_rate": 3.547467227712444e-07,
"loss": 0.0002,
"reward": 1.437500037252903,
"reward_std": 0.1834291499108076,
"rewards/equation_reward_func": 0.45572918467223644,
"rewards/format_reward_func": 0.9817708469927311,
"step": 172
},
{
"completion_length": 365.7343854904175,
"epoch": 0.0928,
"grad_norm": 0.09889865100739882,
"kl": 0.15478515625,
"learning_rate": 3.5146469585543386e-07,
"loss": 0.0002,
"reward": 1.414062537252903,
"reward_std": 0.19458062946796417,
"rewards/equation_reward_func": 0.4322916797827929,
"rewards/format_reward_func": 0.9817708432674408,
"step": 174
},
{
"completion_length": 348.29427909851074,
"epoch": 0.09386666666666667,
"grad_norm": 0.09367098793216834,
"kl": 0.159515380859375,
"learning_rate": 3.481615975003922e-07,
"loss": 0.0002,
"reward": 1.4739583879709244,
"reward_std": 0.15797653933987021,
"rewards/equation_reward_func": 0.4921875149011612,
"rewards/format_reward_func": 0.9817708469927311,
"step": 176
},
{
"completion_length": 357.6145963668823,
"epoch": 0.09493333333333333,
"grad_norm": 0.07894542256229298,
"kl": 0.150299072265625,
"learning_rate": 3.448381136692089e-07,
"loss": 0.0002,
"reward": 1.4401042126119137,
"reward_std": 0.2548735234886408,
"rewards/equation_reward_func": 0.4765625074505806,
"rewards/format_reward_func": 0.9635416828095913,
"step": 178
},
{
"completion_length": 353.8281354904175,
"epoch": 0.096,
"grad_norm": 0.10120368862706956,
"kl": 0.1510009765625,
"learning_rate": 3.4149493455847897e-07,
"loss": 0.0002,
"reward": 1.377604216337204,
"reward_std": 0.17720257258042693,
"rewards/equation_reward_func": 0.3932291795499623,
"rewards/format_reward_func": 0.9843750074505806,
"step": 180
},
{
"completion_length": 337.3437547683716,
"epoch": 0.09706666666666666,
"grad_norm": 0.06857743257585827,
"kl": 0.171661376953125,
"learning_rate": 3.3813275445496766e-07,
"loss": 0.0002,
"reward": 1.3958333879709244,
"reward_std": 0.216899492777884,
"rewards/equation_reward_func": 0.4140625144354999,
"rewards/format_reward_func": 0.9817708469927311,
"step": 182
},
{
"completion_length": 373.3619899749756,
"epoch": 0.09813333333333334,
"grad_norm": 0.07039499292151902,
"kl": 0.185546875,
"learning_rate": 3.347522715914262e-07,
"loss": 0.0002,
"reward": 1.2838542088866234,
"reward_std": 0.14952099742367864,
"rewards/equation_reward_func": 0.29947917186655104,
"rewards/format_reward_func": 0.9843750037252903,
"step": 184
},
{
"completion_length": 343.94011783599854,
"epoch": 0.0992,
"grad_norm": 0.07557857715641425,
"kl": 0.172119140625,
"learning_rate": 3.313541880015877e-07,
"loss": 0.0002,
"reward": 1.3671875521540642,
"reward_std": 0.15858241729438305,
"rewards/equation_reward_func": 0.380208347691223,
"rewards/format_reward_func": 0.986979179084301,
"step": 186
},
{
"completion_length": 360.5390748977661,
"epoch": 0.10026666666666667,
"grad_norm": 0.11214755840839478,
"kl": 0.213623046875,
"learning_rate": 3.279392093743747e-07,
"loss": 0.0002,
"reward": 1.3880208730697632,
"reward_std": 0.19066602177917957,
"rewards/equation_reward_func": 0.4010416786186397,
"rewards/format_reward_func": 0.9869791753590107,
"step": 188
},
{
"completion_length": 322.72397232055664,
"epoch": 0.10133333333333333,
"grad_norm": 0.09281179127833081,
"kl": 0.2625732421875,
"learning_rate": 3.245080449073459e-07,
"loss": 0.0003,
"reward": 1.4557292088866234,
"reward_std": 0.21278624143451452,
"rewards/equation_reward_func": 0.4765625102445483,
"rewards/format_reward_func": 0.9791666753590107,
"step": 190
},
{
"completion_length": 337.25261306762695,
"epoch": 0.1024,
"grad_norm": 0.12372831662094742,
"kl": 0.18206787109375,
"learning_rate": 3.210614071594162e-07,
"loss": 0.0002,
"reward": 1.4218750447034836,
"reward_std": 0.21987988194450736,
"rewards/equation_reward_func": 0.440104179084301,
"rewards/format_reward_func": 0.9817708395421505,
"step": 192
},
{
"completion_length": 344.55729961395264,
"epoch": 0.10346666666666667,
"grad_norm": 0.06378441341807725,
"kl": 0.167724609375,
"learning_rate": 3.1760001190287695e-07,
"loss": 0.0002,
"reward": 1.351562537252903,
"reward_std": 0.14025551918894053,
"rewards/equation_reward_func": 0.36979167722165585,
"rewards/format_reward_func": 0.9817708432674408,
"step": 194
},
{
"completion_length": 314.03646659851074,
"epoch": 0.10453333333333334,
"grad_norm": 0.09003427985578723,
"kl": 0.17474365234375,
"learning_rate": 3.141245779747502e-07,
"loss": 0.0002,
"reward": 1.4479167237877846,
"reward_std": 0.2472039177082479,
"rewards/equation_reward_func": 0.46875001722946763,
"rewards/format_reward_func": 0.979166679084301,
"step": 196
},
{
"completion_length": 282.7083406448364,
"epoch": 0.1056,
"grad_norm": 0.1290749910466798,
"kl": 0.21856689453125,
"learning_rate": 3.106358271275056e-07,
"loss": 0.0002,
"reward": 1.476562537252903,
"reward_std": 0.1649267366155982,
"rewards/equation_reward_func": 0.4843750186264515,
"rewards/format_reward_func": 0.9921875074505806,
"step": 198
},
{
"completion_length": 276.01823806762695,
"epoch": 0.10666666666666667,
"grad_norm": 0.08477695235189277,
"kl": 0.1934814453125,
"learning_rate": 3.0713448387917227e-07,
"loss": 0.0002,
"reward": 1.5468750298023224,
"reward_std": 0.13912134431302547,
"rewards/equation_reward_func": 0.552083348389715,
"rewards/format_reward_func": 0.9947916716337204,
"step": 200
},
{
"completion_length": 317.55730152130127,
"epoch": 0.10773333333333333,
"grad_norm": 2.4369008488049477,
"kl": 5.66986083984375,
"learning_rate": 3.0362127536287636e-07,
"loss": 0.0057,
"reward": 1.421875037252903,
"reward_std": 0.16129080019891262,
"rewards/equation_reward_func": 0.4270833423361182,
"rewards/format_reward_func": 0.9947916679084301,
"step": 202
},
{
"completion_length": 321.65625762939453,
"epoch": 0.1088,
"grad_norm": 0.09396431699981035,
"kl": 0.18658447265625,
"learning_rate": 3.0009693117583523e-07,
"loss": 0.0002,
"reward": 1.4348958656191826,
"reward_std": 0.19856942351907492,
"rewards/equation_reward_func": 0.453125013737008,
"rewards/format_reward_func": 0.9817708469927311,
"step": 204
},
{
"completion_length": 330.83855056762695,
"epoch": 0.10986666666666667,
"grad_norm": 0.07687252722110068,
"kl": 0.1839599609375,
"learning_rate": 2.965621832278401e-07,
"loss": 0.0002,
"reward": 1.377604216337204,
"reward_std": 0.15589443547651172,
"rewards/equation_reward_func": 0.39583334885537624,
"rewards/format_reward_func": 0.9817708469927311,
"step": 206
},
{
"completion_length": 328.23438358306885,
"epoch": 0.11093333333333333,
"grad_norm": 0.0810844061250071,
"kl": 0.1962890625,
"learning_rate": 2.9301776558925875e-07,
"loss": 0.0002,
"reward": 1.3697917014360428,
"reward_std": 0.20208620419725776,
"rewards/equation_reward_func": 0.4036458465270698,
"rewards/format_reward_func": 0.9661458544433117,
"step": 208
},
{
"completion_length": 313.95313262939453,
"epoch": 0.112,
"grad_norm": 0.0851816615508796,
"kl": 0.23468017578125,
"learning_rate": 2.894644143385885e-07,
"loss": 0.0002,
"reward": 1.3958333767950535,
"reward_std": 0.18581857532262802,
"rewards/equation_reward_func": 0.4244791748933494,
"rewards/format_reward_func": 0.971354179084301,
"step": 210
},
{
"completion_length": 326.42448806762695,
"epoch": 0.11306666666666666,
"grad_norm": 0.0786620471083819,
"kl": 0.19378662109375,
"learning_rate": 2.859028674095937e-07,
"loss": 0.0002,
"reward": 1.4010417014360428,
"reward_std": 0.1981433075852692,
"rewards/equation_reward_func": 0.42708334513008595,
"rewards/format_reward_func": 0.9739583432674408,
"step": 212
},
{
"completion_length": 308.4192781448364,
"epoch": 0.11413333333333334,
"grad_norm": 0.08352066179848143,
"kl": 0.189453125,
"learning_rate": 2.823338644380566e-07,
"loss": 0.0002,
"reward": 1.4401042126119137,
"reward_std": 0.2307603359222412,
"rewards/equation_reward_func": 0.47135418048128486,
"rewards/format_reward_func": 0.9687500149011612,
"step": 214
},
{
"completion_length": 341.86198711395264,
"epoch": 0.1152,
"grad_norm": 0.10334760188864624,
"kl": 0.22479248046875,
"learning_rate": 2.7875814660817504e-07,
"loss": 0.0002,
"reward": 1.3880208693444729,
"reward_std": 0.2630339222960174,
"rewards/equation_reward_func": 0.4244791779201478,
"rewards/format_reward_func": 0.9635416828095913,
"step": 216
},
{
"completion_length": 390.3906364440918,
"epoch": 0.11626666666666667,
"grad_norm": 0.1090470945421399,
"kl": 0.2252197265625,
"learning_rate": 2.751764564986396e-07,
"loss": 0.0002,
"reward": 1.223958358168602,
"reward_std": 0.23174711503088474,
"rewards/equation_reward_func": 0.27864584047347307,
"rewards/format_reward_func": 0.9453125223517418,
"step": 218
},
{
"completion_length": 329.63282585144043,
"epoch": 0.11733333333333333,
"grad_norm": 0.10077949546695844,
"kl": 0.2005615234375,
"learning_rate": 2.715895379284194e-07,
"loss": 0.0002,
"reward": 1.3958333730697632,
"reward_std": 0.26168868225067854,
"rewards/equation_reward_func": 0.4427083469927311,
"rewards/format_reward_func": 0.9531250223517418,
"step": 220
},
{
"completion_length": 358.4375104904175,
"epoch": 0.1184,
"grad_norm": 0.08964140632655672,
"kl": 0.21527099609375,
"learning_rate": 2.6799813580229174e-07,
"loss": 0.0002,
"reward": 1.3593750447034836,
"reward_std": 0.25906086526811123,
"rewards/equation_reward_func": 0.4036458386108279,
"rewards/format_reward_func": 0.9557291902601719,
"step": 222
},
{
"completion_length": 343.70834255218506,
"epoch": 0.11946666666666667,
"grad_norm": 0.07620045097589506,
"kl": 0.19964599609375,
"learning_rate": 2.6440299595614606e-07,
"loss": 0.0002,
"reward": 1.3307291939854622,
"reward_std": 0.2277261232957244,
"rewards/equation_reward_func": 0.3697916753590107,
"rewards/format_reward_func": 0.9609375223517418,
"step": 224
},
{
"completion_length": 343.32032108306885,
"epoch": 0.12053333333333334,
"grad_norm": 0.08700892029776192,
"kl": 0.2109375,
"learning_rate": 2.6080486500209347e-07,
"loss": 0.0002,
"reward": 1.3541667014360428,
"reward_std": 0.21279292972758412,
"rewards/equation_reward_func": 0.40104167512618005,
"rewards/format_reward_func": 0.9531250149011612,
"step": 226
},
{
"completion_length": 275.88542556762695,
"epoch": 0.1216,
"grad_norm": 0.11035562445594559,
"kl": 0.202880859375,
"learning_rate": 2.572044901734166e-07,
"loss": 0.0002,
"reward": 1.5833333730697632,
"reward_std": 0.25712650874629617,
"rewards/equation_reward_func": 0.6197916865348816,
"rewards/format_reward_func": 0.9635416716337204,
"step": 228
},
{
"completion_length": 314.13542318344116,
"epoch": 0.12266666666666666,
"grad_norm": 0.10200349640738855,
"kl": 0.21881103515625,
"learning_rate": 2.536026191693893e-07,
"loss": 0.0002,
"reward": 1.4505208693444729,
"reward_std": 0.29838538402691483,
"rewards/equation_reward_func": 0.5156250093132257,
"rewards/format_reward_func": 0.9348958544433117,
"step": 230
},
{
"completion_length": 334.46615505218506,
"epoch": 0.12373333333333333,
"grad_norm": 0.104610809797409,
"kl": 0.20111083984375,
"learning_rate": 2.5e-07,
"loss": 0.0002,
"reward": 1.4479166939854622,
"reward_std": 0.32854113075882196,
"rewards/equation_reward_func": 0.4921875128056854,
"rewards/format_reward_func": 0.9557291902601719,
"step": 232
},
{
"completion_length": 348.3463592529297,
"epoch": 0.1248,
"grad_norm": 0.11970668433207705,
"kl": 0.21832275390625,
"learning_rate": 2.4639738083061073e-07,
"loss": 0.0002,
"reward": 1.2968750409781933,
"reward_std": 0.28170605981722474,
"rewards/equation_reward_func": 0.35937500884756446,
"rewards/format_reward_func": 0.9375000149011612,
"step": 234
},
{
"completion_length": 353.0625104904175,
"epoch": 0.12586666666666665,
"grad_norm": 0.11146940462264297,
"kl": 0.2242431640625,
"learning_rate": 2.4279550982658345e-07,
"loss": 0.0002,
"reward": 1.283854205161333,
"reward_std": 0.2352255848236382,
"rewards/equation_reward_func": 0.3359375107102096,
"rewards/format_reward_func": 0.9479166828095913,
"step": 236
},
{
"completion_length": 328.27865982055664,
"epoch": 0.12693333333333334,
"grad_norm": 0.09019157224178884,
"kl": 0.2286376953125,
"learning_rate": 2.3919513499790646e-07,
"loss": 0.0002,
"reward": 1.4036458767950535,
"reward_std": 0.2419091323390603,
"rewards/equation_reward_func": 0.4557291849050671,
"rewards/format_reward_func": 0.947916679084301,
"step": 238
},
{
"completion_length": 292.41146755218506,
"epoch": 0.128,
"grad_norm": 0.11216015195235872,
"kl": 0.20770263671875,
"learning_rate": 2.3559700404385394e-07,
"loss": 0.0002,
"reward": 1.4218750521540642,
"reward_std": 0.214123603887856,
"rewards/equation_reward_func": 0.45833334303461015,
"rewards/format_reward_func": 0.963541679084301,
"step": 240
},
{
"completion_length": 289.44011306762695,
"epoch": 0.12906666666666666,
"grad_norm": 0.06748907528166415,
"kl": 0.21905517578125,
"learning_rate": 2.3200186419770823e-07,
"loss": 0.0002,
"reward": 1.4973958730697632,
"reward_std": 0.21762575302273035,
"rewards/equation_reward_func": 0.5390625144354999,
"rewards/format_reward_func": 0.9583333544433117,
"step": 242
},
{
"completion_length": 312.97396659851074,
"epoch": 0.13013333333333332,
"grad_norm": 0.08585761519803439,
"kl": 0.22705078125,
"learning_rate": 2.284104620715807e-07,
"loss": 0.0002,
"reward": 1.382812537252903,
"reward_std": 0.2223974741064012,
"rewards/equation_reward_func": 0.4218750149011612,
"rewards/format_reward_func": 0.9609375186264515,
"step": 244
},
{
"completion_length": 281.0208377838135,
"epoch": 0.1312,
"grad_norm": 0.09243139356469632,
"kl": 0.22540283203125,
"learning_rate": 2.2482354350136043e-07,
"loss": 0.0002,
"reward": 1.4947917088866234,
"reward_std": 0.23069008206948638,
"rewards/equation_reward_func": 0.5208333479240537,
"rewards/format_reward_func": 0.9739583469927311,
"step": 246
},
{
"completion_length": 301.8541736602783,
"epoch": 0.13226666666666667,
"grad_norm": 0.09610646803963738,
"kl": 0.22479248046875,
"learning_rate": 2.2124185339182496e-07,
"loss": 0.0002,
"reward": 1.3802083730697632,
"reward_std": 0.1815217286348343,
"rewards/equation_reward_func": 0.4270833432674408,
"rewards/format_reward_func": 0.9531250186264515,
"step": 248
},
{
"completion_length": 347.1015729904175,
"epoch": 0.13333333333333333,
"grad_norm": 0.1746681094283612,
"kl": 0.23883056640625,
"learning_rate": 2.1766613556194344e-07,
"loss": 0.0002,
"reward": 1.2213542088866234,
"reward_std": 0.22283816616982222,
"rewards/equation_reward_func": 0.26041667559184134,
"rewards/format_reward_func": 0.9609375149011612,
"step": 250
},
{
"completion_length": 293.10677909851074,
"epoch": 0.1344,
"grad_norm": 0.07969455343084161,
"kl": 0.305419921875,
"learning_rate": 2.1409713259040628e-07,
"loss": 0.0003,
"reward": 1.4114583730697632,
"reward_std": 0.2006126595661044,
"rewards/equation_reward_func": 0.432291679084301,
"rewards/format_reward_func": 0.9791666753590107,
"step": 252
},
{
"completion_length": 295.825532913208,
"epoch": 0.13546666666666668,
"grad_norm": 0.11043203499359036,
"kl": 0.215087890625,
"learning_rate": 2.105355856614115e-07,
"loss": 0.0002,
"reward": 1.4114583730697632,
"reward_std": 0.3007270940579474,
"rewards/equation_reward_func": 0.46354168374091387,
"rewards/format_reward_func": 0.9479166828095913,
"step": 254
},
{
"completion_length": 295.0286531448364,
"epoch": 0.13653333333333334,
"grad_norm": 0.10604018583177363,
"kl": 0.2293701171875,
"learning_rate": 2.069822344107413e-07,
"loss": 0.0002,
"reward": 1.4401042088866234,
"reward_std": 0.16259960131719708,
"rewards/equation_reward_func": 0.46875001303851604,
"rewards/format_reward_func": 0.9713541828095913,
"step": 256
},
{
"completion_length": 312.09897232055664,
"epoch": 0.1376,
"grad_norm": 0.11581309250324548,
"kl": 0.22454833984375,
"learning_rate": 2.034378167721599e-07,
"loss": 0.0002,
"reward": 1.3411458618938923,
"reward_std": 0.31250663055107,
"rewards/equation_reward_func": 0.39322918001562357,
"rewards/format_reward_func": 0.9479166939854622,
"step": 258
},
{
"completion_length": 301.36980152130127,
"epoch": 0.13866666666666666,
"grad_norm": 0.10375800085599599,
"kl": 0.24493408203125,
"learning_rate": 1.9990306882416485e-07,
"loss": 0.0002,
"reward": 1.4635416977107525,
"reward_std": 0.2693312247283757,
"rewards/equation_reward_func": 0.5104166809469461,
"rewards/format_reward_func": 0.9531250149011612,
"step": 260
},
{
"completion_length": 314.0885524749756,
"epoch": 0.13973333333333332,
"grad_norm": 0.1087966329523751,
"kl": 0.222900390625,
"learning_rate": 1.9637872463712362e-07,
"loss": 0.0002,
"reward": 1.4062500409781933,
"reward_std": 0.26262100599706173,
"rewards/equation_reward_func": 0.44270834792405367,
"rewards/format_reward_func": 0.963541679084301,
"step": 262
},
{
"completion_length": 281.7239646911621,
"epoch": 0.1408,
"grad_norm": 0.09695420136164315,
"kl": 0.26971435546875,
"learning_rate": 1.9286551612082773e-07,
"loss": 0.0003,
"reward": 1.4479167088866234,
"reward_std": 0.2460037199780345,
"rewards/equation_reward_func": 0.48697918094694614,
"rewards/format_reward_func": 0.9609375149011612,
"step": 264
},
{
"completion_length": 315.37240505218506,
"epoch": 0.14186666666666667,
"grad_norm": 0.10037156483806228,
"kl": 0.24798583984375,
"learning_rate": 1.8936417287249446e-07,
"loss": 0.0002,
"reward": 1.3385417088866234,
"reward_std": 0.2581388554535806,
"rewards/equation_reward_func": 0.39583334466442466,
"rewards/format_reward_func": 0.9427083469927311,
"step": 266
},
{
"completion_length": 334.924485206604,
"epoch": 0.14293333333333333,
"grad_norm": 0.15279355937220046,
"kl": 0.26873779296875,
"learning_rate": 1.8587542202524985e-07,
"loss": 0.0003,
"reward": 1.268229205161333,
"reward_std": 0.28603212209418416,
"rewards/equation_reward_func": 0.3177083428017795,
"rewards/format_reward_func": 0.9505208544433117,
"step": 268
},
{
"completion_length": 292.8906297683716,
"epoch": 0.144,
"grad_norm": 0.09627939797808117,
"kl": 0.25811767578125,
"learning_rate": 1.82399988097123e-07,
"loss": 0.0003,
"reward": 1.3828125335276127,
"reward_std": 0.240143911447376,
"rewards/equation_reward_func": 0.4479166786186397,
"rewards/format_reward_func": 0.9348958544433117,
"step": 270
},
{
"completion_length": 303.7291774749756,
"epoch": 0.14506666666666668,
"grad_norm": 0.09414307623625273,
"kl": 0.25408935546875,
"learning_rate": 1.7893859284058378e-07,
"loss": 0.0003,
"reward": 1.3671875298023224,
"reward_std": 0.24746731435880065,
"rewards/equation_reward_func": 0.4218750111758709,
"rewards/format_reward_func": 0.9453125149011612,
"step": 272
},
{
"completion_length": 276.94792318344116,
"epoch": 0.14613333333333334,
"grad_norm": 0.1299701036522939,
"kl": 0.57366943359375,
"learning_rate": 1.7549195509265407e-07,
"loss": 0.0006,
"reward": 1.4348958730697632,
"reward_std": 0.2572689475491643,
"rewards/equation_reward_func": 0.4791666748933494,
"rewards/format_reward_func": 0.9557291902601719,
"step": 274
},
{
"completion_length": 248.166672706604,
"epoch": 0.1472,
"grad_norm": 0.08206460484425186,
"kl": 0.256103515625,
"learning_rate": 1.7206079062562536e-07,
"loss": 0.0003,
"reward": 1.5833333656191826,
"reward_std": 0.21109008882194757,
"rewards/equation_reward_func": 0.6145833458285779,
"rewards/format_reward_func": 0.9687500260770321,
"step": 276
},
{
"completion_length": 305.27865409851074,
"epoch": 0.14826666666666666,
"grad_norm": 0.10621644156716899,
"kl": 0.2762451171875,
"learning_rate": 1.6864581199841226e-07,
"loss": 0.0003,
"reward": 1.312500026077032,
"reward_std": 0.24705103458836675,
"rewards/equation_reward_func": 0.36718751094304025,
"rewards/format_reward_func": 0.9453125186264515,
"step": 278
},
{
"completion_length": 286.6406297683716,
"epoch": 0.14933333333333335,
"grad_norm": 0.10751127049009096,
"kl": 0.26580810546875,
"learning_rate": 1.6524772840857388e-07,
"loss": 0.0003,
"reward": 1.3072916977107525,
"reward_std": 0.2637113491073251,
"rewards/equation_reward_func": 0.38281250768341124,
"rewards/format_reward_func": 0.9244791865348816,
"step": 280
},
{
"completion_length": 275.32552909851074,
"epoch": 0.1504,
"grad_norm": 0.10203495847611208,
"kl": 0.29620361328125,
"learning_rate": 1.6186724554503237e-07,
"loss": 0.0003,
"reward": 1.4687500409781933,
"reward_std": 0.23805115604773164,
"rewards/equation_reward_func": 0.5156250128056854,
"rewards/format_reward_func": 0.9531250186264515,
"step": 282
},
{
"completion_length": 312.76823806762695,
"epoch": 0.15146666666666667,
"grad_norm": 0.09010102560559675,
"kl": 0.26605224609375,
"learning_rate": 1.5850506544152103e-07,
"loss": 0.0003,
"reward": 1.2786458618938923,
"reward_std": 0.27972705382853746,
"rewards/equation_reward_func": 0.35156250977888703,
"rewards/format_reward_func": 0.9270833432674408,
"step": 284
},
{
"completion_length": 263.97916984558105,
"epoch": 0.15253333333333333,
"grad_norm": 0.09699956880184334,
"kl": 0.271728515625,
"learning_rate": 1.5516188633079107e-07,
"loss": 0.0003,
"reward": 1.4088542088866234,
"reward_std": 0.21715012891218066,
"rewards/equation_reward_func": 0.432291679084301,
"rewards/format_reward_func": 0.9765625186264515,
"step": 286
},
{
"completion_length": 284.93750762939453,
"epoch": 0.1536,
"grad_norm": 0.13730205530993134,
"kl": 0.26202392578125,
"learning_rate": 1.5183840249960784e-07,
"loss": 0.0003,
"reward": 1.2916666977107525,
"reward_std": 0.2690475699491799,
"rewards/equation_reward_func": 0.3411458432674408,
"rewards/format_reward_func": 0.9505208507180214,
"step": 288
},
{
"completion_length": 313.8724036216736,
"epoch": 0.15466666666666667,
"grad_norm": 0.10982987970993405,
"kl": 0.25787353515625,
"learning_rate": 1.4853530414456612e-07,
"loss": 0.0003,
"reward": 1.3359375447034836,
"reward_std": 0.28103851480409503,
"rewards/equation_reward_func": 0.38020834419876337,
"rewards/format_reward_func": 0.9557291865348816,
"step": 290
},
{
"completion_length": 280.13021659851074,
"epoch": 0.15573333333333333,
"grad_norm": 0.10569696273751499,
"kl": 0.2752685546875,
"learning_rate": 1.4525327722875568e-07,
"loss": 0.0003,
"reward": 1.3723958730697632,
"reward_std": 0.253665282856673,
"rewards/equation_reward_func": 0.4270833453629166,
"rewards/format_reward_func": 0.9453125223517418,
"step": 292
},
{
"completion_length": 266.9010486602783,
"epoch": 0.1568,
"grad_norm": 0.1273947740183966,
"kl": 0.2657470703125,
"learning_rate": 1.4199300333930515e-07,
"loss": 0.0003,
"reward": 1.4635417088866234,
"reward_std": 0.28517728950828314,
"rewards/equation_reward_func": 0.5026041837409139,
"rewards/format_reward_func": 0.9609375111758709,
"step": 294
},
{
"completion_length": 328.830735206604,
"epoch": 0.15786666666666666,
"grad_norm": 0.1699855426323704,
"kl": 0.2620849609375,
"learning_rate": 1.3875515954583523e-07,
"loss": 0.0003,
"reward": 1.2187500447034836,
"reward_std": 0.3317327341064811,
"rewards/equation_reward_func": 0.2942708428017795,
"rewards/format_reward_func": 0.9244791902601719,
"step": 296
},
{
"completion_length": 329.12240982055664,
"epoch": 0.15893333333333334,
"grad_norm": 0.14001227147909825,
"kl": 0.27099609375,
"learning_rate": 1.3554041825985e-07,
"loss": 0.0003,
"reward": 1.1979167014360428,
"reward_std": 0.2845407989807427,
"rewards/equation_reward_func": 0.28125000931322575,
"rewards/format_reward_func": 0.9166666865348816,
"step": 298
},
{
"completion_length": 283.0989661216736,
"epoch": 0.16,
"grad_norm": 0.10223346879835553,
"kl": 0.24761962890625,
"learning_rate": 1.323494470950949e-07,
"loss": 0.0002,
"reward": 1.429687537252903,
"reward_std": 0.26960491156205535,
"rewards/equation_reward_func": 0.47135418094694614,
"rewards/format_reward_func": 0.9583333544433117,
"step": 300
},
{
"completion_length": 254.60156726837158,
"epoch": 0.16106666666666666,
"grad_norm": 0.08918786164304986,
"kl": 0.260986328125,
"learning_rate": 1.2918290872891236e-07,
"loss": 0.0003,
"reward": 1.4348958805203438,
"reward_std": 0.15168809751048684,
"rewards/equation_reward_func": 0.458333347691223,
"rewards/format_reward_func": 0.9765625111758709,
"step": 302
},
{
"completion_length": 280.38021516799927,
"epoch": 0.16213333333333332,
"grad_norm": 0.10981016883182508,
"kl": 0.26275634765625,
"learning_rate": 1.260414607646213e-07,
"loss": 0.0003,
"reward": 1.3880208618938923,
"reward_std": 0.2798879165202379,
"rewards/equation_reward_func": 0.42708334303461015,
"rewards/format_reward_func": 0.9609375111758709,
"step": 304
},
{
"completion_length": 236.63802528381348,
"epoch": 0.1632,
"grad_norm": 0.1126860308935798,
"kl": 0.24639892578125,
"learning_rate": 1.2292575559495143e-07,
"loss": 0.0002,
"reward": 1.5338541939854622,
"reward_std": 0.21581484470516443,
"rewards/equation_reward_func": 0.5598958488553762,
"rewards/format_reward_func": 0.9739583507180214,
"step": 306
},
{
"completion_length": 274.51823902130127,
"epoch": 0.16426666666666667,
"grad_norm": 0.11433058952931557,
"kl": 0.2418212890625,
"learning_rate": 1.1983644026655835e-07,
"loss": 0.0002,
"reward": 1.3984375298023224,
"reward_std": 0.2787149855867028,
"rewards/equation_reward_func": 0.4505208458285779,
"rewards/format_reward_func": 0.9479166828095913,
"step": 308
},
{
"completion_length": 269.51303005218506,
"epoch": 0.16533333333333333,
"grad_norm": 0.10900628538932935,
"kl": 0.2515869140625,
"learning_rate": 1.1677415634565066e-07,
"loss": 0.0003,
"reward": 1.4531250298023224,
"reward_std": 0.21808092296123505,
"rewards/equation_reward_func": 0.5026041760575026,
"rewards/format_reward_func": 0.950520858168602,
"step": 310
},
{
"completion_length": 276.29948711395264,
"epoch": 0.1664,
"grad_norm": 0.10471445766441949,
"kl": 0.24822998046875,
"learning_rate": 1.1373953978475353e-07,
"loss": 0.0002,
"reward": 1.4088542014360428,
"reward_std": 0.2563867177814245,
"rewards/equation_reward_func": 0.4557291786186397,
"rewards/format_reward_func": 0.9531250260770321,
"step": 312
},
{
"completion_length": 281.54167652130127,
"epoch": 0.16746666666666668,
"grad_norm": 0.11476171924959432,
"kl": 0.25030517578125,
"learning_rate": 1.1073322079063913e-07,
"loss": 0.0003,
"reward": 1.419270884245634,
"reward_std": 0.2665014350786805,
"rewards/equation_reward_func": 0.46354168374091387,
"rewards/format_reward_func": 0.9557291865348816,
"step": 314
},
{
"completion_length": 282.43490409851074,
"epoch": 0.16853333333333334,
"grad_norm": 0.0841971248428421,
"kl": 0.21875,
"learning_rate": 1.0775582369344946e-07,
"loss": 0.0002,
"reward": 1.424479216337204,
"reward_std": 0.2608643379062414,
"rewards/equation_reward_func": 0.46875001303851604,
"rewards/format_reward_func": 0.9557291902601719,
"step": 316
},
{
"completion_length": 273.4349060058594,
"epoch": 0.1696,
"grad_norm": 0.09515899802774246,
"kl": 0.24407958984375,
"learning_rate": 1.0480796681704077e-07,
"loss": 0.0002,
"reward": 1.4010417088866234,
"reward_std": 0.2546477783471346,
"rewards/equation_reward_func": 0.4453125174622983,
"rewards/format_reward_func": 0.9557291865348816,
"step": 318
},
{
"completion_length": 316.6458435058594,
"epoch": 0.17066666666666666,
"grad_norm": 0.099987410497596,
"kl": 0.23638916015625,
"learning_rate": 1.018902623505741e-07,
"loss": 0.0002,
"reward": 1.2942708656191826,
"reward_std": 0.29723000014200807,
"rewards/equation_reward_func": 0.3671875111758709,
"rewards/format_reward_func": 0.927083358168602,
"step": 320
},
{
"completion_length": 288.893235206604,
"epoch": 0.17173333333333332,
"grad_norm": 0.11246455050265577,
"kl": 0.23480224609375,
"learning_rate": 9.900331622138063e-08,
"loss": 0.0002,
"reward": 1.3723958730697632,
"reward_std": 0.289981079287827,
"rewards/equation_reward_func": 0.4114583421032876,
"rewards/format_reward_func": 0.9609375149011612,
"step": 322
},
{
"completion_length": 264.9921979904175,
"epoch": 0.1728,
"grad_norm": 0.10025221120521255,
"kl": 0.24884033203125,
"learning_rate": 9.614772796912681e-08,
"loss": 0.0002,
"reward": 1.398437537252903,
"reward_std": 0.21295037120580673,
"rewards/equation_reward_func": 0.4296875102445483,
"rewards/format_reward_func": 0.9687500111758709,
"step": 324
},
{
"completion_length": 283.9114661216736,
"epoch": 0.17386666666666667,
"grad_norm": 0.08393060980669469,
"kl": 0.2662353515625,
"learning_rate": 9.332409062130686e-08,
"loss": 0.0003,
"reward": 1.3046875298023224,
"reward_std": 0.211736383382231,
"rewards/equation_reward_func": 0.3437500102445483,
"rewards/format_reward_func": 0.9609375149011612,
"step": 326
},
{
"completion_length": 280.97657108306885,
"epoch": 0.17493333333333333,
"grad_norm": 0.09266235555090595,
"kl": 0.26544189453125,
"learning_rate": 9.053299057008699e-08,
"loss": 0.0003,
"reward": 1.3619792014360428,
"reward_std": 0.18739549908787012,
"rewards/equation_reward_func": 0.40364584513008595,
"rewards/format_reward_func": 0.9583333544433117,
"step": 328
},
{
"completion_length": 266.9140729904175,
"epoch": 0.176,
"grad_norm": 0.17475099073751835,
"kl": 0.24237060546875,
"learning_rate": 8.777500745052743e-08,
"loss": 0.0002,
"reward": 1.4192708879709244,
"reward_std": 0.2251653028652072,
"rewards/equation_reward_func": 0.45312501629814506,
"rewards/format_reward_func": 0.9661458544433117,
"step": 330
},
{
"completion_length": 281.2734489440918,
"epoch": 0.17706666666666668,
"grad_norm": 0.11185068411943261,
"kl": 0.24456787109375,
"learning_rate": 8.505071402020892e-08,
"loss": 0.0002,
"reward": 1.393229216337204,
"reward_std": 0.2644071178510785,
"rewards/equation_reward_func": 0.4453125111758709,
"rewards/format_reward_func": 0.947916679084301,
"step": 332
},
{
"completion_length": 283.9192819595337,
"epoch": 0.17813333333333334,
"grad_norm": 0.14116520705594282,
"kl": 0.2410888671875,
"learning_rate": 8.236067604028562e-08,
"loss": 0.0002,
"reward": 1.3723958656191826,
"reward_std": 0.2818891149945557,
"rewards/equation_reward_func": 0.41406250931322575,
"rewards/format_reward_func": 0.9583333544433117,
"step": 334
},
{
"completion_length": 260.65104579925537,
"epoch": 0.1792,
"grad_norm": 0.1336225513443869,
"kl": 0.239501953125,
"learning_rate": 7.970545215799327e-08,
"loss": 0.0002,
"reward": 1.4869791939854622,
"reward_std": 0.28690007980912924,
"rewards/equation_reward_func": 0.5390625186264515,
"rewards/format_reward_func": 0.9479166902601719,
"step": 336
},
{
"completion_length": 248.3099012374878,
"epoch": 0.18026666666666666,
"grad_norm": 0.09856720056681173,
"kl": 0.23907470703125,
"learning_rate": 7.708559379063204e-08,
"loss": 0.0002,
"reward": 1.4817708656191826,
"reward_std": 0.23133338056504726,
"rewards/equation_reward_func": 0.5260416767559946,
"rewards/format_reward_func": 0.9557291902601719,
"step": 338
},
{
"completion_length": 290.5052146911621,
"epoch": 0.18133333333333335,
"grad_norm": 0.1122615481772805,
"kl": 0.24444580078125,
"learning_rate": 7.45016450110534e-08,
"loss": 0.0002,
"reward": 1.2838542014360428,
"reward_std": 0.2104581743478775,
"rewards/equation_reward_func": 0.32291667233221233,
"rewards/format_reward_func": 0.9609375186264515,
"step": 340
},
{
"completion_length": 284.09896516799927,
"epoch": 0.1824,
"grad_norm": 0.09965326339693975,
"kl": 0.2493896484375,
"learning_rate": 7.195414243467029e-08,
"loss": 0.0002,
"reward": 1.3906250335276127,
"reward_std": 0.27104497281834483,
"rewards/equation_reward_func": 0.4401041779201478,
"rewards/format_reward_func": 0.9505208469927311,
"step": 342
},
{
"completion_length": 282.057297706604,
"epoch": 0.18346666666666667,
"grad_norm": 0.08762325381098879,
"kl": 0.32757568359375,
"learning_rate": 6.944361510801763e-08,
"loss": 0.0003,
"reward": 1.2994792014360428,
"reward_std": 0.23241478390991688,
"rewards/equation_reward_func": 0.3385416711680591,
"rewards/format_reward_func": 0.9609375186264515,
"step": 344
},
{
"completion_length": 279.9739685058594,
"epoch": 0.18453333333333333,
"grad_norm": 0.14683183029957406,
"kl": 0.6046142578125,
"learning_rate": 6.697058439888283e-08,
"loss": 0.0006,
"reward": 1.3697917088866234,
"reward_std": 0.26870738714933395,
"rewards/equation_reward_func": 0.41406251629814506,
"rewards/format_reward_func": 0.955729179084301,
"step": 346
},
{
"completion_length": 270.08594703674316,
"epoch": 0.1856,
"grad_norm": 0.12276857645312758,
"kl": 0.24163818359375,
"learning_rate": 6.453556388803288e-08,
"loss": 0.0002,
"reward": 1.4062500484287739,
"reward_std": 0.28894974663853645,
"rewards/equation_reward_func": 0.4557291795499623,
"rewards/format_reward_func": 0.9505208544433117,
"step": 348
},
{
"completion_length": 256.07552909851074,
"epoch": 0.18666666666666668,
"grad_norm": 0.1261473193256241,
"kl": 0.29376220703125,
"learning_rate": 6.213905926255697e-08,
"loss": 0.0003,
"reward": 1.4479167014360428,
"reward_std": 0.250754666980356,
"rewards/equation_reward_func": 0.5078125149011612,
"rewards/format_reward_func": 0.9401041902601719,
"step": 350
},
{
"completion_length": 249.1354274749756,
"epoch": 0.18773333333333334,
"grad_norm": 0.08506597582252638,
"kl": 0.244384765625,
"learning_rate": 5.978156821084987e-08,
"loss": 0.0002,
"reward": 1.432291705161333,
"reward_std": 0.19336163811385632,
"rewards/equation_reward_func": 0.47916667349636555,
"rewards/format_reward_func": 0.9531250260770321,
"step": 352
},
{
"completion_length": 277.8489685058594,
"epoch": 0.1888,
"grad_norm": 0.12037895470125451,
"kl": 0.23748779296875,
"learning_rate": 5.7463580319254853e-08,
"loss": 0.0002,
"reward": 1.3437500521540642,
"reward_std": 0.2519768704660237,
"rewards/equation_reward_func": 0.4114583432674408,
"rewards/format_reward_func": 0.9322916865348816,
"step": 354
},
{
"completion_length": 252.59375858306885,
"epoch": 0.18986666666666666,
"grad_norm": 0.11733297431372698,
"kl": 0.239013671875,
"learning_rate": 5.518557697039081e-08,
"loss": 0.0002,
"reward": 1.4557292014360428,
"reward_std": 0.2128398958593607,
"rewards/equation_reward_func": 0.4921875107102096,
"rewards/format_reward_func": 0.9635416828095913,
"step": 356
},
{
"completion_length": 252.0677137374878,
"epoch": 0.19093333333333334,
"grad_norm": 0.08775856965094549,
"kl": 2.04522705078125,
"learning_rate": 5.294803124318145e-08,
"loss": 0.0021,
"reward": 1.5104167088866234,
"reward_std": 0.2261988613754511,
"rewards/equation_reward_func": 0.5338541800156236,
"rewards/format_reward_func": 0.9765625149011612,
"step": 358
},
{
"completion_length": 266.70313262939453,
"epoch": 0.192,
"grad_norm": 0.10933086508784831,
"kl": 0.2430419921875,
"learning_rate": 5.07514078146106e-08,
"loss": 0.0002,
"reward": 1.3984375409781933,
"reward_std": 0.22465246403589845,
"rewards/equation_reward_func": 0.4401041786186397,
"rewards/format_reward_func": 0.9583333507180214,
"step": 360
},
{
"completion_length": 244.33073902130127,
"epoch": 0.19306666666666666,
"grad_norm": 0.09058401208636457,
"kl": 0.2347412109375,
"learning_rate": 4.859616286322094e-08,
"loss": 0.0002,
"reward": 1.4895833730697632,
"reward_std": 0.20016511622816324,
"rewards/equation_reward_func": 0.5260416809469461,
"rewards/format_reward_func": 0.9635416828095913,
"step": 362
},
{
"completion_length": 254.1927146911621,
"epoch": 0.19413333333333332,
"grad_norm": 0.06760472710437652,
"kl": 0.24163818359375,
"learning_rate": 4.648274397437829e-08,
"loss": 0.0002,
"reward": 1.416666705161333,
"reward_std": 0.1794181428849697,
"rewards/equation_reward_func": 0.447916679084301,
"rewards/format_reward_func": 0.9687500186264515,
"step": 364
},
{
"completion_length": 265.5052156448364,
"epoch": 0.1952,
"grad_norm": 0.09934227406541099,
"kl": 0.24114990234375,
"learning_rate": 4.4411590047320617e-08,
"loss": 0.0002,
"reward": 1.437500037252903,
"reward_std": 0.23727863328531384,
"rewards/equation_reward_func": 0.47916667722165585,
"rewards/format_reward_func": 0.9583333507180214,
"step": 366
},
{
"completion_length": 287.24219131469727,
"epoch": 0.19626666666666667,
"grad_norm": 0.08869368411582416,
"kl": 0.2509765625,
"learning_rate": 4.2383131204010494e-08,
"loss": 0.0003,
"reward": 1.3229166977107525,
"reward_std": 0.2696537869051099,
"rewards/equation_reward_func": 0.36718750558793545,
"rewards/format_reward_func": 0.9557291828095913,
"step": 368
},
{
"completion_length": 250.90886116027832,
"epoch": 0.19733333333333333,
"grad_norm": 0.11141469624967881,
"kl": 0.2425537109375,
"learning_rate": 4.039778869981064e-08,
"loss": 0.0002,
"reward": 1.408854205161333,
"reward_std": 0.2594145955517888,
"rewards/equation_reward_func": 0.45572917349636555,
"rewards/format_reward_func": 0.9531250223517418,
"step": 370
},
{
"completion_length": 288.8619861602783,
"epoch": 0.1984,
"grad_norm": 0.09328173881518842,
"kl": 0.260009765625,
"learning_rate": 3.845597483600049e-08,
"loss": 0.0003,
"reward": 1.2708333618938923,
"reward_std": 0.24974829843267798,
"rewards/equation_reward_func": 0.3281250046566129,
"rewards/format_reward_func": 0.942708358168602,
"step": 372
},
{
"completion_length": 257.04167652130127,
"epoch": 0.19946666666666665,
"grad_norm": 0.11987502766296552,
"kl": 0.26214599609375,
"learning_rate": 3.655809287415284e-08,
"loss": 0.0003,
"reward": 1.4140625521540642,
"reward_std": 0.23185446253046393,
"rewards/equation_reward_func": 0.45572918001562357,
"rewards/format_reward_func": 0.9583333544433117,
"step": 374
},
{
"completion_length": 246.8671932220459,
"epoch": 0.20053333333333334,
"grad_norm": 0.07938676127449044,
"kl": 0.2530517578125,
"learning_rate": 3.4704536952387285e-08,
"loss": 0.0003,
"reward": 1.4531250298023224,
"reward_std": 0.2485762145370245,
"rewards/equation_reward_func": 0.4843750090803951,
"rewards/format_reward_func": 0.9687500186264515,
"step": 376
},
{
"completion_length": 271.40625762939453,
"epoch": 0.2016,
"grad_norm": 0.10397425885690677,
"kl": 0.253662109375,
"learning_rate": 3.2895692003518575e-08,
"loss": 0.0003,
"reward": 1.372395884245634,
"reward_std": 0.2290022149682045,
"rewards/equation_reward_func": 0.419270847691223,
"rewards/format_reward_func": 0.9531250149011612,
"step": 378
},
{
"completion_length": 277.70313262939453,
"epoch": 0.20266666666666666,
"grad_norm": 0.11176010513775461,
"kl": 0.2552490234375,
"learning_rate": 3.113193367511635e-08,
"loss": 0.0003,
"reward": 1.3489583693444729,
"reward_std": 0.3045574314892292,
"rewards/equation_reward_func": 0.4114583481568843,
"rewards/format_reward_func": 0.9375000186264515,
"step": 380
},
{
"completion_length": 264.783860206604,
"epoch": 0.20373333333333332,
"grad_norm": 0.09633127157125651,
"kl": 0.2548828125,
"learning_rate": 2.9413628251493934e-08,
"loss": 0.0003,
"reward": 1.3932292014360428,
"reward_std": 0.26427287235856056,
"rewards/equation_reward_func": 0.4401041737291962,
"rewards/format_reward_func": 0.9531250111758709,
"step": 382
},
{
"completion_length": 249.04167366027832,
"epoch": 0.2048,
"grad_norm": 0.06042361226548213,
"kl": 0.25054931640625,
"learning_rate": 2.774113257764066e-08,
"loss": 0.0003,
"reward": 1.4322917088866234,
"reward_std": 0.19768574135378003,
"rewards/equation_reward_func": 0.4817708507180214,
"rewards/format_reward_func": 0.9505208469927311,
"step": 384
},
{
"completion_length": 289.47136306762695,
"epoch": 0.20586666666666667,
"grad_norm": 0.09636095745621918,
"kl": 0.24542236328125,
"learning_rate": 2.611479398511518e-08,
"loss": 0.0002,
"reward": 1.291666705161333,
"reward_std": 0.22107936535030603,
"rewards/equation_reward_func": 0.3463541760575026,
"rewards/format_reward_func": 0.9453125186264515,
"step": 386
},
{
"completion_length": 247.9687581062317,
"epoch": 0.20693333333333333,
"grad_norm": 0.10748505650467376,
"kl": 0.2657470703125,
"learning_rate": 2.4534950219914057e-08,
"loss": 0.0003,
"reward": 1.494791705161333,
"reward_std": 0.24816493690013885,
"rewards/equation_reward_func": 0.5312500125728548,
"rewards/format_reward_func": 0.9635416828095913,
"step": 388
},
{
"completion_length": 248.68490505218506,
"epoch": 0.208,
"grad_norm": 0.11757891850912854,
"kl": 0.2335205078125,
"learning_rate": 2.300192937233128e-08,
"loss": 0.0002,
"reward": 1.4505208656191826,
"reward_std": 0.22064228588715196,
"rewards/equation_reward_func": 0.4895833458285779,
"rewards/format_reward_func": 0.9609375186264515,
"step": 390
},
{
"completion_length": 271.49480056762695,
"epoch": 0.20906666666666668,
"grad_norm": 0.07918511324806074,
"kl": 0.23931884765625,
"learning_rate": 2.1516049808822935e-08,
"loss": 0.0002,
"reward": 1.3515625409781933,
"reward_std": 0.18755131447687745,
"rewards/equation_reward_func": 0.38020834140479565,
"rewards/format_reward_func": 0.9713541828095913,
"step": 392
},
{
"completion_length": 259.51563835144043,
"epoch": 0.21013333333333334,
"grad_norm": 0.1495234231858708,
"kl": 0.241455078125,
"learning_rate": 2.007762010589098e-08,
"loss": 0.0002,
"reward": 1.4947917088866234,
"reward_std": 0.33302151458337903,
"rewards/equation_reward_func": 0.5468750111758709,
"rewards/format_reward_func": 0.9479166828095913,
"step": 394
},
{
"completion_length": 280.27344608306885,
"epoch": 0.2112,
"grad_norm": 0.10448152858384566,
"kl": 0.25335693359375,
"learning_rate": 1.8686938986000627e-08,
"loss": 0.0003,
"reward": 1.3593750335276127,
"reward_std": 0.2329879915341735,
"rewards/equation_reward_func": 0.3984375107102096,
"rewards/format_reward_func": 0.9609375186264515,
"step": 396
},
{
"completion_length": 243.24479961395264,
"epoch": 0.21226666666666666,
"grad_norm": 0.10158686561243806,
"kl": 0.26129150390625,
"learning_rate": 1.734429525554365e-08,
"loss": 0.0003,
"reward": 1.5312500447034836,
"reward_std": 0.26366367703303695,
"rewards/equation_reward_func": 0.5677083469927311,
"rewards/format_reward_func": 0.9635416828095913,
"step": 398
},
{
"completion_length": 262.8463611602783,
"epoch": 0.21333333333333335,
"grad_norm": 0.0879686678616527,
"kl": 0.23956298828125,
"learning_rate": 1.604996774486145e-08,
"loss": 0.0002,
"reward": 1.4479167088866234,
"reward_std": 0.2433197470381856,
"rewards/equation_reward_func": 0.4973958428017795,
"rewards/format_reward_func": 0.9505208544433117,
"step": 400
},
{
"completion_length": 255.82292366027832,
"epoch": 0.2144,
"grad_norm": 0.09500435271087032,
"kl": 0.23638916015625,
"learning_rate": 1.4804225250339281e-08,
"loss": 0.0002,
"reward": 1.4192708730697632,
"reward_std": 0.23259615385904908,
"rewards/equation_reward_func": 0.4609375165309757,
"rewards/format_reward_func": 0.9583333469927311,
"step": 402
},
{
"completion_length": 217.63802671432495,
"epoch": 0.21546666666666667,
"grad_norm": 0.10034531857683562,
"kl": 0.2501220703125,
"learning_rate": 1.360732647858498e-08,
"loss": 0.0003,
"reward": 1.5156250335276127,
"reward_std": 0.17062418861314654,
"rewards/equation_reward_func": 0.5442708432674408,
"rewards/format_reward_func": 0.9713541865348816,
"step": 404
},
{
"completion_length": 234.06771516799927,
"epoch": 0.21653333333333333,
"grad_norm": 0.12227115923971459,
"kl": 0.248779296875,
"learning_rate": 1.2459519992702311e-08,
"loss": 0.0002,
"reward": 1.4921875298023224,
"reward_std": 0.23618489829823375,
"rewards/equation_reward_func": 0.5234375111758709,
"rewards/format_reward_func": 0.9687500149011612,
"step": 406
},
{
"completion_length": 243.70052909851074,
"epoch": 0.2176,
"grad_norm": 0.15926056972625335,
"kl": 0.27069091796875,
"learning_rate": 1.1361044160671629e-08,
"loss": 0.0003,
"reward": 1.4505208618938923,
"reward_std": 0.28682674188166857,
"rewards/equation_reward_func": 0.5052083535119891,
"rewards/format_reward_func": 0.9453125149011612,
"step": 408
},
{
"completion_length": 275.8385486602783,
"epoch": 0.21866666666666668,
"grad_norm": 0.12757068910568817,
"kl": 0.24969482421875,
"learning_rate": 1.0312127105846947e-08,
"loss": 0.0002,
"reward": 1.3645833730697632,
"reward_std": 0.21345845330506563,
"rewards/equation_reward_func": 0.40104167675599456,
"rewards/format_reward_func": 0.9635416828095913,
"step": 410
},
{
"completion_length": 287.200532913208,
"epoch": 0.21973333333333334,
"grad_norm": 0.11457759488995656,
"kl": 0.244384765625,
"learning_rate": 9.312986659581301e-09,
"loss": 0.0002,
"reward": 1.3229166977107525,
"reward_std": 0.21469376189634204,
"rewards/equation_reward_func": 0.3593750111758709,
"rewards/format_reward_func": 0.9635416865348816,
"step": 412
},
{
"completion_length": 265.48438358306885,
"epoch": 0.2208,
"grad_norm": 0.12852298602657852,
"kl": 0.2640380859375,
"learning_rate": 8.363830315988945e-09,
"loss": 0.0003,
"reward": 1.3463541828095913,
"reward_std": 0.23709475807845592,
"rewards/equation_reward_func": 0.39583334093913436,
"rewards/format_reward_func": 0.9505208507180214,
"step": 414
},
{
"completion_length": 271.54688358306885,
"epoch": 0.22186666666666666,
"grad_norm": 0.08478231012580131,
"kl": 0.28369140625,
"learning_rate": 7.46485518885462e-09,
"loss": 0.0003,
"reward": 1.3489583693444729,
"reward_std": 0.22044954542070627,
"rewards/equation_reward_func": 0.3932291807141155,
"rewards/format_reward_func": 0.9557291828095913,
"step": 416
},
{
"completion_length": 243.90625667572021,
"epoch": 0.22293333333333334,
"grad_norm": 0.10846557765273872,
"kl": 0.24072265625,
"learning_rate": 6.616247970698319e-09,
"loss": 0.0002,
"reward": 1.533854205161333,
"reward_std": 0.2181540415622294,
"rewards/equation_reward_func": 0.5598958469927311,
"rewards/format_reward_func": 0.9739583469927311,
"step": 418
},
{
"completion_length": 264.71094608306885,
"epoch": 0.224,
"grad_norm": 0.1142319675311567,
"kl": 0.261962890625,
"learning_rate": 5.8181848940044855e-09,
"loss": 0.0003,
"reward": 1.4114583805203438,
"reward_std": 0.22676061373203993,
"rewards/equation_reward_func": 0.4635416807141155,
"rewards/format_reward_func": 0.9479166828095913,
"step": 420
},
{
"completion_length": 229.906259059906,
"epoch": 0.22506666666666666,
"grad_norm": 0.09559713141008308,
"kl": 0.2418212890625,
"learning_rate": 5.070831694623135e-09,
"loss": 0.0002,
"reward": 1.531250037252903,
"reward_std": 0.19829656789079309,
"rewards/equation_reward_func": 0.5703125149011612,
"rewards/format_reward_func": 0.9609375186264515,
"step": 422
},
{
"completion_length": 248.05209159851074,
"epoch": 0.22613333333333333,
"grad_norm": 0.11040696978140259,
"kl": 0.24212646484375,
"learning_rate": 4.374343577351336e-09,
"loss": 0.0002,
"reward": 1.4192708656191826,
"reward_std": 0.27204828383401036,
"rewards/equation_reward_func": 0.4531250123400241,
"rewards/format_reward_func": 0.9661458544433117,
"step": 424
},
{
"completion_length": 257.89584159851074,
"epoch": 0.2272,
"grad_norm": 0.0992000332189083,
"kl": 0.24200439453125,
"learning_rate": 3.7288651837012745e-09,
"loss": 0.0002,
"reward": 1.4062500298023224,
"reward_std": 0.2646353510208428,
"rewards/equation_reward_func": 0.45312501094304025,
"rewards/format_reward_func": 0.9531250223517418,
"step": 426
},
{
"completion_length": 231.69792461395264,
"epoch": 0.22826666666666667,
"grad_norm": 0.1407050044165881,
"kl": 0.27252197265625,
"learning_rate": 3.134530561862081e-09,
"loss": 0.0003,
"reward": 1.4869792014360428,
"reward_std": 0.15594792971387506,
"rewards/equation_reward_func": 0.5104166809469461,
"rewards/format_reward_func": 0.9765625149011612,
"step": 428
},
{
"completion_length": 276.9817762374878,
"epoch": 0.22933333333333333,
"grad_norm": 0.1167791204621414,
"kl": 0.24884033203125,
"learning_rate": 2.5914631388619103e-09,
"loss": 0.0002,
"reward": 1.3463542088866234,
"reward_std": 0.21618649549782276,
"rewards/equation_reward_func": 0.3906250107102096,
"rewards/format_reward_func": 0.9557291902601719,
"step": 430
},
{
"completion_length": 249.96094417572021,
"epoch": 0.2304,
"grad_norm": 0.10179382560252617,
"kl": 0.24853515625,
"learning_rate": 2.0997756949353297e-09,
"loss": 0.0002,
"reward": 1.4817708656191826,
"reward_std": 0.20142082124948502,
"rewards/equation_reward_func": 0.513020845130086,
"rewards/format_reward_func": 0.9687500111758709,
"step": 432
},
{
"completion_length": 304.00261211395264,
"epoch": 0.23146666666666665,
"grad_norm": 0.12633771333357205,
"kl": 0.2705078125,
"learning_rate": 1.6595703401020844e-09,
"loss": 0.0003,
"reward": 1.2734375409781933,
"reward_std": 0.2817671154625714,
"rewards/equation_reward_func": 0.3307291779201478,
"rewards/format_reward_func": 0.9427083507180214,
"step": 434
},
{
"completion_length": 244.33594417572021,
"epoch": 0.23253333333333334,
"grad_norm": 0.12174371002417166,
"kl": 0.24542236328125,
"learning_rate": 1.2709384929615596e-09,
"loss": 0.0002,
"reward": 1.4817708805203438,
"reward_std": 0.24246670864522457,
"rewards/equation_reward_func": 0.5208333441987634,
"rewards/format_reward_func": 0.9609375186264515,
"step": 436
},
{
"completion_length": 274.6692781448364,
"epoch": 0.2336,
"grad_norm": 0.08153629624949502,
"kl": 0.2364501953125,
"learning_rate": 9.339608617077165e-10,
"loss": 0.0002,
"reward": 1.3958333730697632,
"reward_std": 0.18359084147959948,
"rewards/equation_reward_func": 0.44531250884756446,
"rewards/format_reward_func": 0.9505208544433117,
"step": 438
},
{
"completion_length": 271.5208406448364,
"epoch": 0.23466666666666666,
"grad_norm": 0.10062195336090982,
"kl": 0.25860595703125,
"learning_rate": 6.487074273681114e-10,
"loss": 0.0003,
"reward": 1.3567708730697632,
"reward_std": 0.2888470063917339,
"rewards/equation_reward_func": 0.4062500102445483,
"rewards/format_reward_func": 0.950520858168602,
"step": 440
},
{
"completion_length": 279.31511306762695,
"epoch": 0.23573333333333332,
"grad_norm": 0.11347953554392516,
"kl": 0.27435302734375,
"learning_rate": 4.152374292708538e-10,
"loss": 0.0003,
"reward": 1.328125037252903,
"reward_std": 0.2600484313443303,
"rewards/equation_reward_func": 0.3723958460614085,
"rewards/format_reward_func": 0.9557291902601719,
"step": 442
},
{
"completion_length": 229.1927137374878,
"epoch": 0.2368,
"grad_norm": 0.08761777334438094,
"kl": 0.23480224609375,
"learning_rate": 2.3359935274214204e-10,
"loss": 0.0002,
"reward": 1.5416667088866234,
"reward_std": 0.19063151394948363,
"rewards/equation_reward_func": 0.5651041842065752,
"rewards/format_reward_func": 0.9765625149011612,
"step": 444
},
{
"completion_length": 269.6927146911621,
"epoch": 0.23786666666666667,
"grad_norm": 0.08325007668726372,
"kl": 0.24908447265625,
"learning_rate": 1.0383091903720665e-10,
"loss": 0.0002,
"reward": 1.3802083730697632,
"reward_std": 0.19487999146804214,
"rewards/equation_reward_func": 0.41927084675990045,
"rewards/format_reward_func": 0.9609375223517418,
"step": 446
},
{
"completion_length": 252.57813453674316,
"epoch": 0.23893333333333333,
"grad_norm": 0.07984790038875238,
"kl": 0.24322509765625,
"learning_rate": 2.595907750671533e-11,
"loss": 0.0002,
"reward": 1.4505208730697632,
"reward_std": 0.1807808456942439,
"rewards/equation_reward_func": 0.47656251257285476,
"rewards/format_reward_func": 0.9739583469927311,
"step": 448
},
{
"completion_length": 277.79948806762695,
"epoch": 0.24,
"grad_norm": 0.11593052361546653,
"kl": 0.26580810546875,
"learning_rate": 0.0,
"loss": 0.0003,
"reward": 1.3541667088866234,
"reward_std": 0.2557070981711149,
"rewards/equation_reward_func": 0.40364584303461015,
"rewards/format_reward_func": 0.9505208469927311,
"step": 450
},
{
"epoch": 0.24,
"step": 450,
"total_flos": 0.0,
"train_loss": 0.00020930594997387746,
"train_runtime": 20107.4364,
"train_samples_per_second": 0.537,
"train_steps_per_second": 0.022
}
],
"logging_steps": 2,
"max_steps": 450,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}